def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]
    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train, Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red')
    # Alternative regressors tried during development:
    #clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000, max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000, max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000, max_depth=26, n_jobs=7)

    # Walk-forward prediction: refit on a sliding window of TRAIN_SIZE points,
    # then predict the single next point.
    predict_list = []
    for i in xrange(TEST_SIZE):
        window_X = [[x] for x in xrange(i, TRAIN_SIZE + i)]
        clf.fit(window_X, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict([[TRAIN_SIZE + 1 + i]])  # predict() expects a 2-D array
        predict_list.append(y_pred)

    print "mean_squared_error:%s" % mean_squared_error(Y_test, predict_list)
    print "sqrt of mean_squared_error:%s" % np.sqrt(mean_squared_error(Y_test, predict_list))
    origin_data = Y_test
    print "origin data:%s" % origin_data
    plt.plot([x for x in xrange(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)],
             predict_list, linestyle='-', color='red', label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
def compute_mse(regressor, horizon):
    # Get wind park and corresponding target.
    windpark = NREL().get_windpark(NREL.park_id['tehachapi'], 3, 2004, 2005)
    target = windpark.get_target()

    # Use power mapping for pattern-label mapping.
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    y = mapping.get_labels_turbine(target, feature_window, horizon)

    # Train roughly on the year 2004, test on 2005.
    train_to = int(math.floor(len(X) * 0.5))
    test_to = len(X)
    train_step, test_step = 25, 25
    X_train = X[:train_to:train_step]
    y_train = y[:train_to:train_step]
    X_test = X[train_to:test_to:test_step]
    y_test = y[train_to:test_to:test_step]

    if regressor == 'svr':
        reg = SVR(kernel='rbf', epsilon=0.1, C=100.0,
                  gamma=0.0001).fit(X_train, y_train)
        mse = mean_squared_error(reg.predict(X_test), y_test)
    elif regressor == 'knn':
        reg = KNeighborsRegressor(10, weights='uniform').fit(X_train, y_train)
        mse = mean_squared_error(reg.predict(X_test), y_test)
    else:
        raise ValueError("regressor must be 'svr' or 'knn'")
    return mse
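# A minimal usage sketch for compute_mse (assumes the windml package providing
# NREL and PowerMapping is installed and its data is available; the horizon
# values are illustrative):
if __name__ == '__main__':
    for horizon in [1, 3, 6]:
        print("horizon=%d svr_mse=%f knn_mse=%f" % (
            horizon, compute_mse('svr', horizon), compute_mse('knn', horizon)))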
def evaluate_learner(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different algorithms to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, expected values, actual values)
    for each learner.
    '''
    # Use a support vector machine for regression
    from sklearn.svm import SVR

    # Train using a radial basis function
    svr = SVR(kernel='rbf', gamma=0.1)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'RBF Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a linear kernel
    svr = SVR(kernel='linear')
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Linear Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a polynomial kernel
    svr = SVR(kernel='poly', degree=2)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Polynomial Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred
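# A minimal, self-contained driver for the generator above (synthetic 1-D data;
# split sizes are illustrative assumptions, not from the original project):
import numpy as np
from sklearn.model_selection import train_test_split

X_demo = np.sort(np.random.uniform(0, 5, (200, 1)), axis=0)
y_demo = np.sin(X_demo).ravel() + np.random.normal(0, 0.1, 200)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=0)
for title, expected, actual in evaluate_learner(X_tr, X_te, y_tr, y_te):
    print(title)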
class SVR(PlayerModel):
    """A wrapper around scikit-learn's support vector regression for this
    project (SupportVectorRegression is assumed to be an alias for
    sklearn.svm.SVR, avoiding a clash with this class's own name)."""

    def __init__(self):
        PlayerModel.__init__(self)
        # configure support vector regression and start training
        self.regr = SupportVectorRegression(kernel='linear', C=1000)
        self.regr.fit(self.dataset_X_train, self.dataset_Y_train)
        print "Finish building player model."
        print "Parameters: ", self.regr.get_params()
        print "============================================================"

    def testScore(self, test_X):
        score = self.regr.predict(self.normalizeTest(test_X))
        return np.mean(score)

    def getParams(self):
        return self.regr.get_params()

    def visualize(self):
        # Sweep the first feature over [0, 1] while holding the remaining
        # features at their training means.
        x = np.zeros((10, self.col - 1))
        mean = self.dataset_X_train.mean(0)
        for i in range(10):
            x[i, :] = mean
        x[:, 0:1] = np.array([np.arange(0.0, 1.1, 0.11)]).T
        y = self.regr.predict(x)
        pyplot.scatter(self.dataset_X_train[:, 0:1], self.dataset_Y_train,
                       c='k', label='data')
        # pyplot.hold('on') is unnecessary (and removed in recent matplotlib);
        # repeated plot calls draw on the same axes by default.
        pyplot.plot(x[:, 0:1], y, c='r', label='Support Vector Regression')
        pyplot.xlabel('data collected from player')
        pyplot.ylabel('score')
        pyplot.title('Support Vector Regression')
        pyplot.legend()
        pyplot.show()
def svr_predict(sampled_data_FC1, sampled_data_FC2, len_real=1800, len_train=1100):
    sampled_data_FC1_minus_FC2 = np.array(sampled_data_FC2)
    # sampled_data_FC1_minus_FC2 = FC1_minus_FC2.FC1_minus_FC2(sampled_data_FC1, sampled_data_FC2)

    ## solve method 2: normalize
    sampled_data_FC = []
    for sdF in sampled_data_FC1_minus_FC2:
        sampled_data_FC.append(sdF[1])

    ## time normalization: rescale timestamps to [0, 1]
    temp_time_end = sampled_data_FC1_minus_FC2[-1, 0]
    sampled_data_FC1_minus_FC2[:, 0] = sampled_data_FC1_minus_FC2[:, 0] / temp_time_end
    sampled_data = np.column_stack((sampled_data_FC1_minus_FC2[:, 0], sampled_data_FC))

    X = sampled_data[0:len_real, 0]  # X is all the real time data
    Y = sampled_data[0:len_real, 1]  # Y is all the real value data

    # Build training pairs like [[x1, x2, x3, x4, t5], x5]: the previous
    # regressors_num values plus the next timestamp predict the next value.
    pat_list = []
    for i in range(len_train - regressors_num + 1):
        pat_list.append(list(sampled_data[i:i + regressors_num, 1])
                        + [sampled_data[i + regressors_num, 0],
                           sampled_data[i + regressors_num, 1]])
    pat_list = np.array(pat_list)
    X1 = pat_list[:, 0:regressors_num + 1]
    X = X.reshape([len_real, 1])
    Y1 = pat_list[:, regressors_num + 1]  # Y1 is the real training value data

    ###########################################################################
    # Fit regression model
    svr_rbf = SVR(kernel='rbf', epsilon=0.0083, C=1000, gamma=0.05)
    svr_rbf.fit(X1, Y1)

    ###########################################################################
    # Prognostic phase: once real history runs out, feed earlier predictions
    # back in as regressor inputs.
    y_rbf_prog = []
    for i in range(len_real - len_train):
        if i == 0:
            X_temp = list(X1[-1][:-1]) + [sampled_data[len_train, 0]]
        elif i < regressors_num:
            X_temp = (list(X1[-1][-(regressors_num - i) - 1:-1]) + y_rbf_prog
                      + [sampled_data[i + len_train, 0]])
        else:
            X_temp = y_rbf_prog[-regressors_num:] + [sampled_data[i + len_train, 0]]
        X_temp = np.array(X_temp).reshape(1, -1)  # predict() expects a 2-D array
        y_rbf_prog.append(float(svr_rbf.predict(X_temp)))

    # FC2_prog_pred = sampled_data_FC1[len_train:len_real, 1] - y_rbf_prog
    # return FC2_prog_pred
    return y_rbf_prog
def SVM(Xtrain, ytrain, Xtest=None, C=1):
    model = SVR(C=C)  # SVR imported from scikit-learn
    model.fit(Xtrain, ytrain)
    pred = model.predict(Xtrain)
    if Xtest is None:
        return pred
    else:
        pred_test = model.predict(Xtest)
        return pred, pred_test
def supportVectorRegression(X, Y_casual, Y_registered, testSet_final):
    svr1 = SVR(kernel='rbf', gamma=0.1)
    svr2 = SVR(kernel='rbf', gamma=0.1)
    svr1.fit(X, Y_casual)
    svr2.fit(X, Y_registered)
    # The targets were trained in log1p space, so invert with exp(x) - 1.
    svr1_Y = np.exp(svr1.predict(testSet_final)) - 1
    svr2_Y = np.exp(svr2.predict(testSet_final)) - 1
    final_prediction = np.intp(np.around(svr1_Y + svr2_Y))
    return final_prediction
class W2VPool:
    def __init__(self, poolingDim=20):
        self.clf = SVR(C=0.5)
        self.model = Word2Vec.load("vectors.bin")
        self.poolingDim = poolingDim

    def getFeatures(self, data):
        # Avoid shadowing `data` inside the comprehensions.
        sentenceAs = [row[0] for row in data]
        sentenceBs = [row[1] for row in data]
        scores = [float(row[2]) for row in data]
        features = []
        for i in range(len(sentenceAs)):
            mat = self.simMatrix(self.model, sentenceAs[i], sentenceBs[i])
            mat = self.dynamicPooling(mat, self.poolingDim)
            features.append(np.ndarray.flatten(mat))
        return features, scores

    def simMatrix(self, model, sentence1, sentence2):
        # Pairwise cosine distance between every token pair of the two
        # sentences; out-of-vocabulary tokens fall back to a zero vector.
        tokens1 = word_tokenize(sentence1)
        tokens2 = word_tokenize(sentence2)
        mat = np.zeros((len(tokens1), len(tokens2)))
        for index1, token1 in enumerate(tokens1):
            for index2, token2 in enumerate(tokens2):
                vec1 = model[token1] if token1 in model else np.zeros(len(model['the']))
                vec2 = model[token2] if token2 in model else np.zeros(len(model['the']))
                mat[index1][index2] = cosine(vec1, vec2)
        return mat

    def dynamicPooling(self, matrix, finalDim):
        # Compress a variable-size distance matrix into finalDim x finalDim by
        # taking the minimum (closest pair) over each pooling region.
        finalMatrix = np.zeros((finalDim, finalDim))
        for i in range(finalDim):
            for j in range(finalDim):
                compressionArea = []
                for a in range(int(float(i) / finalDim * matrix.shape[0]),
                               int(float(i + 1) / finalDim * matrix.shape[0])):
                    for b in range(int(float(j) / finalDim * matrix.shape[1]),
                                   int(float(j + 1) / finalDim * matrix.shape[1])):
                        compressionArea.append(matrix[a][b])
                if len(compressionArea) == 0:
                    finalMatrix[i][j] = matrix[int(float(i) / finalDim * matrix.shape[0])][
                        int(float(j) / finalDim * matrix.shape[1])]
                else:
                    finalMatrix[i][j] = min(compressionArea)
        return np.nan_to_num(finalMatrix)

    def train(self, trainData):
        features, scores = self.getFeatures(trainData)
        self.clf.fit(features, scores)
        results = self.clf.predict(features)
        print("Training Error")
        print(sklearn.metrics.mean_squared_error(results, np.array(scores)))

    def test(self, test):
        features, scores = self.getFeatures(test)
        results = self.clf.predict(features)
        print("Testing Error")
        print(sklearn.metrics.mean_squared_error(results, np.array(scores)))
def svr_model(x_train, y_train, x_test, x_valid, cache_name, use_cache=False):
    if use_cache:
        # Pickle files must be opened in binary mode.
        with open(cache_name, 'rb') as fhand:
            data_dict = pickle.load(fhand)
        return data_dict['test_pred'], data_dict['valid_pred']
    np.random.seed(seed=123)
    model = SVR()
    # Fit in log space, then exponentiate the predictions back.
    model.fit(x_train, np.log(y_train))
    test_pred = np.exp(model.predict(x_test))
    valid_pred = np.exp(model.predict(x_valid))
    data_dict = {'test_pred': test_pred, 'valid_pred': valid_pred}
    with open(cache_name, 'wb') as fhand:
        pickle.dump(data_dict, fhand)
    return test_pred, valid_pred
def method_laprls(vecX, vecy, train, test, states=2,
                  params=[1.0, 0.1, 0.1], true_latent=None, plot=False):
    # Build a chain adjacency matrix connecting each sample to its k-th
    # temporal neighbour.
    ks = 2
    A = np.zeros((vecX.shape[0], vecX.shape[0]))
    for k in range(1, ks):
        for i in range(vecX.shape[0] - k):
            A[i, i + k] = 1
            A[i + k, i] = 1
    print A.shape
    # A = A[train, :]
    # A = A[:, train]
    D = np.diag(np.sum(A, axis=1))
    L = D - A  # graph Laplacian

    # K_transd = get_kernel(vecX[train, :], vecX[train, :], type='rbf', sigma=params[2])
    K_transd = get_kernel(vecX, vecX, type='rbf', sigma=params[2])

    # deformation radius
    r = 0.01
    I = np.eye(K_transd.shape[0])
    M = L
    Ktilde = np.linalg.inv(I + r * K_transd.dot(M)).dot(K_transd)
    lap_param = r * M.dot(Ktilde)
    lap_param = lap_param[train, :]
    lap_param = lap_param[:, train]

    clf = SVR(C=params[0], epsilon=params[1], shrinking=False,
              kernel=partial(get_kernel, type='lap', sigma=params[2], lap_param=lap_param))
    clf.fit(vecX[train, :], vecy[train])
    # clf.fit(vecX, vecy)
    return 'Laplacian reg. SVR (RBF)', clf.predict(vecX[test, :]), np.ones(len(test))
def compute_rmse(features, labels, train_index, test_index):
    x_train, x_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    r, c = x_train.shape
    if r < 15:
        return None
    if NORMALIZATION_FLAG:
        feature_scaler = StandardScaler().fit(x_train)
        x_train = feature_scaler.transform(x_train)
        x_test = feature_scaler.transform(x_test)
        # StandardScaler expects 2-D input, so reshape the label vector.
        label_scaler = StandardScaler().fit(y_train.reshape(-1, 1))
        y_train = label_scaler.transform(y_train.reshape(-1, 1)).ravel()
    clf = SVR(C=100, gamma=0.001, kernel='rbf').fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    if NORMALIZATION_FLAG:
        # Undo the label standardization.
        y_pred = y_pred * label_scaler.scale_ + label_scaler.mean_
    if LOG_FLAG:
        actual_pred = numpy.array([10 ** y for y in y_pred])
        actual_price = numpy.array([10 ** y for y in y_test])
    else:
        actual_pred = y_pred
        actual_price = y_test
    actual_rmse_pc = numpy.sqrt(numpy.mean(((actual_pred - actual_price) / actual_price) ** 2))
    actual_rmse = numpy.sqrt(numpy.mean((actual_pred - actual_price) ** 2))
    return actual_rmse, actual_rmse_pc
def tecnicaSVR():
    parametros = [{'kernel': 'linear', 'C': 0.1, 'epsilon': 0.2},
                  {'kernel': 'linear', 'C': 1.0, 'epsilon': 0.2},
                  {'kernel': 'rbf', 'degree': 3, 'gamma': .0001, 'C': 1.0, 'epsilon': 0.2},
                  {'kernel': 'rbf', 'degree': 2, 'gamma': .01, 'C': 0.1, 'epsilon': 0.2}]
    for c in parametros:
        clf = SVR(**c)
        # cross-validation
        mae = mse = r2 = 0
        kf = KFold(len(boston_Y), n_folds=10)
        for train, test in kf:
            trainX, testX, trainY, testY = boston_X[train], boston_X[test], boston_Y[train], boston_Y[test]
            clf.fit(trainX, trainY)
            prediccion = clf.predict(testX)
            mae += metrics.mean_absolute_error(testY, prediccion)
            mse += metrics.mean_squared_error(testY, prediccion)
            r2 += metrics.r2_score(testY, prediccion)
        if c['kernel'] == 'linear':
            print clf.coef_  # coef_ only exists for the linear kernel
        print "Parameters: ", c
        print 'Mean abs. error: ', mae / len(kf), 'Mean squared error: ', mse / len(kf), 'R squared: ', r2 / len(kf)
def test4():
    '''We assume that for each year 7.1~9.30 belongs to the summer model
    (model-1) and 12.1~2.28 to the winter model (model-3); the remaining
    ranges, 3.1~6.30 and 10.1~11.30, belong to the intermediate-season
    models (model-2 and model-4). Four SVRs are trained on the matching
    slices of the data, then evaluated per segment and combined.'''
    model_1_train_x = x[:15] + x[285:375] + x[645:745] + x[1015:1105] + x[1375:1465]
    model_1_train_y = y[:15] + y[285:375] + y[645:745] + y[1015:1105] + y[1375:1465]
    model_2_train_x = x[15:75] + x[375:435] + x[745:805] + x[1105:1165]
    model_2_train_y = y[15:75] + y[375:435] + y[745:805] + y[1105:1165]
    model_3_train_x = x[75:165] + x[435:525] + x[805:895] + x[1165:1255]
    model_3_train_y = y[75:165] + y[435:525] + y[805:895] + y[1165:1255]
    model_4_train_x = x[165:285] + x[525:645] + x[895:1015] + x[1255:1375]
    model_4_train_y = y[165:285] + y[525:645] + y[895:1015] + y[1255:1375]

    model_1, model_2, model_3, model_4 = SVR(), SVR(), SVR(), SVR()
    model_1.fit(model_1_train_x, model_1_train_y)
    model_2.fit(model_2_train_x, model_2_train_y)
    model_3.fit(model_3_train_x, model_3_train_y)
    model_4.fit(model_4_train_x, model_4_train_y)

    model_1_test_x = x[1735:1825]
    model_1_test_y = y[1735:1825]
    model_2_test_x = x[1465:1525] + x[1825:1885]
    model_2_test_y = y[1465:1525] + y[1825:1885]
    model_3_test_x = x[1525:1615] + x[1885:1975]
    model_3_test_y = y[1525:1615] + y[1885:1975]
    model_4_test_x = x[1615:1735] + x[1975:]
    model_4_test_y = y[1615:1735] + y[1975:]

    model_1_pred = model_1.predict(model_1_test_x)
    model_2_pred = model_2.predict(model_2_test_x)
    model_3_pred = model_3.predict(model_3_test_x)
    model_4_pred = model_4.predict(model_4_test_x)
    calc_err(model_1_pred, model_1_test_y)
    calc_err(model_2_pred, model_2_test_y)
    calc_err(model_3_pred, model_3_test_y)
    calc_err(model_4_pred, model_4_test_y)
    calc_err(list(model_1_pred) + list(model_2_pred) + list(model_3_pred) + list(model_4_pred),
             model_1_test_y + model_2_test_y + model_3_test_y + model_4_test_y)
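# A minimal sketch of the date-to-model mapping described in the docstring
# (month granularity only; the day-offset slices above are specific to the
# dataset and are not reproduced here):
def season_model(month):
    if month in (7, 8, 9):       # 7.1~9.30  -> summer model (model-1)
        return 1
    if month in (12, 1, 2):      # 12.1~2.28 -> winter model (model-3)
        return 3
    if month in (3, 4, 5, 6):    # 3.1~6.30  -> model-2
        return 2
    return 4                     # 10.1~11.30 -> model-4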
class HotTweets:
    '''Train and get tweet hotness'''

    def __init__(self, kernel='rbf', C=1e3, gamma=0.1, epsilon=0.1, n_comp=100):
        '''Prepare support vector regression'''
        self.svr = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon, verbose=True)
        #self.svr = LogisticRegression(random_state=42, verbose=0)
        self.n_comp = n_comp

    def fit_scaler(self, dev, i_dev):
        '''Train normalizers for features and importances'''
        # importance scaler
        self.std_scaler_i = sklearn.preprocessing.StandardScaler()
        self.std_scaler_i.fit(i_dev)
        # feature scaler (first n_comp components only)
        self.norm = sklearn.preprocessing.StandardScaler()
        self.norm.fit(dev[:, 0:self.n_comp])

    def train(self, features, importances):
        '''Train regression'''
        importances = self.std_scaler_i.transform(importances)
        features = self.norm.transform(features[:, 0:self.n_comp])
        self.svr.fit(features, importances)

    def predict(self, features):
        '''Predict importances'''
        features = self.norm.transform(features[:, 0:self.n_comp])
        results = self.svr.predict(features)
        results = self.std_scaler_i.inverse_transform(results)
        return results
def svm_regressor(features, target, test_size_percent=0.2, cv_split=5):
    # Use separate scalers so the feature scaling parameters are not
    # overwritten when the target is scaled.
    scale_x = preprocessing.MinMaxScaler()
    scale_y = preprocessing.MinMaxScaler()
    X_array = scale_x.fit_transform(features)
    y_array = scale_y.fit_transform(target)
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    svr = SVR(kernel='rbf', C=10, gamma=1)
    svr.fit(X_train, y_train.ravel())
    test_prediction = svr.predict(X_test)

    tscv = TimeSeriesSplit(cv_split)
    # NOTE: passing tscv.n_splits (an int) gives plain k-fold CV, not the
    # time-series splits; cross_val_predict requires a full partition, which
    # TimeSeriesSplit does not produce.
    training_score = cross_val_score(svr, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(svr, X_test, y_test, cv=tscv.n_splits)
    print "Cross-val Training score:", training_score.mean()
    # print "Cross-val Testing score:", testing_score.mean()
    training_predictions = cross_val_predict(svr, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(svr, X_test, y_test, cv=tscv.n_splits)
    training_accuracy = metrics.r2_score(y_train, training_predictions)
    # test_accuracy_model = metrics.r2_score(y_test, test_prediction)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    # print "Cross-val predicted accuracy:", training_accuracy
    print "Test-predictions accuracy:", test_accuracy
    return svr
def CaSVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
    # param_grid = {'C': [10000],
    #               'epsilon': [0.001, 0.01, 0.05, 0.1, 0.15, 1]}
    # svr = SVR(cache_size=1000, verbose=2)  # note: SVR has no random_state parameter
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error",
    #                       n_jobs=1, iid=True, cv=cv_iterator)
    # search.fit(X_train, Y_train["Ca"])
    # search.grid_scores_
    # model = search.best_estimator_

    # scaler = StandardScaler()
    model = SVR(C=10000, epsilon=0.01, cache_size=1000)
    model.fit(X_train, Y_train["Ca"])
    # test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'),
    #                        scoring="mean_squared_error", cv=cv_iterator)
    yhat_svr = model.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_svr))
    return model, test_error
def fit(self, start_date, end_date):
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)

    params_svr = [{
        'kernel': ['rbf', 'sigmoid', 'linear'],
        'C': [0.01, 0.1, 1, 10, 100],
        'epsilon': [0.0000001, 0.000001, 0.00001]
    }]
    params = ParameterGrid(params_svr)

    # Find the split for training and CV
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        # X_train = self.pca.fit_transform(X_train.values)
        X_train = X_train.values
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        # X_cv = self.pca.transform(X_cv.values)
        X_cv = X_cv.values

        lowest_mse = np.inf
        for i, param in enumerate(params):
            svr = SVR(**param)
            # ada = AdaBoostRegressor(svr)
            svr.fit(X_train, y_train.values)
            mse = mean_squared_error(y_cv, svr.predict(X_cv))
            if mse <= lowest_mse:
                lowest_mse = mse  # remember the best score, not just the last fit
                self.models[ticker] = svr
    return self
def machinelearning(csv_file):
    # parse CSV
    d = {}
    d['date'] = []
    d['radiation'] = []
    d['humidity'] = []
    d['temperature'] = []
    d['wind'] = []
    d['demand'] = []
    dictreader = csv.DictReader(
        csv_file,
        fieldnames=['date', 'radiation', 'humidity', 'temperature', 'wind', 'demand'],
        delimiter=',')
    next(dictreader)  # skip the header row
    for row in dictreader:
        for key in row:
            d[key].append(row[key])

    # interpolate weather data
    interpolate(d['radiation'])
    interpolate(d['humidity'])
    interpolate(d['temperature'])
    interpolate(d['wind'])

    # train machine learning algorithm (list() around zip() is needed on Python 3)
    training_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[:32])
    training_y = np.array(d['demand'][:32])
    poly_svr = SVR(kernel='poly', degree=2)
    poly_svr.fit(training_x, training_y)

    prediction_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[32:])
    demand_predictions = poly_svr.predict(prediction_x)
    return demand_predictions
def Sand_SVR(X_train, Y_train, X_test, Y_test, cv_iterator):
    #===========================================================================
    # param_grid = {'C': [100, 500, 1000, 5000, 10000, 100000],
    #               'epsilon': [0.075, 0.1, 0.125]}
    # svr = SVR(cache_size=1000)  # note: SVR has no random_state parameter
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", cv=cv_iterator)
    # search.fit(X_train, Y_train["Sand"])
    # search.grid_scores_
    # svr = search.best_estimator_
    # test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'),
    #                        scoring="mean_squared_error", cv=cv_iterator)
    #===========================================================================
    svr = SVR(C=10000)
    svr.fit(X_train, Y_train["Sand"])
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Sand"], yhat_svr))
    return svr, test_error
def train_model(train, test, labels):
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print "Good!"

    predictions = clf.predict(test)
    print predictions.shape
    predictions = pd.DataFrame(predictions, columns=['relevance'])
    print "Good again!"
    print "Predictions head -------"
    print predictions.head()
    print predictions.shape
    print "TEST head -------"
    print test.head()
    print test.shape

    test['id'].to_csv("TEST_TEST.csv", index=False)
    predictions.to_csv("PREDICTIONS.csv", index=False)
    # Align on position, not on index labels, before concatenating.
    predictions = pd.concat([test['id'].reset_index(drop=True), predictions],
                            axis=1, verify_integrity=False)
    print predictions
    return predictions
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')
    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    X = np.asarray(X)
    y = np.asarray(y)
    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0
    svr.fit(X, y)
    pred = svr.predict(test_X)
    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print 'MAE: ', mean_absolute_error(test_y, pred)
        print 'RMSE: ', sqrt(mean_squared_error(test_y, pred))
        print 'pearson corr: ', sp.stats.pearsonr(test_y, pred)
        print 'r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2
        print mquantiles(test_y, prob=[0.10, 0.90])
        print mquantiles(pred, prob=[0.10, 0.90])
    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print >>output, p
    return
def svr_rbf(X_train, Y_train, X_validate):
    """Support vector regression, using an RBF kernel."""
    SVR_RBF = SVR(kernel='rbf')
    SVR_RBF.fit(X_train, Y_train)
    Y_pred = SVR_RBF.predict(X_validate)
    write_to_file("SVR_RBF_Y_pred.csv", Y_pred)
    return Y_pred
def learn(X, y):
    # do PCA
    pca = PCA(n_components=6)
    pca_6 = pca.fit(X)
    print('variance ratio')
    print(pca_6.explained_variance_ratio_)
    X = pca.transform(X)  # already fitted above; fit_transform would refit
    # X = np.concatenate((X_pca[:, 0].reshape(X.shape[0], 1), X_pca[:, 5].reshape(X.shape[0], 1)), axis=1)

    # do SVR
    svr_rbf = SVR(kernel='rbf', C=1)
    svr_rbf.fit(X, y)
    y_rbf = svr_rbf.predict(X)
    print(y_rbf)
    print(y)

    # see difference
    y_rbf = np.transpose(y_rbf)
    deviation(y, y_rbf)

    # pickle the models
    with open('rbfmodel.pkl', 'wb') as f:
        pickle.dump(svr_rbf, f)
    with open('pcamodel.pkl', 'wb') as f:
        pickle.dump(pca_6, f)
def SVM(train, test, tunings=None, smoteit=True, bin=True, regress=False):
    "SVM"
    if not isinstance(train, pd.core.frame.DataFrame):
        train = csv2DF(train, as_mtx=False, toBin=bin)
    if not isinstance(test, pd.core.frame.DataFrame):
        test = csv2DF(test, as_mtx=False, toBin=True)
    if smoteit:
        train = SMOTE(train, resample=True)
    # NOTE: `tunings` is currently ignored; both branches of the original
    # code built the same default model.
    if regress:
        clf = SVR()
    else:
        clf = SVC()
    features = train.columns[:-1]
    klass = train[train.columns[-1]]
    clf.fit(train[features], klass)
    actual = test[test.columns[-1]].values  # .as_matrix() is deprecated
    try:
        preds = clf.predict(test[test.columns[:-1]])
    except:
        set_trace()
    return actual, preds
class SVMRegressor(Regressor):
    def findImportantFeatures(self, numFeatures=1000):
        # Select the top features ranked by unique weight.
        self.features = []
        count = 0
        for key in sorted(self.trainSet.getVocabulary(),
                          key=lambda word: self.trainSet.getUniqueWeightOf(word),
                          reverse=True):
            count += 1
            self.features.append(key)
            if count == numFeatures:
                break

    def train(self, numFeatures=1000):
        self.findImportantFeatures(numFeatures)
        self.vectorizer = CountVectorizer(vocabulary=self.features, min_df=1)
        self.regressor = SVR(kernel='linear', C=25, epsilon=10)
        strings = []
        Y = []
        for docKey in self.trainSet.getDocuments():
            document = self.trainSet.getDocument(docKey)
            strings.append(" ".join(document.getBagOfWords2("all")))
            Y.append(document.getSalary())
        X = self.vectorizer.fit_transform(strings)
        self.regressor.fit(X, Y)
        Coef = self.regressor.coef_
        coef_list = Coef.toarray()
        #for i in range(len(coef_list[0])):
        #    if math.fabs(coef_list[0][i] - 0.0) > 0.1:
        #        print self.features[i], coef_list[0][i]

    def predict(self, document):
        strings = [" ".join(document.getBagOfWords2("all"))]
        # Use transform (not fit_transform) so the training vocabulary is reused.
        Z = self.vectorizer.transform(strings)
        return self.regressor.predict(Z)[0]
def P_SVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
    #===========================================================================
    # scaler = StandardScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)
    #
    # param_grid = {'C': [0.0001, 0.001, 0.01, 0.1],
    #               'epsilon': [0.1, 0.01]}
    # svr = SVR(verbose=2)  # note: SVR has no random_state parameter
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error",
    #                       n_jobs=1, cv=cv_iterator, iid=False)
    # search.fit(X_train, Y_train["P"])
    # svr = search.best_estimator_
    #===========================================================================
    svr = SVR(C=10000, epsilon=0.1)
    svr.fit(X_train, Y_train["P"])
    # test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'),
    #                        scoring="mean_squared_error", cv=cv_iterator)
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["P"], yhat_svr))
    return svr, test_error
class SVMLearner(object):
    def __init__(self, kernel="linear", C=1e3, gamma=0.1, degree=2, verbose=False):
        self.name = "{} Support Vector Machine Learner".format(kernel.capitalize())
        self.kernel = kernel
        if kernel == "linear":
            self.svr = SVR(kernel=kernel, C=C)
        elif kernel == "rbf":
            self.svr = SVR(kernel=kernel, C=C, gamma=gamma)
        elif kernel == "poly":
            self.svr = SVR(kernel=kernel, C=C, degree=degree)

    def addEvidence(self, dataX, dataY):
        """
        @summary: Add training data to learner
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        # build and save the model
        self.svr.fit(dataX, dataY)

    def query(self, points):
        """
        @summary: Estimate a set of test points given the model we built.
        @param points: should be a numpy array with each row corresponding to a specific query.
        @returns the estimated values according to the saved model.
        """
        return self.svr.predict(points)
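# A minimal usage sketch for the learner API above (synthetic data; every
# value here is illustrative, not from the original project):
import numpy as np

dataX = np.random.uniform(-1, 1, (100, 2))
dataY = 2 * dataX[:, 0] + dataX[:, 1]
learner = SVMLearner(kernel="rbf", C=1e3, gamma=0.1)
learner.addEvidence(dataX, dataY)   # train
estimates = learner.query(dataX[:5])  # then query arbitrary points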
def main(args):
    (training_file, label_file, test_file, u_file,
     e, c, output_file, components) = args
    X_training = load_feat(training_file)
    n = len(X_training)
    U = load_feat(u_file)
    y_training = [float(line.strip()) for line in open(label_file)]
    U = np.asarray(U)
    X_training = np.asarray(X_training)
    #X = preprocessing.normalize(X, norm='l2')
    y_training = np.asarray(y_training)
    X_test = load_feat(test_file)
    # NOTE: the test labels are unused below and `test_label` is not in args,
    # so the original line loading y_test is disabled:
    # y_test = np.asarray([float(line.strip()) for line in open(test_label)])
    X_test = np.asarray(X_test)
    X_test[np.isnan(X_test)] = 0.0
    #test_X = preprocessing.normalize(test_X, norm='l2')

    # Project the features and the auxiliary view into a shared CCA space,
    # then train SVR on the projected features.
    s = min(len(X_training), len(U))
    cca = CCA(n_components=components, max_iter=50)
    (X_cca, U_cca) = cca.fit_transform(X_training[:s], U[:s])
    X_test_cca = cca.transform(X_test)
    svr = SVR(C=c, epsilon=e, kernel='rbf')
    svr.fit(X_cca, y_training[:s])
    pred = svr.predict(X_test_cca)
    with open(output_file, 'w') as output:
        for p in pred:
            print >>output, p
    return
def predict_device_byday_SVR():
    X, Y_unique, Y_all, X_raw = load_device_counter_byday()
    from sklearn.svm import SVR
    model = SVR()
    # model = SVR(kernel='linear')
    training_size = 160
    # model.fit(X[:training_size], Y_unique[:training_size])
    model.fit(X[:training_size], Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])
    print X_to_predict
    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print X_raw[start_index:end_index]

    y_predicted = model.predict(X_to_predict)
    y_predicted = np.array(y_predicted).astype(int)
    print y_predicted
    print Y_real
    # print y_predicted - np.array(Y_real)

    # plt.subplot(111)
    # plt.scatter(X_to_predict, Y_real, c='r')
    plt.scatter(X_to_predict, y_predicted)
    # plt.plot(X_to_predict, y_predicted)
    plt.show()
def build_model(titles, X1, X3, X4, titles_test, X1_test, X3_test, X4_test,
                y, weights=None, params=[400, 10, 0, 0], top_words=10):
    '''
    X1: query length, title length, description-present flag, number of words
        from the query that also occur in the title, compression distance
        between query and title, 1 - edit distance between query and title,
        1 - average(maximum edit distance between a word from the query and
        every word from the title), last-word-of-query-present-in-title flag,
        ratio of words from the query that also occur in the title
    X3: Stanislav's features
    X4: Mikhail's features
    params list: [number of SVD components, C in SVR, gamma in SVR]
    '''
    if top_words == 10:
        X5 = np.loadtxt(config.path_features + 'train_ext_counts_top10.txt')
        X5_test = np.loadtxt(config.path_features + 'test_ext_counts_top10.txt')
        queries_ext = np.array(pd.read_csv(
            config.path_features + 'train_ext_top10.csv')['query'])
        queries_ext_test = np.array(pd.read_csv(
            config.path_features + 'test_ext_top10.csv')['query'])
    elif top_words == 15:
        X5 = np.loadtxt(config.path_features + 'train_ext_counts_top15.txt')
        X5_test = np.loadtxt(config.path_features + 'test_ext_counts_top15.txt')
        queries_ext = np.array(pd.read_csv(
            config.path_features + 'train_ext_top15.csv')['query'])
        queries_ext_test = np.array(pd.read_csv(
            config.path_features + 'test_ext_top15.csv')['query'])
    else:
        raise ValueError('top_words must be 10 or 15')

    df_train = pd.DataFrame(np.c_[queries_ext, titles],
                            columns=['query', 'product_title'])
    df_test = pd.DataFrame(np.c_[queries_ext_test, titles_test],
                           columns=['query', 'product_title'])
    train_qt = list(df_train.apply(
        lambda x: '%s %s' % (x['query'], x['product_title']), axis=1))
    test_qt = list(df_test.apply(
        lambda x: '%s %s' % (x['query'], x['product_title']), axis=1))

    tfv = text.TfidfVectorizer(min_df=10, max_features=None,
                               strip_accents='unicode', analyzer='char',
                               token_pattern=r'\w{1,}', ngram_range=(1, 3),
                               use_idf=1, smooth_idf=1, sublinear_tf=1,
                               stop_words='english')
    tfv.fit(train_qt)
    X2 = tfv.transform(train_qt)
    X2_test = tfv.transform(test_qt)

    # Reduce the tf-idf matrix with SVD, stack all feature groups, then
    # min-max scale everything before the RBF SVR.
    svd = TruncatedSVD(n_components=params[0])
    mms = MinMaxScaler()
    X = np.c_[svd.fit_transform(X2), X1, X4, X3, X5]
    X_test = np.c_[svd.transform(X2_test), X1_test, X4_test, X3_test, X5_test]
    X = mms.fit_transform(X)
    X_test = mms.transform(X_test)

    # train model
    clf = SVR(C=params[1], gamma=params[2], cache_size=2048, kernel='rbf')
    clf.fit(X, y, sample_weight=weights)
    p = clf.predict(X_test)
    return p
# (The first statement of this snippet arrived truncated; a plausible
# objective for the optimizer, assuming x maps to C and y to gamma, is:)
def f(x, y):
    return np.mean(cross_val_score(SVR(C=x, gamma=y),
                                   x_train, y=y_train,
                                   scoring='neg_mean_squared_error', cv=2))

sexp = squaredExponential()
gp = GaussianProcess(sexp)
acq = Acquisition(mode='ExpectedImprovement')
param = OrderedDict()
param['x'] = ('cont', [1, 100])
param['y'] = ('cont', [1, 100])
gpgo = GPGO(gp, acq, f, param)
gpgo.run(max_iter=200)
best_x, best_y = gpgo.getResult()
print('best_x:', best_x)
print('best_y:', best_y)

# getResult() returns the best parameters keyed by name, so index by key.
model_SVR = SVR(C=best_x['x'], gamma=best_x['y'])
model_SVR.fit(x_train, y_train)
y_predict = model_SVR.predict(x_test)
RMSE_SVR = np.sqrt(mean_squared_error(y_test, y_predict))
R2_SVR = r2_score(y_test, y_predict)
MAE_SVR = median_absolute_error(y_test, y_predict)
print('****************' + 'SVR' + '****************')
print('RMSE_SVR:', RMSE_SVR)
print('R2_SVR:', R2_SVR)
print('MAE_SVR:', MAE_SVR)
# SVR

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('c:\\stock_market.csv')
X = dataset.iloc[:, 1:30].values
y = dataset.iloc[:, 30].values
#print(y)

# Splitting the dataset into the Training set and Test set
# (sklearn.cross_validation was removed; use sklearn.model_selection)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Fitting SVR to the dataset -- train on the training split only; fitting on
# all of X would leak the test rows into training
from sklearn.svm import SVR
regressor = SVR(kernel='linear')
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
print(" The Predicted Output is \n", y_pred)
print(" The Actual Values are \n", y_test)
dfTrain.drop('Expected', axis=1, inplace=True)
dfTrain.drop('Id', axis=1, inplace=True)

print "Training SVR with rbf Kernel..."
model.fit(dfTrain, yTrain)

print "Test Sets:", testSets
for i in testSets:
    print "Loading dataset basic_no_" + str(i) + "sur16 into testSets..."
    l_dfTest.append(pd.read_csv('./data/f_df_train_' + featuresSet + '_' + str(i) + 'sur16.csv',
                                index_col=0))
dfTest = pd.concat(l_dfTest)
dfTest.dropna(inplace=True)
yTest = dfTest.Expected
dfTest.drop('Expected', axis=1, inplace=True)
dfTest.drop('Id', axis=1, inplace=True)

print "Predicting values..."
pred = model.predict(dfTest)

print "Calculating MAE scores:"
scores = (np.abs(pred - yTest)).mean()
print scores
print ""

t_end = time.time()
print "Time to process: ", t_end - t_start
print "-----------------------------------------"
start_time = time.time()

# Load the model with the chosen hyperparameters
SVR_Model_chosen = SVR(kernel="rbf", epsilon=485.8971280256051,
                       C=371.8743120624125, gamma=76.01511064212842)

# Train the model
SVR_Model_chosen.fit(X_train_fs, Y_train.ravel())

# Report the time taken to train
print("--- %s seconds ---" % (time.time() - start_time))

# Training RMSE - calculate and print
Y_train_SVR = SVR_Model_chosen.predict(X_train_fs)
rmse = np.sqrt(mean_squared_error(Y_train, Y_train_SVR)) * 100
print(rf"The RMSE is {rmse:2.4f}%")

# ### Testing the model using Validation Data

# In[102]:

# Use the model to make predictions about the test data
Y_predict_SVR = SVR_Model_chosen.predict(X_test_fs)

# Generate a plot of actual and predicted values
plt.figure(figsize=[10, 4])
plt.plot(Y_test, "b-", label="Actual Value of Y")
plt.plot(Y_predict_SVR, "g*", label="Predicted Value of Y")
plt.title("Prediction using Support Vector Regressor")
################################
### SVR Regression ###
################################
# NOTE: sklearn's SVR does not scale features internally,
# so we standardize X and y ourselves.
ss_x = StandardScaler()
ss_y = StandardScaler()
X_scaled = ss_x.fit_transform(X)
y_scaled = ss_y.fit_transform(y.reshape(-1, 1)).ravel()  # SVR expects a 1-D target

svr_regressor = SVR(kernel="rbf")
svr_regressor.fit(X_scaled, y_scaled)

# Predict - since we did feature scaling, the query value 6.5 must be
# scaled/transformed as well
position_val = ss_x.transform([[6.5]])
pred_val_scaled = svr_regressor.predict(position_val)
# predict() returns a scaled value, so map it back with the inverse transform
svr_pred = ss_y.inverse_transform(pred_val_scaled.reshape(-1, 1))
print('The predicted salary of a person at 6.5 Level with Support Vector Regression is',
      svr_pred)

################################
### Decision Tree Regression ###
################################
tree_regressor = DecisionTreeRegressor(criterion="mse")
tree_regressor.fit(X, y)

# Predict
tree_pred = tree_regressor.predict([[6.5]])
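# The same scale / predict / inverse-transform round trip can be written more
# compactly by letting sklearn manage both scalers; a sketch assuming the same
# X and y as above:
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

scaled_svr = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR(kernel="rbf")),
    transformer=StandardScaler())  # scales y and inverse-transforms predictions
scaled_svr.fit(X, y)
print('Pipeline prediction at 6.5:', scaled_svr.predict([[6.5]]))  # original units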
def hyperopt_obj(param, feat_folder, feat_name, trial_counter):
    global loaded, split_data
    if loaded is None:
        # NOTE: `loaded` is never set afterwards, so the data is reloaded on
        # every call; X_train_all/labels_train_all must survive to the
        # retraining section below.
        split_data, X_train_all, labels_train_all = load_data(1, 1)
    log_loss_cv = np.zeros((split_data.shape[0], split_data.shape[1]), dtype=float)
    year = datetime.datetime.now().year

    # ----------------------------------------------------------------------
    # The per-run/per-fold cross-validation training loop is disabled in this
    # version. It mirrored the retraining dispatch below (xgboost tasks and
    # the sklearn RF/ETR/GBM/LR/SVR/Ridge/Lasso, libfm, keras and rgf
    # branches) but predicted on the validation folds, bagged the fold
    # predictions, and accumulated per-fold log-loss into log_loss_cv.
    # ----------------------------------------------------------------------
    ####################
    #### Retraining ####
    ####################
    #### all the paths
    path = "%s/All" % (feat_folder)
    save_path = "%s/All" % output_path
    subm_path = "%s/Subm" % output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(subm_path):
        os.makedirs(subm_path)
    # feat
    feat_train_path = "%s/train.feat" % path
    feat_test_path = "%s/test.feat" % path
    # weight
    # weight_train_path = "%s/train.feat.weight" % path
    # info
    info_train_path = "%s/train.info" % path
    info_test_path = "%s/test.info" % path
    # cdf
    # cdf_test_path = "%s/test.cdf" % path
    # raw prediction path (rank)
    raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d]_[Run_Time@%s].csv" % (
        save_path, feat_name, trial_counter,
        time.strftime("%Y%m%d%H%M%S", time.localtime()))
    rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (
        save_path, feat_name, trial_counter)
    # submission path (is_duplicate as in [0, 1])
    # subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (subm_path, feat_name, trial_counter, log_loss_cv_mean, log_loss_cv_std)

    #### load data
    ## load feat
    # X_train, labels_train = load_svmlight_file(feat_train_path)
    X_train, labels_train = X_train_all, labels_train_all
    print('X_train_all.shape:', X_train_all.shape)
    print('labels_train_all.mean:', np.mean(labels_train_all))
    X_test, labels_test = load_svmlight_file(feat_test_path)
    # if X_test.shape[1] < X_train.shape[1]:
    #     X_test = hstack([X_test, np.zeros((X_test.shape[0], X_train.shape[1] - X_test.shape[1]))])
    # elif X_test.shape[1] > X_train.shape[1]:
    #     X_train = hstack([X_train, np.zeros((X_train.shape[0], X_test.shape[1] - X_train.shape[1]))])
    # X_train = X_train.tocsr()
    # X_test = X_test.tocsr()
    # shrink the training set so the train/test proportions stay consistent

    ## load train weight
    # weight_train = np.loadtxt(weight_train_path, dtype=float)
    ## load test info
    info_train = pd.read_csv(info_train_path)
    numTrain = X_train.shape[0]
    info_test = pd.read_csv(info_test_path)
    numTest = info_test.shape[0]
    id_test = info_test["test_id"]
    numValid = info_test.shape[0]
    ## load cdf
    # cdf_test = np.loadtxt(cdf_test_path, dtype=float)
    # ## evaluation functions
    # evalerror_regrank_test = lambda preds, dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test)
    # evalerror_softmax_test = lambda preds, dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_test)
    # evalerror_softkappa_test = lambda preds, dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_test)
    # evalerror_ebc_test = lambda preds, dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_test, ebc_hard_threshold)
    # evalerror_cocr_test = lambda preds, dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_test)

    ## bagging: average the test predictions over bagging_size bootstrap resamples
    preds_bagging = np.zeros((numTest, bagging_size), dtype=float)
    for n in range(bagging_size):
        print("", n, " runs training start")
        rng = np.random.RandomState(datetime.datetime.now().year + 1000 * n + 10 * 1)
        if bootstrap_replacement:
            sampleSize = int(numTrain * bootstrap_ratio)
            # index_meta = rng.randint(numTrain, size=sampleSize)
            # index_base = [i for i in range(numTrain) if i not in index_meta]
            index_base = rng.randint(numTrain, size=sampleSize)
            index_meta = [i for i in range(numTrain) if i not in index_base]
        else:
            randnum = rng.uniform(size=numTrain)
            index_base = [i for i in range(numTrain) if randnum[i] < bootstrap_ratio]
            index_meta = [i for i in range(numTrain) if randnum[i] >= bootstrap_ratio]

        # if using xgboost, first convert the data into xgboost's DMatrix format
        if "booster" in param:
            dtest = xgb.DMatrix(X_test, label=labels_test)
            dtrain = xgb.DMatrix(X_train[index_base],
                                 label=labels_train[index_base])  # , weight=weight_train[index_base]
            watchlist = []
            if verbose_level >= 2:
                watchlist = [(dtrain, 'train')]

        ## train
        if param["task"] in ["regression", "ranking"]:
            bst = xgb.train(param, dtrain, param['num_round'], watchlist)  # , feval=evalerror_regrank_test
            pred = bst.predict(dtest)

        elif param["task"] in ["classification"]:
            ## regression & pairwise ranking with xgboost
            bst = xgb.train(param, dtrain, param['num_round'], watchlist)  # , feval=evalerror_softmax_test
            pred = bst.predict(dtest)

        elif param["task"] in ["softmax"]:
            bst = xgb.train(param, dtrain, param['num_round'], watchlist)  # , feval=evalerror_softmax_test
            pred = bst.predict(dtest)
            w = np.asarray(range(1, numValid))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param["task"] in ["softkappa"]:
            # custom objective
            # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
            bst = xgb.train(param, dtrain, param['num_round'], watchlist)  # , obj=obj, feval=evalerror_softkappa_test
            pred = softmax(bst.predict(dtest))
            w = np.asarray(range(1, numValid))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param["task"] in ["ebc"]:
            # custom objective
            # obj = lambda preds, dtrain: ebcObj(preds, dtrain)
            bst = xgb.train(param, dtrain, param['num_round'], watchlist)  # , obj=obj, feval=evalerror_ebc_test
            pred = sigmoid(bst.predict(dtest))
            pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)

        elif param["task"] in ["cocr"]:
            # custom objective
            obj = lambda preds, dtrain: cocrObj(preds, dtrain)
            bst = xgb.train(param, dtrain, param['num_round'], watchlist)  # , obj=obj, feval=evalerror_cocr_test
            pred = bst.predict(dtest)
            pred = applyCOCRRule(pred)

        elif param['task'] == "reg_skl_rf":
            ## random forest regressor
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
            pred = rf.predict(X_test)

        elif param['task'] == "reg_skl_etr":
            ## extra trees regressor
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
            pred = etr.predict(X_test)

        elif param['task'] == "reg_skl_gbm":
            ## gradient boosting regressor
            gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
                                            max_features=param['max_features'],
                                            learning_rate=param['learning_rate'],
                                            max_depth=param['max_depth'],
                                            subsample=param['subsample'],
                                            random_state=param['random_state'])
            gbm.fit(X_train.toarray()[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
            pred = gbm.predict(X_test.toarray())

        elif param['task'] == "clf_skl_lr":
            lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5,
                                    C=param['C'], fit_intercept=True,
                                    intercept_scaling=1.0, class_weight='auto',
                                    random_state=param['random_state'])
            lr.fit(X_train[index_base], labels_train[index_base])
            pred = lr.predict_proba(X_test)
            w = np.asarray(range(1, numValid))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param['task'] == "reg_skl_svr":
            ## regression with sklearn support vector regression
            X_train, X_test = X_train.toarray(), X_test.toarray()
            scaler = StandardScaler()
            X_train[index_base] = scaler.fit_transform(X_train[index_base])
            X_test = scaler.transform(X_test)
            svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
                      degree=param['degree'], kernel=param['kernel'])
            svr.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
            pred = svr.predict(X_test)

        elif param['task'] == "reg_skl_ridge":
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
            pred = ridge.predict(X_test)

        elif param['task'] == "reg_skl_lasso":
            lasso = Lasso(alpha=param["alpha"], normalize=True)
            lasso.fit(X_train[index_base], labels_train[index_base])
            pred = lasso.predict(X_test)

        elif param['task'] == 'reg_libfm':
            ## to array
            X_train, X_test = X_train.toarray(), X_test.toarray()
            ## scale
            scaler = StandardScaler()
            X_train[index_base] = scaler.fit_transform(X_train[index_base])
            X_test = scaler.transform(X_test)

            ## dump feat
            dump_svmlight_file(X_train[index_base], labels_train[index_base],
                               feat_train_path + ".tmp")
            dump_svmlight_file(X_test, labels_test, feat_test_path + ".tmp")

            ## train fm
            cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % (
                libfm_exe, feat_train_path + ".tmp", feat_test_path + ".tmp",
                raw_pred_test_path, param['dim'], param['iter'])
            os.system(cmd)
            os.remove(feat_train_path + ".tmp")
            os.remove(feat_test_path + ".tmp")

            ## extract libfm prediction
            pred = np.loadtxt(raw_pred_test_path, dtype=float)
            ## labels are in [0,1,2,3]
            pred += 1

        # elif param['task'] == "reg_keras_dnn":
        #     ## regression with keras deep neural networks
        #     model = Sequential()
        #     ## input layer
        #     model.add(Dropout(param["input_dropout"]))
        #     ## hidden layers
        #     first = True
        #     hidden_layers = param['hidden_layers']
        #     while hidden_layers > 0:
        #         if first:
        #             dim = X_train.shape[1]
        #             first = False
        #         else:
        #             dim = param["hidden_units"]
        #         model.add(Dense(dim, param["hidden_units"], init='glorot_uniform'))
        #         if param["batch_norm"]:
        #             model.add(BatchNormalization((param["hidden_units"],)))
        #         if param["hidden_activation"] == "prelu":
        #             model.add(PReLU((param["hidden_units"],)))
        #         else:
        #             model.add(Activation(param['hidden_activation']))
        #         model.add(Dropout(param["hidden_dropout"]))
        #         hidden_layers -= 1
        #
        #     ## output layer
        #     model.add(Dense(param["hidden_units"], 1, init='glorot_uniform'))
        #     model.add(Activation('linear'))
        #
        #     ## loss
        #     model.compile(loss='mean_squared_error', optimizer="adam")
        #
        #     ## to array
        #     X_train = X_train.toarray()
        #     X_test = X_test.toarray()
        #
        #     ## scale
        #     scaler = StandardScaler()
        #     X_train[index_base] = scaler.fit_transform(X_train[index_base])
        #     X_test = scaler.transform(X_test)
        #
        #     ## train
        #     model.fit(X_train[index_base], labels_train[index_base],
        #               nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], verbose=0)
        #
        #     ## prediction
        #     pred = model.predict(X_test, verbose=0)
        #     pred.shape = (X_test.shape[0],)

        elif param['task'] == "reg_rgf":
            ## regression with regularized greedy forest (rgf); to array first
            X_train, X_test = X_train.toarray(), X_test.toarray()

            train_x_fn = feat_train_path + ".x"
            train_y_fn = feat_train_path + ".y"
            test_x_fn = feat_test_path + ".x"
            test_pred_fn = feat_test_path + ".pred"

            model_fn_prefix = "rgf_model"

            np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t')
            np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t')
            np.savetxt(test_x_fn, X_test, fmt="%.6f", delimiter='\t')
            # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')

            pars = [
                "train_x_fn=", train_x_fn, "\n",
                "train_y_fn=", train_y_fn, "\n",
                # "train_w_fn=", weight_train_path, "\n",
                "model_fn_prefix=", model_fn_prefix, "\n",
                "reg_L2=", param['reg_L2'], "\n",
                # "reg_depth=", 1.01, "\n",
                "algorithm=", "RGF", "\n",
                "loss=", "LS", "\n",
                "test_interval=", param['max_leaf_forest'], "\n",
                "max_leaf_forest=", param['max_leaf_forest'], "\n",
                "num_iteration_opt=", param['num_iteration_opt'], "\n",
                "num_tree_search=", param['num_tree_search'], "\n",
                "min_pop=", param['min_pop'], "\n",
                "opt_interval=", param['opt_interval'], "\n",
                "opt_stepsize=", param['opt_stepsize'], "\n",
                "NormalizeTarget"
            ]
            pars = "".join([str(p) for p in pars])

            rfg_setting_train = "./rfg_setting_train"
            with open(rfg_setting_train + ".inp", "w") as f:  # text mode; pars is a str
                f.write(pars)

            ## train rgf
            cmd = "perl %s %s train %s >> rgf.log" % (call_exe, rgf_exe, rfg_setting_train)
            # print(cmd)
            os.system(cmd)
model_fn = model_fn_prefix + "-01" pars = [ "test_x_fn=", test_x_fn, "\n", "model_fn=", model_fn, "\n", "prediction_fn=", test_pred_fn ] pars = "".join([str(p) for p in pars]) rfg_setting_test = "./rfg_setting_test" with open(rfg_setting_test + ".inp", "wb") as f: f.write(pars) cmd = "perl %s %s predict %s >> rgf.log" % (call_exe, rgf_exe, rfg_setting_test) #print cmd os.system(cmd) pred = np.loadtxt(test_pred_fn, dtype=float) ## weighted averageing over different models pred_test = pred # if abs(np.mean(pred_test) - 0.17426) < 0.1: preds_bagging[:, n] = pred_test print('pred_test mean:', np.mean(pred_test)) # 去掉误差太大的 # cols = [] # for col in range(0, preds_bagging.shape[1]): # if abs(np.mean(preds_bagging[:, col]) - 0.17426) < 0.1: # cols.append(col) # if len(cols) > 0: pred_raw = np.mean(preds_bagging, axis=1) # pred_rank = pred_raw.argsort().argsort() # ## write output = pd.DataFrame({"test_id": id_test, "is_duplicate": pred_raw}) output.to_csv(raw_pred_test_path, index=False)
X = df.drop(['Total_Feeder'], axis=1).values
y = df['Total_Feeder'].values

tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# train_split = 0.9
# num_train = int(len(X) * 0.9)
# X_train = X[:num_train]
# X_test = X[num_train:]
#
# y_train = y[:num_train]
# y_test = y[num_train:]

# note: X_train/X_test here are whatever the *last* CV fold left behind
svr = SVR(kernel='rbf', C=40, gamma='auto')
svr.fit(X_train, y_train)

accuracy = svr.score(X_test, y_test)
print(accuracy)  # only about 20% (R^2 on the last fold)

predictions = svr.predict(X)
df['Prediction'] = predictions
df['Total_Feeder'].plot()
df['Prediction'].plot()
plt.show()
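# The TimeSeriesSplit loop above overwrites X_train/X_test on every iteration,
# so only the final fold is actually fitted and scored. A minimal sketch (same
# data, fold count assumed) that scores every chronological fold instead:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.svm import SVR

scores = cross_val_score(SVR(kernel='rbf', C=40, gamma='auto'), X, y,
                         cv=TimeSeriesSplit(n_splits=5), scoring='r2')
print(scores, scores.mean())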
class SVMRegression():
    def __init__(self, dataset, type_of_kernel='rbf', cross_validation_type='loo',
                 split_for_validation=None, dataset_validation=None, svm_random_state=1,
                 svm_degree=3, svm_gamma='scale', svm_coef=0.0, svm_tol=1e-10,
                 svm_epsilon=0.1, svm_max_iter=-1):
        self.dataset = dataset
        self.kernel = type_of_kernel
        self.cross_validation_type = cross_validation_type
        self.split_for_validation = split_for_validation
        self.dataset_validation = dataset_validation
        self.svm_random_state = svm_random_state
        self.degree = svm_degree
        self.gamma = svm_gamma
        self.coef0 = svm_coef
        self.tol = svm_tol
        self.epsilon = svm_epsilon
        self.max_iter = svm_max_iter

        self._xCal = pd.DataFrame()
        self._xVal = pd.DataFrame()
        self._yCal = pd.DataFrame()
        self._yVal = pd.DataFrame()
        self._cv = None
        self.metrics = {}

        # checking that the parameters were passed correctly
        if not isinstance(self.dataset, pd.DataFrame):
            raise ValueError('The dataset should be a pd.DataFrame.')
        if (self.dataset_validation is None) and (self.split_for_validation is None):
            raise ValueError('Either split_for_validation or dataset_validation must be defined.')

        # x = dataset.iloc[:, 2:]
        # y = dataset.iloc[:, 1]

        if (self.split_for_validation is not None) and (self.dataset_validation is None):
            if self.split_for_validation == 'all':
                self._xCal = self.dataset.iloc[:, 2:]
                self._yCal = self.dataset.iloc[:, 1]
            elif isinstance(self.split_for_validation, float):
                self._xCal, self._xVal, self._yCal, self._yVal = train_test_split(
                    self.dataset.iloc[:, 2:], self.dataset.iloc[:, 1],
                    test_size=split_for_validation, random_state=self.svm_random_state)
            else:
                raise ValueError("split_for_validation must be a float between 0 and 1 "
                                 "for splitting the dataset, or 'all' to calibrate with "
                                 "every sample of the dataset.")

        if self.dataset_validation is not None:
            if isinstance(self.dataset_validation, pd.DataFrame):
                self._xCal = self.dataset.iloc[:, 2:]
                self._yCal = self.dataset.iloc[:, 1]
                self._xVal = self.dataset_validation.iloc[:, 2:]
                self._yVal = self.dataset_validation.iloc[:, 1]
            else:
                raise ValueError("dataset_validation must be a pd.DataFrame.")

        if isinstance(cross_validation_type, str) and (cross_validation_type == "loo"):
            self._cv = LeaveOneOut()
        elif (type(cross_validation_type) in [int]) and (cross_validation_type > 0):
            self._cv = KFold(cross_validation_type, shuffle=True, random_state=self.svm_random_state)
        else:
            raise ValueError("cross_validation_type should be a positive integer for the "
                             "k-fold method or 'loo' for leave-one-out cross-validation.")

    def search_hyperparameters(self, kernel=['rbf'], degree=[3], gamma=['scale'],
                               coef0=[0.0, 0.1], epsilon=[0.1, 2.0], tol=[1e-3, 1e-10],
                               max_iter=[-1], n_processors=1, verbose=0,
                               scoring='neg_root_mean_squared_error'):
        step_value = lambda list_of_values: 0.5 if (len(list_of_values) < 3) else list_of_values[2]
        epsilon = [round(x, 3) for x in np.arange(start=epsilon[0], stop=epsilon[1], step=step_value(epsilon))]
        coef0 = [round(x, 3) for x in np.arange(start=coef0[0], stop=coef0[1], step=step_value(coef0))]

        random_grid = {
            "kernel": kernel,
            "degree": degree,
            "gamma": gamma,
            "coef0": coef0,
            "epsilon": epsilon,
            "max_iter": max_iter,
            "tol": tol
        }

        svm_regression = SVR()
        svm_regresion_grid = GridSearchCV(estimator=svm_regression, param_grid=random_grid,
                                          cv=self._cv, n_jobs=n_processors,
                                          verbose=verbose, scoring=scoring)
        svm_regresion_grid.fit(self._xCal, self._yCal)

        get_params = lambda dict_params, param, default_params: dict_params[param] if (param in dict_params) else default_params

        self._best_params = svm_regresion_grid.best_params_
        self.kernel = get_params(svm_regresion_grid.best_params_, 'kernel', self.kernel)
        self.degree = get_params(svm_regresion_grid.best_params_, 'degree', self.degree)
        self.gamma = get_params(svm_regresion_grid.best_params_, 'gamma', self.gamma)
        self.coef0 = get_params(svm_regresion_grid.best_params_, 'coef0', self.coef0)
        self.tol = get_params(svm_regresion_grid.best_params_, 'tol', self.tol)
        self.epsilon = get_params(svm_regresion_grid.best_params_, 'epsilon', self.epsilon)
        self.max_iter = get_params(svm_regresion_grid.best_params_, 'max_iter', self.max_iter)

    def calibrate(self):
        self.model = SVR(kernel=self.kernel, degree=self.degree, gamma=self.gamma,
                         coef0=self.coef0, tol=self.tol, epsilon=self.epsilon,
                         max_iter=self.max_iter)
        self.model.fit(self._xCal, self._yCal)

        y_cal_predict = self.model.predict(self._xCal)
        r_correlation = np.corrcoef(self._yCal, y_cal_predict)[0][1]
        r2_cal = self.model.score(self._xCal, self._yCal)
        rmse = mean_squared_error(self._yCal, y_cal_predict, squared=False)
        nsamples = self._xCal.shape[0]
        calibration_metrics = {'n_samples': nsamples, 'R': r_correlation, 'R2': r2_cal, 'RMSE': rmse}
        self.metrics['calibration'] = calibration_metrics

    def cross_validate(self):
        r_correlation, r2_cv, rmse_cv, bias, predicted_values = cross_validation(
            self.model, self._xCal, self._yCal, self._cv, correlation_based=False)
        method = 'Leave One Out'
        if isinstance(self._cv, KFold):
            method = "{}-fold".format(self._cv.n_splits)
        cross_validation_metrics = {'R': r_correlation, 'R2': r2_cv, 'RMSE': rmse_cv,
                                    'bias': bias, 'method': method,
                                    'predicted_values': predicted_values}
        self.metrics['cross_validation'] = cross_validation_metrics

    def validate(self):
        r_correlation, r2_ve, rmse_ve, bias, predicted_values = external_validation(
            self.model, self._xVal, self._yVal, correlation_based=False)
        nsamples = self._xVal.shape[0]
        validation = {'R': r_correlation, 'R2': r2_ve, 'RMSE': rmse_ve, 'bias': bias,
                      'n_samples': nsamples, 'predicted_values': predicted_values}
        self.metrics['validation'] = validation

    def create_model(self):
        self.calibrate()
        self.cross_validate()
        self.validate()
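# A hypothetical usage sketch for the class above, assuming a DataFrame laid
# out the way the iloc slicing expects (column 0: sample id, column 1: target,
# columns 2+: features); 'data.csv' and its layout are assumptions:
import pandas as pd

dataset = pd.read_csv('data.csv')
model = SVMRegression(dataset, type_of_kernel='rbf',
                      cross_validation_type=5, split_for_validation=0.3)
model.search_hyperparameters(gamma=['scale', 'auto'], epsilon=[0.1, 1.0])
model.create_model()  # calibrate + cross-validate + validate
print(model.metrics['validation'])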
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())  # ravel to pass a 1-D target

# Predicting a new result
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

"""# Applying k-Fold Cross Validation (model evaluation)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor, X = X, y = y, cv = 10)
accuracies.mean()
accuracies.std()"""

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_confidence = lr.score(x_test, y_test)
lr_confidence = "{:.2%}".format(lr_confidence)
print("LR confidence: ", lr_confidence)

# Kernel Ridge Regression:
kridge = KernelRidge(alpha=1.0)
kridge.fit(x_train, y_train)
kridge_confidence = kridge.score(x_test, y_test)
kridge_confidence = "{:.2%}".format(kridge_confidence)
print("Kernel Ridge confidence: ", kridge_confidence)

x_forecast = np.array(df.drop(['Prediction'], axis=1))[-forecast_out:]
lr_prediction = lr.predict(x_forecast)
svm_prediction = svr_rbf.predict(x_forecast)
kridge_prediction = kridge.predict(x_forecast)

last_date = datetime.datetime.strptime(df_full['Date'].iloc[-1], '%Y-%m-%d')
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)

df_full['LR Prediction'] = np.NaN
df_full['SVM Prediction'] = np.NaN
df_full['Ridge Prediction'] = np.NaN

for i, j, k in zip(lr_prediction, svm_prediction, kridge_prediction):
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    next_date_str = next_date.strftime('%Y-%m-%d')
    df_full.loc[len(df_full)] = [
        next_date_str, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, i, j, k
    ]
features_train = train.iloc[:, :175]   # .iloc instead of the removed .ix accessor
target_train = train.iloc[:, [176]]
features_test = test.iloc[:, :175]
target_test = test.iloc[:, [176]]

# convert to numpy arrays
features_train_np = np.array(features_train)
target_train_np = np.array(target_train)
features_test_np = np.array(features_test)
target_test_np = np.array(target_test)

# Our dataset and targets
X = features_train_np
y = target_train_np

# impute missing values in the features, and (unusually) in the targets too
imp.fit(X)
X = imp.transform(X)
imp.fit(y)
y = imp.transform(y).ravel()
print X.shape, y.shape

clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X, y)
clf.score(X, y)
predicted = clf.predict(features_test_np)
svr_model.fit(x, y)
print(" finished at", datetime.now())
print(" sv = ", svr_model.n_support_)

print("Generating plot...")
plt.figure(figsize=(14, 5), dpi=100)
plt.plot(pd.to_datetime(dates), y, marker='x', markersize=3, linewidth=0,
         label=selected_feature + " - actual data")
plt.plot(pd.to_datetime(dates[:-3]),
         np.convolve(np.pad(y, 3, mode='edge'), np.ones(7) / 7, mode='valid')[:-3],
         alpha=0.5, label=selected_feature + " - 7 days moving average")
#plt.plot(pd.to_datetime(dates), np.convolve(np.pad(y, 7, mode='edge'), np.ones(15) / 15, mode='valid'),
#         alpha=0.5, label=selected_feature + " - 15-days window mean")
plt.plot(pd.to_datetime(dates)[:-15],
         np.convolve(np.pad(y, 15, mode='edge'), np.ones(31) / 31, mode='valid')[:-15],
         alpha=0.5, label=selected_feature + " - 31 days moving average")
plt.plot(pd.to_datetime(dates), svr_model.predict(x), color='red',
         label=selected_feature + " - SVR model")

days_to_predict = 14
x_pred = np.arange(x[-1][0], x[-1][0] + days_to_predict).reshape(-1, 1)
x_pred_dates = [pd.to_datetime(dates)[-1] + pd.Timedelta(i, 'day') for i in range(0, days_to_predict)]
plt.plot(x_pred_dates, svr_model.predict(x_pred), ':', color='red',
         label=selected_feature + " - SVR prediction")

plt.legend()
plt.gcf().savefig('Figura-4-SVR.pdf')
print("Saving and opening export folder...")
os.system("open ./")
#plt.show()
plt.close()
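# The np.convolve/np.pad smoothing above can also be written with pandas'
# rolling mean; a minimal sketch (same y series, window sizes as above):
import pandas as pd

smooth7 = pd.Series(y).rolling(7, center=True, min_periods=1).mean()
smooth31 = pd.Series(y).rolling(31, center=True, min_periods=1).mean()
plt.plot(pd.to_datetime(dates), smooth7, alpha=0.5, label="7 days rolling mean")
plt.plot(pd.to_datetime(dates), smooth31, alpha=0.5, label="31 days rolling mean")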
def WZ_result(X1, y1, X, y, wz):
    #X = X.values
    y = y.values
    rmses = []
    rf = []
    loo = LeaveOneOut()
    for train, test in loo.split(X):
        train_X, test_X, train_y, test_y = X[train], X[test], y[train], y[test]
        clf = SVR(kernel='rbf', C=10, gamma=0.01)
        clf.fit(train_X, train_y)
        predicted = clf.predict(test_X)
        rmse = mean_squared_error(test_y, predicted) ** 0.5
        rmses.append(rmse)
        rf.append(clf)
    index = rmses.index(min(rmses))
    predict = rf[index].predict(X1)

    # rmse, mae, r2
    r2 = r2_score(y1, predict)
    print("R2", r2)
    mae = mean_absolute_error(y1, predict)
    print("mae", mae)
    rmse = mean_squared_error(y1, predict) ** 0.5
    print("rmse", rmse)

    figsize = 9, 9
    figure, ax = plt.subplots(figsize=figsize)
    p0, = plt.plot([0, 8], [0, 8], '--', color='black', label='line', linewidth=1.0)
    color = ['limegreen', 'mediumslateblue', 'cyan', 'gold']
    marker = ['*', 'o', 'd', '<']
    p1 = plt.scatter(y1[0:12], predict[0:12], c=color[0], marker=marker[0], label='NQ1', s=280, edgecolors='black')
    p2 = plt.scatter(y1[12:24], predict[12:24], c=color[1], marker=marker[1], label='NQ5', s=150, edgecolors='black')
    p3 = plt.scatter(y1[24:36], predict[24:36], c=color[2], marker=marker[2], label='NQ7', s=180, edgecolors='black')
    p4 = plt.scatter(y1[36:48], predict[36:48], c=color[3], marker=marker[3], label='NNQ9', s=180, edgecolors='black')

    ############# set tick label size and font #############
    #plt.xlim(3, 7.5)
    #plt.ylim(3, 7.5)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.tick_params(labelsize=18)
    labels = ax.get_yticklabels()
    [label.set_fontname('Times New Roman') for label in labels]
    labels = ax.get_xticklabels()
    [label.set_fontname('Times New Roman') for label in labels]

    ############# configure the legend font and size #############
    font1 = {
        'family': 'Times New Roman',
        'weight': 'normal',
        'size': 25,
    }
    plt.legend(prop=font1, frameon=False)  # legend
    plt.ylabel(' (g / 100g)', font1)
    plt.xlabel(' (g / 100g)', font1)
    ax = plt.gca()
    ax.set_aspect(1)
    #plt.savefig('C:\\Users\\shaoqi\\Desktop\\' + wz + '.eps', dpi=2000)
    plt.show()
    return predict
X = np.array(ct.fit_transform(X))

# SPLITTING THE DATA
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# FEATURE SCALING
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train[:, 42:] = sc_X.fit_transform(X_train[:, 42:])
X_test[:, 42:] = sc_X.transform(X_test[:, 42:])
# note: StandardScaler expects y as a 2-D (n_samples, 1) column here
y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

# TRAIN THE SVR MODEL
regressor = SVR(kernel='rbf')  # rbf = radial basis function (non-linear kernel)
regressor.fit(X_train, y_train)

# PREDICTING TEST SET RESULTS
y_pred = sc_y.inverse_transform(regressor.predict(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

# EVALUATING THE SVR PERFORMANCE
print(r2_score(sc_y.inverse_transform(y_test), y_pred))
'''
Support vector regression.
Before using SVR it is recommended to standardize or normalize X and Y
(see ScaleTransform.py under DataPreprocessing); standardization is used below.
Tunable SVR parameters:
kernel: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
C: penalty term -- the larger it is, the easier it is to overfit
gamma: larger -> fewer support vectors; smaller -> more support vectors
'''

# Standardization
from sklearn.preprocessing import StandardScaler
scaler_X = StandardScaler()
scaler_Y = StandardScaler()
X = scaler_X.fit_transform(X)
Y = scaler_Y.fit_transform(Y.reshape(-1, 1))

# SVR training
from sklearn.svm import SVR
SVR_regressor = SVR(kernel='rbf')
SVR_regressor.fit(X, Y.ravel())  # fit on the scaled training matrix (was fit(X_train, Y), a mismatch)

# SVR prediction -- remember to inverse-transform the predicted Y
X_test = scaler_X.transform(X_test)
y_pred = SVR_regressor.predict(X_test)
y_pred = scaler_Y.inverse_transform(y_pred.reshape(-1, 1))
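# A quick synthetic check (all values assumed) of the note above about gamma:
# count the support vectors retained at several gamma settings.
import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(0)
X_demo = rng.uniform(-3, 3, size=(200, 1))
y_demo = np.sin(X_demo).ravel() + 0.1 * rng.randn(200)
for gamma in (0.01, 0.1, 1.0, 10.0):
    model = SVR(kernel='rbf', C=1.0, gamma=gamma).fit(X_demo, y_demo)
    print(gamma, len(model.support_))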
# temp_max2 holds the maximum temperatures for the cities in dist2
temp_max2 = temp_max[5:10]

# Call SVR with a linear kernel, and set C to 1000 to fit the data tightly
# (no precise prediction is needed, so overfitting is not a concern)
svr_lin1 = SVR(kernel='linear', C=1e3)
svr_lin2 = SVR(kernel='linear', C=1e3)

# fit the data (this step may run for a while, roughly 10 minutes -- take a break :) )
svr_lin1.fit(dist1, temp_max1)
svr_lin2.fit(dist2, temp_max2)

# see the discussion after the code for details on reshape
xp1 = np.arange(10, 100, 10).reshape((9, 1))
xp2 = np.arange(50, 400, 50).reshape((7, 1))
yp1 = svr_lin1.predict(xp1)
yp2 = svr_lin2.predict(xp2)

# restrict the x-axis range
ax.plot(xp1, yp1, c='b', label='Strong sea effect')
ax.plot(xp2, yp2, c='g', label='Light sea effect')
fig

print(svr_lin1.coef_)       # slope
print(svr_lin1.intercept_)  # intercept
print(svr_lin2.coef_)
print(svr_lin2.intercept_)
# defines the first fitted line
import numpy as np
from sklearn.svm import SVR

X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# y = 1 * x_0 + 2 * x_1 + 3
y = np.dot(X, np.array([1, 2])) + 3

model = SVR()
model.fit(X, y)
pred = model.predict(X)
print(pred)
for max_depth in range(1, 51):
    for min_samples_split in range(2, 102, 5):  # min_samples_split must be >= 2
        tree = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)
        tree.fit(X_train, y_train)
        prediction = tree.predict(X_test)
        mae.append(mean_absolute_error(y_test, prediction))
print "Minimum MAE TREE = ", min(mae)
test_maes_dictionary["Dec. Tree"] = min(mae)

## SUPPORT VECTOR MACHINE TRAINING
mae = []
for kernel in ["rbf", "linear", "poly", "sigmoid"]:
    svr = SVR(kernel=kernel)
    svr.fit(X_train, y_train)
    prediction = svr.predict(X_test)
    mae.append(mean_absolute_error(y_test, prediction))
print "Minimum MAE SVR = ", min(mae)
test_maes_dictionary["SVM"] = min(mae)

## RANDOM FOREST TRAINING
mae = []
for n_estimators in range(10, 1100, 100):
    rf = RandomForestRegressor(n_estimators=n_estimators)
    rf.fit(X_train, y_train)
    prediction = rf.predict(X_test)
    mae.append(mean_absolute_error(y_test, prediction))
print "Minimum MAE R.Forest = ", min(mae)
test_maes_dictionary["R. Forest"] = min(mae)

#############################################################################################
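# The hand-rolled sweeps above can be expressed with GridSearchCV, which
# cross-validates on the training set instead of selecting on the test set
# (the cv/scoring settings here are assumptions, not part of the original):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

grid = GridSearchCV(SVR(), {"kernel": ["rbf", "linear", "poly", "sigmoid"]},
                    scoring="neg_mean_absolute_error", cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)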
y = dataset.iloc[:, -1].values
y = y.reshape(len(y), 1)  # StandardScaler expects a 2-D column

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X = sc_X.fit_transform(X)
sc_y = StandardScaler()
y = sc_y.fit_transform(y)

from sklearn.svm import SVR
svr = SVR(kernel='rbf')
svr.fit(X, y.ravel())

# now try to predict level 6.5
x_interest = sc_X.transform([[6.5]])
y_interest = svr.predict(x_interest)

# now inverse-transform y
final_y = sc_y.inverse_transform(y_interest.reshape(-1, 1))

# scatter the real values, plot the predictions
X_org = dataset.iloc[:, 1:-1].values
plt.scatter(X_org, sc_y.inverse_transform(y), color='red')
plt.plot(X_org, sc_y.inverse_transform(svr.predict(X).reshape(-1, 1)), color='blue')
plt.show()

# same thing, but with smooth lines
X_plot = np.arange(min(X_org), max(X_org), 0.1)
X_plot = X_plot.reshape(len(X_plot), 1)
plt.scatter(X_org, sc_y.inverse_transform(y), color='red')
plt.plot(X_plot, sc_y.inverse_transform(svr.predict(sc_X.transform(X_plot)).reshape(-1, 1)), color='blue')
plt.show()
def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
    model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
    predictions = model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)
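# tune_cv's signature matches what optunity's cross_validated decorator
# expects; a sketch of the usual wiring (fold count and search box are
# assumptions, not part of the original snippet):
import optunity
import optunity.metrics
from sklearn.svm import SVR

@optunity.cross_validated(x=X, y=y, num_folds=5)
def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
    model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
    predictions = model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

optimal_pars, _, _ = optunity.minimize(tune_cv, num_evals=100,
                                       C=[1, 100], gamma=[0.001, 1.0])
print(optimal_pars)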
# Scale X
sc_X = StandardScaler()
X = sc_X.fit_transform(X)

# Scale y (StandardScaler needs a 2-D column)
sc_y = StandardScaler()
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()

# Fitting SVR to the dataset
# ==========================
from sklearn.svm import SVR
regressor = SVR(kernel="rbf", gamma="scale")
regressor.fit(X, y)

# Predicting the new result
# =========================
temp_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(temp_pred.reshape(-1, 1))

# Visualising the SVR results
# ===========================
# Create higher precision for X axis.
x_grid = np.arange(min(X), max(X), 0.1)
x_grid = x_grid.reshape((len(x_grid), 1))
plt.scatter(X, y, color="red")
# Use x_grid in place of X for smoother line plot.
plt.plot(x_grid, regressor.predict(x_grid), color="blue")
plt.title("Salary Guide (SVR)")
plt.xlabel("Position level")
plt.ylabel("Salary")
plt.show()
y = diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1234)

# Fit regression models
svr_lin = SVR(kernel='linear')
svr_rbf = SVR(kernel='rbf', gamma=0.1)
svr_poly = SVR(kernel='poly', degree=2)
y_lin = svr_lin.fit(X_train, y_train).predict(X_train)
y_rbf = svr_rbf.fit(X_train, y_train).predict(X_train)
y_poly = svr_poly.fit(X_train, y_train).predict(X_train)

print("Linear train error: ", mean_squared_error(y_train, y_lin),
      " test error: ", mean_squared_error(y_test, svr_lin.predict(X_test)))
print("RBF train error: ", mean_squared_error(y_train, y_rbf),
      " test error: ", mean_squared_error(y_test, svr_rbf.predict(X_test)))
print("Polynomial train error: ", mean_squared_error(y_train, y_poly),
      " test error: ", mean_squared_error(y_test, svr_poly.predict(X_test)))  # was svr_rbf: copy-paste bug

plt.figure(figsize=(20, 10))
plt.scatter(X_train[:, feature], y_train, color='darkorange', label='data')
plt.scatter(X_train[:, feature], y_lin, color='c', label='Linear model')
plt.scatter(X_train[:, feature], y_rbf, color='navy', label='RBF model')
plt.scatter(X_train[:, feature], y_poly, color='cornflowerblue', label='Polynomial model')
plt.scatter(X_svr, y_svr)
plt.show()

#%% SVR model implementation
# Split the "train" data into training and test sets to evaluate the algorithms
X_train, X_test, y_train, y_test = train_test_split(X_svr, y_svr, test_size=0.2)

# Define the algorithm to use
svr = SVR(kernel='linear', C=1.0, epsilon=0.2)
#svr = SVR()

# Train the model
svr.fit(X_train, y_train)

# Make a prediction
Y_pred = svr.predict(X_test)

#%% Model results
# Plot the data together with the model
plt.scatter(X_test, y_test)
plt.plot(X_test, Y_pred, color='red', linewidth=3)
plt.show()

print()
print('SUPPORT VECTOR REGRESSION MODEL RESULTS')
print()
print('Model score (R^2 on the training set):')
print(svr.score(X_train, y_train))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())

# Predicting a new result
# predict expects a scaled 2-D array, not a bare scalar like predict(6.5)
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(min(X), max(X), 0.01)  # 0.01 step instead of 0.1 because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
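# The manual scale -> fit -> inverse_transform dance above can be wrapped up;
# a minimal sketch (assuming the original, unscaled X and y) using sklearn's
# TransformedTargetRegressor, which un-scales predictions automatically:
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

model = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR(kernel='rbf')),
    transformer=StandardScaler())
model.fit(X, y)  # X, y in their original, unscaled units
print(model.predict(np.array([[6.5]])))  # already back in original units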
def main():
    horses98 = HorseParserNoHandicaps('./../Data/born98.csv').horses
    horses05 = HorseParserNoHandicaps('./../Data/born05.csv').horses
    races98 = RaceParserNoHandicaps('./../Data/born98.csv').races
    races05 = RaceParserNoHandicaps('./../Data/born05.csv').races

    print 'HorsesBorn98 Dataset'
    horses_train_98, horses_test_98 = split_dataset(horses98)

    horses_98_X_train = []
    horses_98_y_train = []
    for h in horses_train_98:
        v, s = compute_vector(h)
        horses_98_X_train.append(v)
        horses_98_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_98_X_train)
    print len(horses_98_y_train)
    print ''

    horses_98_X_test = []
    horses_98_y_test = []
    for h in horses_test_98:
        v, s = compute_vector(h)
        horses_98_X_test.append(v)
        horses_98_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_98_X_test)
    print len(horses_98_y_test)
    print ''

    print 'Create SVR object'
    # Create svr object
    svr98 = SVR(kernel='linear', C=1e3)  # , gamma=0.1

    print 'Training SVR'
    # Train the model using the training sets
    svr98.fit(horses_98_X_train, horses_98_y_train)

    print 'Predicting'
    horses_98_y_pred = svr98.predict(horses_98_X_test)

    # score() returns the R^2 coefficient of determination: 1 is perfect prediction
    print 'Variance score:'
    print svr98.score(horses_98_X_test, horses_98_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'R2 score:'
    print r2_score(horses_98_y_test, horses_98_y_pred)
    print ''
print(r2_score(Y, lin_reg2.predict(poly_reg.fit_transform(X))))

# SVR
from sklearn.preprocessing import StandardScaler
sc1 = StandardScaler()
sc2 = StandardScaler()
x_olcekli = sc1.fit_transform(X)
y_olcekli = sc2.fit_transform(Y)

from sklearn.svm import SVR
svr_reg = SVR(kernel='rbf')
svr_reg.fit(x_olcekli, y_olcekli)

print("SVR OLS:")
model3 = sm.OLS(svr_reg.predict(x_olcekli), x_olcekli)
print(model3.fit().summary())
print("SVR R-square value:")
# compare in the same (scaled) units; the original compared unscaled Y to scaled predictions
print(r2_score(y_olcekli, svr_reg.predict(x_olcekli)))

# Decision Tree
from sklearn.tree import DecisionTreeRegressor
dt_r = DecisionTreeRegressor(random_state=0)
dt_r.fit(X, Y)

print("Decision Tree OLS:")
model4 = sm.OLS(dt_r.predict(X), X)
print(model4.fit().summary())
print("Decision Tree R-square value:")
print(r2_score(Y, dt_r.predict(X)))
        (start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))

X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(
    days_since_1_22, turkey_cases, test_size=0.36, shuffle=False)

# In[6]:

# svm_confirmed = svm_search.best_estimator_
svm_confirmed = SVR(shrinking=True, kernel='poly', gamma=0.01, epsilon=1, degree=4, C=0.1)
svm_confirmed.fit(X_train_confirmed, y_train_confirmed)
svm_pred = svm_confirmed.predict(future_forcast)

# check against testing data
svm_test_pred = svm_confirmed.predict(X_test_confirmed)

# In[7]:

# transform our data for polynomial regression
poly = PolynomialFeatures(degree=3)
poly_X_train_confirmed = poly.fit_transform(X_train_confirmed)
poly_X_test_confirmed = poly.transform(X_test_confirmed)  # transform (not fit_transform) outside the training set
poly_future_forcast = poly.transform(future_forcast)

bayesian_poly = PolynomialFeatures(degree=4)
bayesian_poly_X_train_confirmed = bayesian_poly.fit_transform(X_train_confirmed)
# Shuffle the data
X, y = shuffle(data.data, data.target, random_state=7)

# Split the data into training and testing datasets
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Create Support Vector Regression model
sv_regressor = SVR(kernel='linear', C=1.0, epsilon=0.1)

# Train Support Vector Regressor
sv_regressor.fit(X_train, y_train)

# Evaluate performance of Support Vector Regressor
y_test_pred = sv_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
evs = explained_variance_score(y_test, y_test_pred)
print("\n#### Performance ####")
print("Mean squared error =", round(mse, 2))
ai02_3_url1 = round(mse, 2)
print("Explained variance score =", round(evs, 2))
ai02_3_url2 = round(evs, 2)

# Test the regressor on a single test datapoint
test_data = [3.7, 0, 18.4, 1, 0.87, 5.95, 91, 2.5052, 26, 666, 20.2, 351.34, 15.27]
print("\nPredicted price:", sv_regressor.predict([test_data])[0])
ai02_3_url3 = sv_regressor.predict([test_data])[0]
#%%
from sklearn.preprocessing import StandardScaler
sc1 = StandardScaler()
x_olcekli = sc1.fit_transform(X)
sc2 = StandardScaler()
y_olcekli = sc2.fit_transform(Y)

from sklearn.svm import SVR
svrReg = SVR(kernel='rbf')
svrReg.fit(x_olcekli, y_olcekli)

plt.scatter(x_olcekli, y_olcekli, color="red")
plt.plot(x_olcekli, svrReg.predict(x_olcekli), color="black")
plt.xlabel("SVR")
plt.show()

# scale the query the same way as the training data before predicting
print(svrReg.predict(sc1.transform(np.array([6.6]).reshape(-1, 1))))

#%% Decision Tree
from sklearn.tree import DecisionTreeRegressor
r_dt = DecisionTreeRegressor(random_state=0)
r_dt.fit(X, Y)
plt.scatter(X, Y, color="red")
plt.plot(X, r_dt.predict(X))
plt.show()
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y.reshape(-1, 1))

# SVR regression
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(x, y)

# prediction
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_x.transform(np.array([[6.5]]))))
print(y_pred)

# visualising the SVR result
plt.scatter(x, y, color='red')
plt.plot(x, regressor.predict(x), color='blue')
plt.title('SVR Model')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()

# more continuous graph
x_grid = np.arange(min(x), max(x), 0.1)
x_grid = x_grid.reshape(-1, 1)
plt.scatter(x, y, color='red')
plt.plot(x_grid, regressor.predict(x_grid), color='blue')