def train_SVM(self, data):
    """Fit a LinearSVR, report validation MAE, pickle the model and predict the test set.

    Args:
        data: Pair ``(train, validation)``; each element is itself an
            ``(x, y)`` pair.

    Returns:
        The fitted model's raw predictions for ``self.x_test``. Unlike the
        validation predictions, no ``np.expm1`` is applied to them here.
    """
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion

    print('Start training LinearSVR...')
    start_time = self.timer()
    svr = LinearSVR()
    svr.fit(x_tr, y_tr)
    print("The R2 is: {}".format(svr.score(x_tr, y_tr)))
    self.timer(start_time)

    print("Making prediction on validation data")
    # Both the targets and the predictions are mapped through expm1 before the
    # error is computed (presumably the targets were log1p-transformed
    # upstream -- confirm against the caller).
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(svr.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("El mean absolute error de es {}".format(mae))

    print('Saving model into a pickle')
    try:
        os.mkdir('pickles')
    except FileExistsError:
        # The directory is already there; any other OS error (permissions,
        # read-only file system, ...) is allowed to surface.
        pass
    with open('pickles/svrCV.pkl', 'wb') as f:
        pickle.dump(svr, f)

    print('Making prediction and saving into a csv')
    y_test = svr.predict(self.x_test)
    return y_test
class SVMWrapper:
    """Thin wrapper around ``LinearSVR`` that also records the fit duration."""

    def __init__(self, c=1.0, e=0.0, loss="epsilon_insensitive", dual=True, max_iter=1000):
        """Build the underlying regressor.

        Args:
            c: Forwarded as ``C``.
            e: Forwarded as ``epsilon``.
            loss: Forwarded as ``loss``.
            dual: Forwarded as ``dual``.
            max_iter: Forwarded as ``max_iter``.
        """
        self.regressor = LinearSVR(C=c, epsilon=e, loss=loss, dual=dual, max_iter=max_iter)
        # Seconds spent in the last ``train`` call; None until ``train`` has run.
        self.training_time = None

    def train(self, x_train, y_train):
        """Fit the regressor and store the elapsed wall-clock time."""
        start = time.perf_counter()
        self.regressor.fit(x_train, y_train)
        self.training_time = time.perf_counter() - start

    def score(self, x_test, y_test):
        """Return the regressor's ``score`` on the given data."""
        return self.regressor.score(x_test, y_test)

    def predict(self, x_test):
        """Return predictions for ``x_test``."""
        return self.regressor.predict(x_test)

    def predict_one(self, x_single):
        """Return predictions for ``x_single``; it is passed to ``predict`` unchanged."""
        return self.regressor.predict(x_single)

    def get_training_time(self):
        """Return the duration of the last ``train`` call in seconds.

        Raises:
            ValueError: If ``train`` has not been called yet.
        """
        if self.training_time is None:
            raise ValueError("training time is unavailable: train() has not been called")
        return self.training_time
def LinearSVRRegressor(X_train, X_test, y_train, y_test):
    """Train one LinearSVR per target column, then report and log the metrics.

    Columns 0 and 1 of ``y_train`` are fitted independently with identical
    hyper-parameters. Metrics are printed and collected first for the
    ``X_test``/``y_test`` pair and then for the training data, and both
    models are handed to ``logSave``.
    """
    svr_options = dict(epsilon=0.001, max_iter=5000, C=3,
                       loss='squared_epsilon_insensitive')

    first_target = y_train[:, 0]
    second_target = y_train[:, 1]

    # Fit order (column 0, then column 1) is kept as in the original.
    first_model = LinearSVR(**svr_options)
    first_model.fit(X_train, first_target)
    second_model = LinearSVR(**svr_options)
    second_model.fit(X_train, second_target)

    def predict_both(features):
        # Stack the two per-column predictions side by side as an (n, 2) array.
        left = first_model.predict(X=features).reshape(-1, 1)
        right = second_model.predict(X=features).reshape(-1, 1)
        return np.hstack((left, right))

    held_out_pred = predict_both(X_test)
    printMetrics(y_true=y_test, y_pred=held_out_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=held_out_pred)

    train_pred = predict_both(X_train)
    metrics = getMetrics(y_true=y_train, y_pred=train_pred)
    printMetrics(y_true=y_train, y_pred=train_pred)

    logSave(nameOfModel="LinearSVRRegressor",
            reg=[first_model, second_model],
            metrics=metrics,
            val_metrics=val_metrics)
class LSVR:
    """LinearSVR baseline that fits on flattened windows of ``n_time`` values."""

    def __init__(self):
        super(LSVR, self).__init__()
        self.C = 0.1
        # Window length: used both when retrieving the data and when
        # flattening it into rows of ``n_time`` features.
        self.n_time = 5
        self.model = LinearSVR(C=self.C)

    def fit(self, train_x, train_y):
        """Fit the underlying LinearSVR."""
        self.model.fit(train_x, train_y)

    def predict(self, test_x):
        """Return the underlying model's predictions for ``test_x``."""
        return self.model.predict(test_x)

    def eval(self, out_time, v_path, w_path):
        """Load data through ``Helper``, fit on the train split and report test metrics.

        Args:
            out_time: Forwarded to ``Helper.retrieve_data``.
            v_path: Forwarded to ``Helper.retrieve_data``.
            w_path: Forwarded to ``Helper.retrieve_data``.
        """
        # Use self.n_time here as well: the reshape below assumes rows of
        # exactly self.n_time values, so the two must never drift apart.
        train_x, train_y, test_x, test_y = Helper.retrieve_data(
            n_time=self.n_time, out_time=out_time, train_pct=0.7,
            test_pct=0.2, v_path=v_path, w_path=w_path)

        # Swap axes 1 and 2, drop singleton dimensions and flatten to
        # (-1, n_time). NOTE(review): the meaning of the four input axes is
        # defined by Helper.retrieve_data -- confirm there.
        train_x = np.squeeze(train_x.transpose(
            (0, 2, 1, 3))).reshape(-1, self.n_time)
        test_x = np.squeeze(test_x.transpose(
            (0, 2, 1, 3))).reshape(-1, self.n_time)
        train_y = train_y.reshape(-1)
        test_y = test_y.reshape(-1)

        print("LSVR Fitting...")
        self.model.fit(train_x, train_y)
        print("LSVR Fitted!")

        y_pred = self.model.predict(test_x)
        Helper.metrics(y_pred, test_y)
def LinearSVRRegressorGS(X_train, X_test, y_train, y_test):
    """Grid-search one LinearSVR per target column, then evaluate, log and save.

    Columns 0 and 1 of ``y_train`` each get their own ``GridSearchCV`` run
    (2-fold, refit on the best ``r2``). Metrics are printed and collected for
    the ``X_test``/``y_test`` pair and for the training data; the best
    parameters of both searches are stored via ``saveBestParams`` and the two
    estimators via ``logSave``.
    """
    grid_values = {
        'epsilon': list(range(1, 3)) + [value * 0.01 for value in range(1, 3)],
        'C': [value * 0.01 for value in range(1, 3)],
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
    }

    def _search(target):
        # Run one grid search for a single target column.
        search = GridSearchCV(
            LinearSVR(),
            param_grid=grid_values,
            scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
            refit='r2',
            n_jobs=-1,
            cv=2,
            verbose=100)
        search.fit(X_train, target)
        return search

    grid_reg1 = _search(y_train[:, 0])
    grid_reg2 = _search(y_train[:, 1])

    # Each model must come from its OWN search. Taking both from grid_reg1
    # made reg1 and reg2 the same object, so fitting the second target
    # overwrote the first model. With refit='r2' the best estimators are
    # already refitted on the full training data, so no extra fit is needed.
    reg1 = grid_reg1.best_estimator_
    reg2 = grid_reg2.best_estimator_

    def _predict_both(features):
        # Stack the two per-column predictions into an (n, 2) array.
        return np.hstack((reg1.predict(X=features).reshape(-1, 1),
                          reg2.predict(X=features).reshape(-1, 1)))

    y_pred = _predict_both(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    y_pred = _predict_both(X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    best_params1: dict = grid_reg1.best_params_
    best_params2: dict = grid_reg2.best_params_
    # For every hyper-parameter keep the pair [value for column 0, value for column 1].
    best_params = {key: [best_params1[key], best_params2[key]]
                   for key in best_params1}

    saveBestParams(nameOfModel="LinearSVRRegressorGS", best_params=best_params)
    logSave(nameOfModel="LinearSVRRegressorGS", reg=[reg1, reg2],
            metrics=metrics, val_metrics=val_metrics)
def fitSVR(self, X, Y, name, lastX=None):
    """Pick (epsilon, C) on one fold, then train the remaining SVR models in parallel.

    The list stored on ``self`` under ``name`` is created if missing and
    extended in place, so models already present are not retrained (training
    starts at index ``len(list)``).

    Args:
        X: Feature array, indexed by row.
        Y: Target array; column 24 is the one used for the hyper-parameter search.
        name: Attribute name on ``self`` holding the list of trained models.
        lastX: Optional array whose column 0 is added to both the prediction
            and the target before scoring.

    Raises:
        ValueError: If no (epsilon, C) candidate scores below the initial
            threshold of 3, so no parameters could be selected.
    """
    if not hasattr(self, name):
        SVR = []
        setattr(self, name, SVR)
    else:
        SVR = getattr(self, name)

    epsilon_options = [0, 0.1, 10, 100]
    C_options = [0.1, 10, 100]
    Xselect = 30000  # cap on the number of training rows used

    # Only the first of the five shuffled folds is used.
    kf = KFold(n_splits=5, shuffle=True)
    fold_train, fold_test = next(iter(kf.split(X)))
    train_index, test_index = fold_train[:Xselect], fold_test

    bestscore = 3
    bestarg = None
    for epsilon in epsilon_options:
        for C in C_options:
            logging.info("SVR trying %f %f", epsilon, C)
            model = LinearSVR(epsilon=epsilon, C=C)
            model.fit(X[train_index], Y[train_index][:, 24])
            if lastX is None:
                predY = model.predict(X[test_index])
                score = calSMAPE1(Y[test_index][:, 24], predY)
            else:
                predY = lastX[test_index][:, 0] + model.predict(X[test_index])
                score = calSMAPE1(lastX[test_index][:, 0] + Y[test_index][:, 24], predY)
            if score < bestscore:
                bestscore = score
                bestarg = (epsilon, C)
            logging.info("SVR try %f %f, score %f", epsilon, C, score)

    if bestarg is None:
        # Previously this surfaced as an opaque "cannot unpack NoneType" error.
        raise ValueError(
            "SVR hyper-parameter search found no candidate with score below %r"
            % (bestscore,))
    epsilon, C = bestarg
    logging.info("SVR best %f %f, bestscore %f", epsilon, C, bestscore)

    # Published as a module global for train_SVR (presumably read inside the
    # pool workers -- verify against train_SVR).
    global SVRargs
    SVRargs = (X[train_index], Y[train_index], epsilon, C)

    # NOTE(review): the per-group log and save are placed inside the loop;
    # confirm against the original indentation.
    for idx in self.divide(list(range(len(SVR), Y.shape[1])), 18):
        with mp.Pool(6) as pool:
            SVR += pool.map(train_SVR, idx)
        logging.info("SVR group %d", idx[0])
        self.saveModule(name, False)
    logging.info("SVR ok")
def test_linear_svr_evaluation(self):
    """
    Check that the evaluation results are the same in scikit learn and coremltools
    """
    # One LinearSVR configuration per entry; {} means library defaults.
    ARGS = [
        {},
        {"C": 0.5, "epsilon": 0.25},
        {"dual": False, "loss": "squared_epsilon_insensitive"},
        {"tol": 0.005},
        {"fit_intercept": False},
        {"intercept_scaling": 1.5},
    ]

    input_names = self.scikit_data.feature_names
    df = pd.DataFrame(self.scikit_data.data, columns=input_names)

    for cur_args in ARGS:
        print(cur_args)
        cur_model = LinearSVR(**cur_args)
        cur_model.fit(self.scikit_data["data"], self.scikit_data["target"])
        spec = convert(cur_model, input_names, "target")

        # The scikit-learn predictions are the reference column the converted
        # spec is evaluated against.
        df["prediction"] = cur_model.predict(self.scikit_data.data)

        metrics = evaluate_regressor(spec, df)
        # assertAlmostEquals is a deprecated alias that newer Python versions
        # no longer provide; assertAlmostEqual is the supported spelling.
        self.assertAlmostEqual(metrics["max_error"], 0)
def test_svm_model(train_X, train_y, dev_X, dev_y):
    """Fit a default LinearSVR on the training data and print the RMSE on the dev data."""
    print('Testing svm model...')
    from sklearn.svm import LinearSVR

    regressor = LinearSVR()
    regressor.fit(train_X, train_y)
    dev_predictions = regressor.predict(dev_X)
    dev_mse = mean_squared_error(dev_y, dev_predictions)
    print('RMSE: {}'.format(math.sqrt(dev_mse)))
def try_Cs(X, y, cv, Cs):
    """Return the value of ``C`` with the lowest mean validation MSE.

    Args:
        X: Feature array, indexable by an index array.
        y: Target array, indexable by an index array.
        cv: Iterable of ``(train_idx, val_idx)`` pairs. It may be a one-shot
            generator; it is materialised once and reused for every ``C``.
        Cs: Iterable of candidate ``C`` values.

    Returns:
        The ``C`` of the smallest ``(mean rounded to 3 decimals, std, C)``
        tuple, so ties on the rounded mean fall back to the lower standard
        deviation and then the lower ``C``.

    Raises:
        ValueError: If ``Cs`` is empty (raised by ``min``).
    """
    # Materialise the folds: iterating ``cv`` directly inside the loop over
    # Cs would leave every C after the first with no folds when ``cv`` is a
    # generator.
    folds = list(cv)

    results = []
    for C in Cs:
        t0 = time()
        scores = []
        for train_idx, val_idx in folds:
            svm = LinearSVR(C=C, loss='squared_epsilon_insensitive', dual=False,
                            random_state=1)
            svm.fit(X[train_idx], y[train_idx])
            y_pred = svm.predict(X[val_idx])
            # Predictions are clamped to [0, 1] before scoring.
            y_pred[y_pred < 0] = 0.0
            y_pred[y_pred > 1] = 1.0
            # This is the plain mean squared error (no square root is taken).
            mse = mean_squared_error(y[val_idx], y_pred)
            scores.append(mse)

        m = np.mean(scores)
        s = np.std(scores)
        print('C=%s, took %.3fs, mse=%.3f+-%.3f' % (C, time() - t0, m, s))
        results.append((m.round(3), s, C))

    _, _, best_C = min(results)
    return best_C
class LinearSVRPrim(primitive):
    """Pipeline primitive wrapping scikit-learn's ``LinearSVR``."""

    def __init__(self, random_state=0):
        """Create the primitive.

        Args:
            random_state: Seed stored on the primitive and passed to the
                underlying ``LinearSVR``.
        """
        super(LinearSVRPrim, self).__init__(name='LinearSVR')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "We make use of the epsilon-insensitive loss, i.e. errors of less than epsilon are ignored. This is the form that is directly optimized by LinearSVR."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        # The seed used to be stored but never handed to the model, so the
        # ``random_state`` argument had no effect on fitting.
        self.model = LinearSVR(random_state=random_state)
        self.accept_type = 'c_r'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for 'Regression'."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Always report the primitive as needed."""
        return True

    def fit(self, data):
        """Fit the model on ``data['X']`` / ``data['Y']`` after ``handle_data``."""
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        """Predict and return the result as ``{0: output}``.

        ``output['predictions']`` holds the raw predictions and ``output['X']``
        is replaced by a one-column DataFrame named ``<name>Pred``.
        """
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
def mse_of_linear_svr(X, y, epsilon):
    """Return the test-set mean squared error of a linear SVR with the given epsilon.

    The data is split 80/20 (``random_state=5``), standardised with a
    ``StandardScaler`` fitted on the training part only, and a
    ``LinearSVR(epsilon=epsilon, random_state=5)`` is fitted on the scaled
    training data.

    Args:
        X: (n, d) numpy array of n sample points with d features each.
        y: (n, ) numpy array of label values.
        epsilon: scalar epsilon hyper-parameter of the linear SVR.

    Returns:
        mse: scalar mean squared error on the test split.
    """
    regressor = LinearSVR(epsilon=epsilon, random_state=5)

    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=0.2, random_state=5)

    # Scaling statistics come from the training split; the test split is
    # only transformed.
    standardizer = StandardScaler()
    features_train = standardizer.fit_transform(features_train)
    features_test = standardizer.transform(features_test)

    regressor.fit(features_train, labels_train)
    return mean_squared_error(labels_test, regressor.predict(features_test))
def train_svr(X, y, plot=False, linear=False):
    """
    Train an SVR model on the full dataset and report its errors.

    :param X: X of the current dataset
    :param y: target of the current dataset
    :param plot: if true, the residuals of the training predictions are plotted.
    :param linear: if true a LinearSVR is trained, otherwise an Epsilon-SVR (SVR).
    :return: trained SVR model
    """
    print("Training SVR Model")

    model_class = LinearSVR if linear else SVR
    # Two separate instances: one is handed to the error estimation, the
    # other is the model that is fitted and returned.
    estimator = model_class()
    model = model_class()
    model_name = type(estimator).__name__

    estimated_test_error = estimate_test_error(estimator, X, y)
    print("Estimated test error for {} model : {}".format(
        model_name, estimated_test_error))

    model.fit(X, y)
    train_predictions = model.predict(X)
    train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
    print("Training error for {} model : {}".format(model_name, train_rmse))

    if plot:
        plot_residuals(train_predictions, y, model_name)

    return model
def GlobalRegression(local_binary_features, targets):
    """Fit two LinearSVRs (dx and dy) per landmark and return their training predictions.

    Returns:
        ``(updates, svrs)`` where ``updates`` has shape
        ``(len(targets), param_landmark_num, 2)`` and holds the predictions on
        the training features, and ``svrs`` is a list of ``[svr_x, svr_y]``
        pairs, one per landmark.
    """
    started = time.time()
    sample_count = len(targets)
    updates = np.zeros((sample_count, param_landmark_num, 2))
    svrs = []

    for landmark in range(param_landmark_num):
        pair = []
        # Coordinate 0 is dx, coordinate 1 is dy; dx is fitted first.
        for coord in (0, 1):
            regressor = LinearSVR(C=1./sample_count, dual=True,
                                  loss='squared_epsilon_insensitive',
                                  epsilon=0.0001)
            regressor.fit(local_binary_features, targets[:, landmark, coord])
            updates[:, landmark, coord] = regressor.predict(local_binary_features)
            pair.append(regressor)
        svrs.append(pair)

    print('Global Regression use:', time.time()-started, 's')
    return updates, svrs
def test_data_truth():
    """A default LinearSVR must reach R2 > 0.9 on noise-free generated regression data."""
    rng = check_random_state(1337)
    features, targets = genRegressionData(
        n_samples=100,
        n_features=10,
        n_redundant=0,
        n_strel=2,
        n_repeated=0,
        random_state=rng,
        noise=0,
    )
    features = StandardScaler().fit_transform(features)

    # The same RandomState instance drives both generation and the split.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, random_state=rng)

    model = LinearSVR()
    model.fit(X_train, y_train)
    score = r2_score(y_test, model.predict(X_test))

    assert score > 0.9
def test_linear_svr_evaluation(self):
    """
    Check that the evaluation results are the same in scikit learn and coremltools
    """
    # One LinearSVR configuration per entry; {} means library defaults.
    ARGS = [{}, {
        'C': 0.5,
        'epsilon': 0.25
    }, {
        'dual': False,
        'loss': 'squared_epsilon_insensitive'
    }, {
        'tol': 0.005
    }, {
        'fit_intercept': False
    }, {
        'intercept_scaling': 1.5
    }]

    input_names = self.scikit_data.feature_names
    df = pd.DataFrame(self.scikit_data.data, columns=input_names)

    for cur_args in ARGS:
        print(cur_args)
        cur_model = LinearSVR(**cur_args)
        cur_model.fit(self.scikit_data['data'], self.scikit_data['target'])
        spec = convert(cur_model, input_names, 'target')

        # The scikit-learn predictions are the reference column the converted
        # spec is evaluated against.
        df['prediction'] = cur_model.predict(self.scikit_data.data)

        metrics = evaluate_regressor(spec, df)
        # assertAlmostEquals is a deprecated alias that newer Python versions
        # no longer provide; assertAlmostEqual is the supported spelling.
        self.assertAlmostEqual(metrics['max_error'], 0)
class TestLinearSVRIntegration(TestCase):
    """Integration tests comparing ``PMMLLinearSVR`` with scikit-learn's ``LinearSVR``."""

    def setUp(self):
        """Load the categorical test data and build the PMML model and the reference model."""
        df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
        # First column is the target, the remaining columns are the features.
        Xte = df.iloc[:, 1:]
        Xenc = pd.get_dummies(Xte, prefix_sep='')
        yte = df.iloc[:, 0]
        self.test = (Xte, yte)
        self.enc = (Xenc, yte)
        pmml = path.join(BASE_DIR, '../models/linear-model-lm.pmml')
        self.clf = PMMLLinearSVR(pmml)
        # Reference model, fitted on the one-hot features against the boolean
        # target "label == 'Yes'".
        self.ref = LinearSVR()
        self.ref.fit(Xenc, yte == 'Yes')

    def test_invalid_model(self):
        """A PMML document without a RegressionModel must be rejected with a clear message."""
        with self.assertRaises(Exception) as cm:
            PMMLLinearSVR(pmml=StringIO(""" <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3"> <DataDictionary> <DataField name="Class" optype="categorical" dataType="string"> <Value value="setosa"/> <Value value="versicolor"/> <Value value="virginica"/> </DataField> </DataDictionary> <MiningSchema> <MiningField name="Class" usageType="target"/> </MiningSchema> </PMML> """))

        assert str(cm.exception) == 'PMML model does not contain RegressionModel.'

    def test_fit_exception(self):
        """Calling ``fit`` on the PMML-backed model must raise 'Not supported.'."""
        with self.assertRaises(Exception) as cm:
            self.clf.fit(np.array([[]]), np.array([]))

        assert str(cm.exception) == 'Not supported.'

    def test_more_tags(self):
        """The PMML model must expose the same ``_more_tags`` as a plain ``LinearSVR``."""
        assert self.clf._more_tags() == LinearSVR()._more_tags()

    def test_sklearn2pmml(self):
        """Round-trip: export the reference model with sklearn2pmml, re-import it, compare predictions."""
        # Export to PMML
        pipeline = PMMLPipeline([("classifier", self.ref)])
        pipeline.fit(self.enc[0], self.enc[1] == 'Yes')
        sklearn2pmml(pipeline, "svm-sklearn2pmml.pmml", with_repr=True)

        try:
            # Import PMML
            model = PMMLLinearSVR(pmml='svm-sklearn2pmml.pmml')

            # Verify that the imported model predicts the same values as the reference
            Xenc, _ = self.enc
            assert np.allclose(self.ref.predict(Xenc), model.predict(Xenc))

        finally:
            # Always delete the exported file, even when the comparison fails.
            remove("svm-sklearn2pmml.pmml")
def linear_svm_regression():
    """Plot two LinearSVR fits (epsilon 1.5 and 0.5) on the same noisy linear data, side by side."""
    np.random.seed(42)
    m = 50
    X = 2 * np.random.rand(m, 1)
    y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

    wide_margin = LinearSVR(epsilon=1.5, random_state=42)
    narrow_margin = LinearSVR(epsilon=0.5, random_state=42)
    models = (wide_margin, narrow_margin)

    for regressor in models:
        regressor.fit(X, y)
    # Attach the support-vector result to each model under ``support_``.
    for regressor in models:
        regressor.support_ = find_support_vectors(regressor, X, y)

    # Position at which the epsilon arrow is drawn in the left panel.
    eps_x1 = 1
    eps_y_pred = wide_margin.predict([[eps_x1]])

    axis_limits = [0, 2, 3, 11]
    plt.figure(figsize=(9, 4))

    # Left panel: wide margin, with the epsilon annotation.
    plt.subplot(121)
    plot_svm_regression(wide_margin, X, y, axis_limits)
    plt.title(r"$\epsilon = {}$".format(wide_margin.epsilon), fontsize=18)
    plt.ylabel(r"$y$", fontsize=18, rotation=0)
    arrow_style = {'arrowstyle': '<->', 'linewidth': 1.5}
    plt.annotate(
        '',
        xy=(eps_x1, eps_y_pred),
        xycoords='data',
        xytext=(eps_x1, eps_y_pred - wide_margin.epsilon),
        textcoords='data',
        arrowprops=arrow_style,
    )
    plt.text(0.91, 5.6, r"$\epsilon$", fontsize=20)

    # Right panel: narrow margin.
    plt.subplot(122)
    plot_svm_regression(narrow_margin, X, y, axis_limits)
    plt.title(r"$\epsilon = {}$".format(narrow_margin.epsilon), fontsize=18)

    plt.show()
def outlier_linearSVR_detector(feature, target, residual_threshold, return_index = False):
    """
    Flag samples whose LinearSVR residual exceeds a fraction of the target range.

    ``residual_threshold`` is a fraction: it is multiplied by
    ``max(target) - min(target)`` to get the absolute cut-off. A sample is an
    outlier when the absolute difference between its prediction and its
    target is strictly larger than that cut-off.

    Returns ``(number_of_outliers, first_coefficient)`` when ``return_index``
    is exactly ``False``, otherwise the list of outlier positions.
    """
    target = (np.array(target)).flatten()
    cutoff = (np.max(target) - np.min(target)) * residual_threshold

    regr = LinearSVR(random_state=1, dual=True, epsilon=0.0)
    regr.fit(feature, target)
    predictions = regr.predict(feature)

    outlier_index = [
        position
        for position, predicted in enumerate(predictions)
        if abs(predicted - target[position]) > cutoff
    ]
    slope = regr.coef_[0]

    if return_index is not False:
        return outlier_index
    return (len(outlier_index), slope)
async def do_run_async(self):
    """Demo: fit a polynomial-kernel SVR and a LinearSVR on polynomial features.

    Generates 100 noisy samples of a quadratic, plots them, fits both models
    and prints the prediction and the label for one randomly chosen sample
    per model.
    """
    # Generate some non-linear data based on a quadratic equation
    m = 100
    X = 6 * np.random.uniform(1, 5, (m, 1)) - 3
    y = 0.5 * X**2 + X + 2 + np.random.uniform(1, 5, (m, 1))
    plt.plot(X, y, ".")
    plt.show()

    # scikit-learn regressors expect a 1-D target; y itself stays a column
    # vector so the printed labels keep their original form.
    y_flat = y.ravel()

    # To tackle nonlinear regression tasks, you can use a kernelized SVM model
    svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
    svm_poly_reg.fit(X, y_flat)

    # randint's upper bound is exclusive: using m lets every sample,
    # including the last one, be picked.
    rand_index = np.random.randint(0, m)
    x = X[rand_index, ]
    print("Prediction for:", x)
    print(svm_poly_reg.predict([x]))
    print("Label:", y[rand_index, ])

    # ... or just use the Linear SVR algorithm with polynomial features
    polly = PolynomialFeatures(
        degree=2)  # Polynomial degree is usually number of features + 1?
    X_tr = polly.fit_transform(X)
    svm_reg = LinearSVR(epsilon=1.5)
    svm_reg.fit(X_tr, y_flat)

    rand_index = np.random.randint(0, m)
    x = X_tr[rand_index, ]
    print("Prediction for:", x)
    print(svm_reg.predict([x]))
    print("Label:", y[rand_index, ])
def build_svr(params=None):
    """Train a LinearSVR on the house-price data and write the submission CSV.

    Train and test features are concatenated so that the feature engineering
    is applied to both at once, then split back by row count. The target is
    ``log1p(SalePrice)`` and predictions are mapped back with ``expm1``.

    :param params: keyword arguments for ``LinearSVR``; when None they are
        obtained from ``tuning``.
    """
    train_df, test_df = load_data()
    feature_span = slice('MSSubClass', 'SaleCondition')
    combined_df = pd.concat((train_df.loc[:, feature_span],
                             test_df.loc[:, feature_span]))

    # feature engineering
    config_categorical_features(combined_df)
    log_transform_features(combined_df)
    combined_df = normalize_numerical_features(combined_df)
    combined_df = one_hot_encoding(combined_df)
    missing_value_fill(combined_df)

    n_train_rows = train_df.shape[0]
    X_train = combined_df[:n_train_rows]
    X_test = combined_df[n_train_rows:]
    y = np.log1p(train_df["SalePrice"])

    if params is None:
        params = tuning(X_train, y)

    # model training
    model = LinearSVR(**params)
    model.fit(X_train, y)
    fold_mse = -cross_val_score(model, X_train, y, cv=3,
                                scoring="neg_mean_squared_error")
    print("cross_validation_rmse:", np.mean(np.sqrt(fold_mse)))

    # model prediction
    predicted_prices = np.expm1(model.predict(X_test))
    solution = pd.DataFrame({"id": test_df.Id, "SalePrice": predicted_prices})
    solution.to_csv("./house_price/submission_svr_v1.csv", index=False)
def svr_C(train_features, train_labels, test_features, test_labels, name):
    """
    Plot the train and test accuracy of a LinearSVR against its C parameter.

    Ten values of C, evenly spaced between 1e-4 and 1, are evaluated. The
    figure is saved as ``./Figures/C_<name>.pdf`` when that directory exists.
    """
    sns.set()
    sns.set_style("ticks")

    c_values = np.linspace(1e-4, 1, 10)

    train_scaled, scaler = ml_funcs.apply_scaling(train_features, 'SVR', name,
                                                  save_scaler=False)
    test_scaled = scaler.transform(test_features)

    train_results = []
    test_results = []
    for c_val in c_values:
        print("C:", c_val)
        model = LinearSVR(C=c_val, max_iter=2000, random_state=0)
        model.fit(train_scaled, train_labels)

        # Accuracy on the training data.
        train_results.append(
            compute_accuracy(model.predict(train_scaled), train_labels))
        # Accuracy on the test data.
        test_results.append(
            compute_accuracy(model.predict(test_scaled), test_labels))

    fig = plt.figure(figsize=(10, 6))
    for series, label in ((train_results, 'Train'), (test_results, 'Test')):
        sns.lineplot(x=c_values, y=series, label=label)
    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('C')
    plt.ylabel('Accuracy score [%]')
    fig.tight_layout()
    sns.despine()

    if not generate_plots.directory_exists("./Figures"):
        print("Directory: ./Figures does not exist!")
    else:
        plt.savefig("./Figures/C_" + name + ".pdf", bbox_inches="tight",
                    dpi=300, transparent=True)
def linearSVR(data):
    """Fit a LinearSVR to predict ``price`` and print the R2 score on a 10% hold-out.

    Args:
        data: DataFrame containing ``price`` plus the columns dropped below.
    """
    # Columns excluded from the features (identifiers, the target itself and
    # the fields the author chose not to use).
    excluded = ["id", "date", "price", "long", "lat", "zipcode",
                "yr_renovated", "sqft_above", "sqft_basement"]
    X = data.drop(excluded, axis=1)
    y = data["price"]

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)

    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)

    # Single-argument print(...) produces the same output on Python 2 and 3;
    # the bare print statement is a syntax error on Python 3.
    print("r2-score for LinearSVR: %f" % r2_score(y_test, y_predict))
def innerfold_svr(x_test, y_test, x_train, y_train):
    """Fit a LinearSVR on the training fold and return the RMSE on the test fold.

    Note the argument order: the test pair comes before the training pair.
    The RMSE is also printed.
    """
    # Named for what it is: LinearSVR is a linear model, not an RBF kernel one.
    linear_svr = LinearSVR(random_state=4)
    linear_svr.fit(x_train, y_train)
    pred_y = linear_svr.predict(x_test)

    mse = mean_squared_error(y_test, pred_y)
    rmse = math.sqrt(mse)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(rmse)
    return rmse
def regressor_test(complete,incomplete,years):
    """Compare KNN, linear regression and LinearSVR per target column and return the most frequent winner.

    For every column name in ``years[0]`` that column of ``complete`` is the
    target and all other columns are the features. The three regressors are
    fitted on an 80/20 split and their test MSE is recorded. The model with
    the lowest MSE wins the column (ties go to KNN, then Linear, then SVR).

    Args:
        complete: DataFrame without missing values.
        incomplete: Unused here; kept so existing callers keep working.
        years: Sequence whose first element lists the target column names.

    Returns:
        "KNN", "Linear" or "SVR" -- whichever won the most columns.
    """
    kn_errors = []
    linear_errors = []
    svr_errors = []

    for i in years[0]:
        X_train, X_test, y_train, y_test = train_test_split(
            complete.loc[:, complete.columns != i].values,
            complete.loc[:, i].values,
            test_size=0.2, random_state=0)

        regressor1 = KNeighborsRegressor(2, weights='distance', metric='euclidean')
        regressor2 = LinearRegression()
        regressor3 = LinearSVR()
        regressor1.fit(X_train, y_train)
        regressor2.fit(X_train, y_train)
        regressor3.fit(X_train, y_train)

        # The original also deep-copied and mean-filled ``incomplete`` on
        # every iteration and never used the result; that work is dropped.

        kn_errors.append(mean_squared_error(y_test, regressor1.predict(X_test)))
        linear_errors.append(mean_squared_error(y_test, regressor2.predict(X_test)))
        svr_errors.append(mean_squared_error(y_test, regressor3.predict(X_test)))

    # Test for checking the best model. Iterate over the errors actually
    # collected instead of a hard-coded '2007':'2017' column slice, which
    # only matched when years[0] happened to have that many entries.
    MSE = []
    for kn_err, lin_err, svr_err in zip(kn_errors, linear_errors, svr_errors):
        lowest = min([kn_err, lin_err, svr_err])
        if lowest == kn_err:
            MSE.append("KNN")
        elif lowest == lin_err:
            MSE.append("Linear")
        elif lowest == svr_err:
            MSE.append("SVR")

    print("KNN =",MSE.count("KNN"),'\nLinear =',MSE.count("Linear") ,'\nSVR =',MSE.count("SVR"))
    return max(set(MSE), key = MSE.count)
def svm_regressor(train_data, train_label, test_data, test_label, parameters):
    """Fit a LinearSVR, print its test MSE and save an actual-vs-predicted plot.

    Args:
        train_data: Training features.
        train_label: Training targets (a pandas Series).
        test_data: Test features.
        test_label: Test targets (a pandas Series); its index is reused for
            the predictions.
        parameters: Text used in the printed line, the plot title and the
            output file name ``./svm/<parameters>.png``.
    """
    regr = LinearSVR(C=0.001, epsilon=1, random_state=random_state)
    regr.fit(train_data, train_label)

    # Column vector of predictions. Built with reshape instead of
    # np.array(map(...)), which only yields the intended array on Python 2.
    predict = np.array(regr.predict(test_data)).reshape(-1, 1)

    mse = MSE(predict, test_label)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('MSE ' + parameters + ' ' + str(mse[0]))

    df = pd.Series(predict.flatten(), index=test_label.index)
    # pd.concat replaces Series.append, which newer pandas no longer provides.
    price = pd.concat([train_label, test_label])

    plt.title('SVM Regression on ' + parameters)
    # NOTE(review): the first 1000 points and the last one are left out of
    # the "actual" curve -- the reason is not visible here.
    plt.plot(price[1000:-1], label='actual price')
    plt.plot(df, label='predicted price')
    plt.legend(loc='lower right')
    plt.xlabel('Dates')
    plt.ylabel('Price')

    directory = './svm/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(directory + parameters + '.png')
    plt.close()
    return
def linear_svr_pred(X_train, Y_train):
    """
    Fit a Support Vector Regression linear model and return its predictions
    on the same training features.
    """
    regressor = LinearSVR(random_state=RANDOM_STATE)
    regressor.fit(X_train, Y_train)
    return regressor.predict(X_train)
class LibLinear_SVR:
    """Wrapper that normalises hyper-parameter types before building a ``LinearSVR``."""

    # Liblinear is not deterministic as it uses a RNG inside
    def __init__(self, epsilon, loss, dual, tol, C, fit_intercept,
                 intercept_scaling, random_state=None):
        # Hyper-parameters are stored as given; they are converted in fit().
        self.epsilon = epsilon
        self.loss = loss
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, Y):
        """Convert the stored hyper-parameters, build the estimator and fit it.

        Returns ``self``.
        """
        from sklearn.svm import LinearSVR

        # In case of nested loss: a dict carries both 'loss' and 'dual'.
        if isinstance(self.loss, dict):
            nested = self.loss
            self.loss = nested['loss']
            self.dual = nested['dual']

        self.epsilon = float(self.epsilon)
        self.C = float(self.C)
        self.tol = float(self.tol)
        self.dual = check_for_bool(self.dual)
        self.fit_intercept = check_for_bool(self.fit_intercept)
        self.intercept_scaling = float(self.intercept_scaling)

        settings = dict(
            epsilon=self.epsilon,
            loss=self.loss,
            dual=self.dual,
            tol=self.tol,
            C=self.C,
            fit_intercept=self.fit_intercept,
            intercept_scaling=self.intercept_scaling,
            random_state=self.random_state,
        )
        self.estimator = LinearSVR(**settings)
        self.estimator.fit(X, Y)
        return self

    def predict(self, X):
        """Return predictions; raises ``NotImplementedError`` before ``fit``."""
        if self.estimator is not None:
            return self.estimator.predict(X)
        raise NotImplementedError()
class SVMRegression(object):
    """LinearSVR bound to one dataset, with helpers to find and plot off-margin points."""

    def __init__(self, X, y, epsilon, **kwargs):
        self.X = X
        self.y = y
        self.epsilon = epsilon
        self.model = LinearSVR(epsilon=epsilon, **kwargs)

    def train_model(self):
        """Fit on the stored data and cache the predictions on it."""
        self.model.fit(self.X, self.y)
        self.epsilon = self.model.epsilon
        self.y_pred = self.model.predict(self.X)

    def get_support_vectors(self):
        """
        Get the index of points which is off the street

        A point counts when its absolute residual is at least epsilon.
        """
        residual = np.abs(self.y - self.y_pred)
        self.if_off_margin = (residual >= self.epsilon)
        self.idx_support_ = np.argwhere(self.if_off_margin)
        return self.idx_support_

    def model_predict(self, x_new):
        """Return the fitted model's predictions for ``x_new``."""
        return self.model.predict(x_new)

    def plot_svm_regression(self, axes):
        """
        Plot SVM Regression

        ``axes`` is ``[x_min, x_max, y_min, y_max]``; the first two entries
        also set the range over which the prediction line is drawn.
        """
        grid = np.linspace(axes[0], axes[1], 100).reshape(100, 1)
        centre = self.model.predict(grid)
        upper = centre + self.epsilon
        lower = centre - self.epsilon

        plt.plot(grid, centre, "k-", linewidth=2, label="Prediction of y")
        plt.plot(grid, upper, "r--", label="Upper Bound")
        plt.plot(grid, lower, "g--", label="Lower Bound")

        # Highlight the points found by get_support_vectors().
        support = self.idx_support_
        plt.scatter(self.X[support], self.y[support],
                    s=180, facecolors='#FFAAAA')
        plt.plot(self.X, self.y, "bo")

        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$y$", fontsize=18, rotation=0)
        plt.legend(loc="best", fontsize=18)
        plt.axis(axes)
def predict_SVM():
    """Fit a LinearSVR on the module-level training data and print its test reports.

    Predictions are rounded to the nearest integer before being passed to
    ``classification_report`` and to the RMSE computation.
    """
    svclassifier = LinearSVR(random_state=50, max_iter=100000, epsilon=0, tol=1e-9)

    # .toarray() gives a plain ndarray; .todense() returns an np.matrix,
    # which recent scikit-learn releases reject as input.
    svclassifier.fit(X_train_csr.toarray(), y_train_1)
    scv_test_predict = svclassifier.predict(X_test_csr.toarray())
    print(scv_test_predict)

    # Round once and reuse for both reports.
    rounded_predict = np.rint(scv_test_predict)
    print(classification_report(y_test_1, rounded_predict))
    print("RMSE for Neural Random SVR Classifier",
          sqrt(mean_squared_error(y_test_1, rounded_predict)))
def Linear_SVR(Xtrain, Xtest, ytrain, ytest):
    """Print the mean test R2 of LinearSVR over ten values of C (0.25 to 2.5).

    Every model is fitted on the training data and scored on the same test
    data; no cross-validation is involved despite the original list name.
    """
    r2_scores = []
    parameters = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5]
    for c_value in parameters:
        clf = LinearSVR(loss='squared_epsilon_insensitive', C=c_value)
        clf.fit(Xtrain, ytrain)
        y_pred = clf.predict(Xtest)
        r2_scores.append(metrics.r2_score(ytest, y_pred))

    print("LinearSVR")
    # Single-argument print(...) behaves identically on Python 2 and 3; the
    # bare print statement used before is a syntax error on Python 3.
    print(sum(r2_scores) / float(len(r2_scores)))
def main():
    """Train a LinearSVR and a randomized-search SVR on the blood-glucose data and print their errors.

    Reads the three CSV files from the working directory. The label encoder
    and the scaler are fitted on the training data only and then applied to
    the test data.
    """
    # Load the data
    train_data = pd.read_csv('d_train_20180102.csv', encoding='GBK')
    train_bloods = train_data['血糖'].astype(float)
    test_data = pd.read_csv('d_test_A_20180102.csv', encoding='GBK')
    test_bloods = pd.read_csv('d_answer_a_20180128.csv', encoding='GBK').astype(float)

    train_data = train_data.drop(['id', '体检日期'], axis=1)
    test_data = test_data.drop(['id', '体检日期'], axis=1)
    train_data = train_data.drop(
        ['乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体', '血糖'], axis=1)
    test_data = test_data.drop(
        ['乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体'], axis=1)

    # Encode the gender column. The encoder is fitted on the training data
    # and only applied to the test data, so both use the same mapping.
    encoder = LabelEncoder()
    train_data['性别'] = encoder.fit_transform(train_data['性别'])
    test_data['性别'] = encoder.transform(test_data['性别'])

    # astype returns a new frame; the result has to be kept.
    train_data = train_data.astype(float)
    test_data = test_data.astype(float)

    # Fill missing values with each column's own mean (per data set, as before).
    train_data = train_data.fillna(train_data.mean())
    test_data = test_data.fillna(test_data.mean())

    # Standardise: statistics come from the training data only.
    scaler = StandardScaler()
    train_data = pd.DataFrame(scaler.fit_transform(train_data))
    test_data = pd.DataFrame(scaler.transform(test_data))

    # Linear SVR for the regression
    lin_svr = LinearSVR(random_state=42)
    lin_svr.fit(train_data, train_bloods)
    predict_bloods = lin_svr.predict(test_data)
    mse = mean_squared_error(test_bloods, predict_bloods)
    print(mse)
    print(np.sqrt(mse))

    # Each entry must be a single distribution (or a list of plain values):
    # reciprocal takes its two bounds as separate arguments, and a list of
    # distribution objects would be sampled as if the objects were values.
    # NOTE(review): scipy's uniform(1, 10) is loc=1, scale=10 -- confirm that
    # is the intended range for C.
    param_distributions = {
        'gamma': reciprocal(0.001, 0.1),
        'C': uniform(1, 10),
    }
    rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=4,
                                       verbose=2, cv=3, random_state=42)
    # The target stays one-dimensional, which is what the estimator expects.
    rnd_search_cv.fit(train_data, train_bloods)

    y_pred = rnd_search_cv.best_estimator_.predict(train_data)
    mse = mean_squared_error(train_bloods, y_pred)
    print(np.sqrt(mse))

    y_pred = rnd_search_cv.best_estimator_.predict(test_data)
    mse = mean_squared_error(test_bloods, y_pred)
    print(np.sqrt(mse))
class SVRR(object):
    """LinearSVR adapter that accepts pandas inputs."""

    def __init__(self, C):
        self.regression = LinearSVR(C=C)

    def fit(self, xs, ys):
        """Fit on ``xs.values`` against the ``'y'`` entry of ``ys``."""
        feature_values = xs.values
        target = ys['y']
        self.regression.fit(feature_values, target)

    def predict(self, xs):
        """Return the predictions for ``xs.values``."""
        return self.regression.predict(xs.values)
class LinearSVRPermuteCoef:
    """LinearSVR wrapper that records the min and max coefficient of every fit.

    The recorded values live in the module-level ``coeffs_state`` dictionary
    under the keys ``'min'`` and ``'max'`` and are therefore shared by all
    instances.
    """

    def __init__(self, **kwargs):
        self.model = LinearSVR(**kwargs)

    def fit(self, X, y):
        """Fit the model, expose ``coef_``/``intercept_`` and record the coefficient extremes."""
        self.model.fit(X, y)
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_

        # 'max' is recorded before 'min', as in the original.
        coeffs_state['max'].append(np.max(self.coef_))
        coeffs_state['min'].append(np.min(self.coef_))
        return self

    def get_params(self, deep=True):
        """Return the wrapped model's parameters."""
        return self.model.get_params(deep)

    def set_params(self, **kwargs):
        """Set parameters on the wrapped model and return ``self``."""
        self.model.set_params(**kwargs)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions."""
        return self.model.predict(X)

    def score(self, X, y, sample_weight=None):
        """Return the wrapped model's score; ``sample_weight`` is forwarded only when given."""
        if sample_weight is None:
            return self.model.score(X, y)
        return self.model.score(X, y, sample_weight)

    @staticmethod
    def permute_min_coefs():
        """Return the list of recorded minimum coefficients."""
        return coeffs_state['min']

    @staticmethod
    def permute_max_coefs():
        """Return the list of recorded maximum coefficients."""
        return coeffs_state['max']

    @staticmethod
    def reset_perm_coefs():
        """Replace both recorded lists with fresh empty lists."""
        coeffs_state['min'] = []
        coeffs_state['max'] = []
def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Fit a linear support vector regression model, score it on the test data
    and pickle the model together with its metrics.

    The output file ``../trained_networks/svm_<n_features>_data.pkl`` holds,
    in this order: the model, mean absolute error, mean squared error,
    median absolute error, R^2, explained variance score and the predictions.

    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: integer used only to build the output file name
    :return: None
    """
    # A fixed integer random state keeps the fit reproducible.
    regressor = LinearSVR(random_state=1, dual=False, epsilon=0,
                          loss='squared_epsilon_insensitive')
    regressor.fit(x_train, y_train)
    predictions = regressor.predict(x_test)

    # Regression metrics, computed in the order they are written out.
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, predictions)
    mean_sq = sklearn.metrics.mean_squared_error(y_test, predictions)
    median_abs = sklearn.metrics.median_absolute_error(y_test, predictions)
    r2 = sklearn.metrics.r2_score(y_test, predictions)
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, predictions)

    payload = (regressor, mean_abs, mean_sq, median_abs, r2, exp_var_score,
               predictions)
    with open('../trained_networks/svm_%d_data.pkl' % n_features, 'wb') as results:
        # One pickle record per item; readers must load them in the same order.
        for item in payload:
            pickle.dump(item, results, pickle.HIGHEST_PROTOCOL)

    return
# Feature columns used by the stage 1 models.
cat_vars = ['DayOfWeek','Promo','StateHoliday','SchoolHoliday','StoreType','Assortment','CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval','Day','Month','Year']
num_vars = ['Open','Store','CompetitionDistance','ratio1','ratio2']

# Hold out 1.2% of the training rows for validation.
X_trn, X_val = train_test_split(train, test_size=0.012, random_state=10)

# Single-argument print(...) gives the same output on Python 2 and 3.
print('Training Stage 1 Models')

#train svm
svm1 = LinearSVR(verbose=True)
svm1.fit(X_trn[cat_vars+num_vars],X_trn['Sales'])
# Predictions for the whole training frame.
svm1_feature = svm1.predict(train[cat_vars+num_vars])
preds = svm1.predict(X_val[cat_vars+num_vars])
# Root mean squared relative error after exp() of predictions and targets
# (presumably 'Sales' holds log-transformed values -- confirm upstream).
svm1_val_error = (np.mean(((np.exp(preds)-np.exp(X_val['Sales']))/(np.exp(X_val['Sales'])+1))**2))**0.5
# Same text as the former two-item print statement: 'svm ', a space, the value.
print('svm  %s' % svm1_val_error)

#train xgb
dtrain = xgb.DMatrix(X_trn[cat_vars+num_vars],X_trn['Sales'])
dvalid = xgb.DMatrix(X_val[cat_vars+num_vars],X_val['Sales'])
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
num_boost_round = 50
params1 = {"objective": "reg:linear","booster" : "gbtree",
           "eta": 0.5,"max_depth": 2,"subsample": 0.5,"colsample_bytree": 0.4,
           "nthread":4,"silent": 1,"seed": 1301}
gbm1 = xgb.train(params1, dtrain, num_boost_round, evals=watchlist,early_stopping_rounds=50,
                 feval=rmspe_xg, verbose_eval=True)
linsvr = LinearSVR(epsilon=0.1, tol=1e-4, C=1.0, loss='squared_epsilon_insensitive')
linsvr.fit(explanatory_df, response_series)
# Score the model that was just fitted; the original took the score from a
# different object named `svr`.
linsvr_rsq[c] = linsvr.score(explanatory_df, response_series)

# prediction and linear extrapolation of training data set to get further predictions.
test_cluster = train_cluster.copy()
explanatory_testdf = test_cluster[explanatory_features]
response_testseries = test_cluster.y

# The in-sample predictions do not change inside the loop, so compute them once.
train_predictions = linsvr.predict(explanatory_df)
for i in range(0,(len(cluster_i) - 5)):
    # Row i: five consecutive values of cluster_i followed by the model's prediction.
    test_cluster.loc[i] = [cluster_i.iloc[i], cluster_i.iloc[i+1], cluster_i.iloc[i+2],
                           cluster_i.iloc[i+3], cluster_i.iloc[i+4], train_predictions[i]]

# further running time series to predict into the future
j = len(test_cluster) - 1
for i in range(j, j+forecast_years):
    explanatory_testdf = test_cluster[explanatory_features]
    # NOTE(review): .ix and DataFrame.append are unavailable in current
    # pandas; left unchanged because the index layout of train_cluster is not
    # visible here -- migrate to .iloc/.loc and pd.concat once confirmed.
    test_list = test_cluster.ix[i,1:6].tolist()
    y_est = linsvr.predict(explanatory_testdf)
    test_list.append(y_est[i])
    test_series = pd.Series(test_list, index = train_cluster.columns)
    test_cluster = test_cluster.append(test_series, ignore_index = True)

linsvr_test_clustery[c] = test_cluster['y']
linsvr_residuals = test_cluster['y'][0:len(train_cluster)] - train_cluster['y']
linsvr_RMSE[c] = (((linsvr_residuals)**2).mean())**(0.5)
class TextLearner(object):
    """Text-learning pipeline built from pickled rows.

    The stages are: load pickled rows (`load_data`), split them into texts
    and labels (`loadXY`), TF-IDF vectorise (`featurizeXY`), chi-squared
    feature selection (`reduceDimension`), fit (`trainModel`), then score
    and report (`testModel`, `getReport`, `crossVal`).

    The instance is usable as a context manager; leaving the `with` block
    drops every loaded or fitted object.
    """

    def __init__(self, data_path, model_path="./", name=""):
        """Store the paths/name and start with empty data and no fitted objects."""
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []  # not only train but general purpose too
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        """Release all loaded data and fitted objects; exceptions propagate."""
        self.DesignMatrix = []
        self.TestMatrix = []
        self.clearOld()
        self.F = None

    def addModelDetails(self, model_p, name=""):
        """Set the output directory prefix and the model name used when saving."""
        self.name = name
        self.model_path = model_p

    def load_data(self, TrTe=0):
        """Unpickle `data_path` into the train (TrTe=0) or test (TrTe=1) matrix.

        Returns the length of the entry at index 1 of the loaded matrix, or
        None for any other `TrTe` value.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point data_path at trusted files.
        with open(self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        """Reset the X/y containers and every fitted object (keeps the loaded matrices)."""
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None

    def process(self, text, default=0):
        """Normalise one text.

        With default == 0 the text is stripped, lower-cased and UTF-8
        encoded; otherwise it is delegated to the Filter instance.
        """
        if default == 0:
            return text.strip().lower().encode("utf-8")
        return self.F.process(text)

    def loadXY(self, TrTe=0, feature_index=0, label_index=1):
        """Fill X/y from the train (TrTe=0) or test (TrTe=1) matrix.

        Each row contributes `row[feature_index]` (passed through `process`)
        and `row[label_index]`; the results are stored as NumPy arrays.
        """
        if TrTe == 0:
            for row in self.DesignMatrix:
                self.X_train.append(self.process(row[feature_index]))
                self.y_train.append(row[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)
        elif TrTe == 1:
            for row in self.TestMatrix:
                self.X_test.append(self.process(row[feature_index]))
                self.y_test.append(row[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)

    def featurizeXY(self, only_train=1):
        """Fit a TF-IDF vectoriser on X_train; also transform X_test when only_train == 0."""
        # Extracts Features
        sw = ['a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
              'been', 'being', 'but', 'by', 'can', 'could', 'did', 'do',
              'does', 'each', 'for', 'from', 'had', 'has', 'have', 'in',
              'into', 'is', "isn't", 'it', "it'd", "it'll", "it's", 'its',
              'of', 'on', 'or', 'that', "that's", 'thats', 'the', 'there',
              "there's", 'theres', 'these', 'this', 'those', 'to', 'under',
              'until', 'up', 'were', 'will', 'with', 'would']
        self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                          stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            self.X_test = self.vectorizer.transform(self.X_test)

    @staticmethod
    def _feature_budget(n_features, percent):
        """Return how many features `percent` percent of `n_features` is (truncated)."""
        # 100.0 forces true division; with integer operands Python 2 would
        # evaluate percent/100 to 0 and select no features at all.
        return int(n_features * (percent / 100.0))

    def reduceDimension(self, only_train=1, percent=50):
        """Keep the best `percent` percent of features by chi-squared score.

        Updates X_train and feature_names (stored as a NumPy array), and
        X_test as well when only_train == 0.
        """
        # Reduce dimensions / select best of features
        n_samples, n_features = self.X_train.shape
        k = self._feature_budget(n_features, percent)
        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        kept = self.chi2.get_support(indices=True)
        self.feature_names = np.asarray([self.feature_names[i] for i in kept])
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)

    def trainModel(self, Model="default"):
        """Fit `Model` on X_train/y_train; "default" means a squared-loss primal LinearSVR."""
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',
                                     dual=False, tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)

    def testModel(self, approx=1):
        """Predict X_test, store the result in y_pred and return the accuracy.

        With approx == 1 the real-valued predictions are mapped to +1.0
        (value > 0) or -1.0 before scoring.
        """
        # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))
        if approx == 1:
            # To convert real valued results to binary for scoring
            self.y_pred = [1.0 if y > 0.0 else -1.0 for y in self.y_pred]
        return metrics.accuracy_score(self.y_test, self.y_pred)

    def getReport(self, save=1, get_top_words=0):
        """Build a text report for the latest predictions and return it.

        The report holds accuracy, classification report and confusion
        matrix. With get_top_words == 1 and a model exposing `coef_`, it also
        holds the highest- and lowest-weighted feature names. With save == 1
        it is written to `<model_path>report.txt`.
        """
        report = ""
        if get_top_words == 1 and hasattr(self.mlModel, 'coef_'):
            coef = self.mlModel.coef_
            # LinearSVR exposes a 1-D coef_; view it as a single row so the
            # shape[1] / [0] indexing below works for it as well.
            if np.ndim(coef) == 1:
                coef = np.reshape(coef, (1, -1))
            report += "Dimensionality: " + str(coef.shape[1])
            report += "\nDensity: " + str(density(self.mlModel.coef_))
            rank = np.argsort(coef[0])
            # NOTE(review): the label below says "Top 10" but 20 entries are
            # taken from each end -- confirm which is intended.
            top10 = rank[-20:]
            bottom10 = rank[:20]
            report += "\n\nTop 10 keywords: "
            report += "\nPositive: " + (" ".join(self.feature_names[top10]))
            report += "\nNegative: " + (" ".join(self.feature_names[bottom10]))
        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(metrics.classification_report(
            self.y_test, self.y_pred, target_names=["Negative", "Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(metrics.confusion_matrix(self.y_test, self.y_pred)) + "\n\n"
        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)
        return report

    def crossVal(self, folds=5, dim_red=50, full_iter=0, save=1):
        """Run stratified k-fold evaluation and return the concatenated reports.

        Only the first fold is evaluated unless full_iter == 1. With
        save == 1 the result is written to `<model_path>master_report.txt`.

        Caution: resets train and test X, y to the last evaluated fold.
        """
        skf = cross_validation.StratifiedKFold(self.y_train, n_folds=folds,
                                               shuffle=True)
        print(skf)
        master_report = ""
        X_copy = self.X_train
        y_copy = self.y_train
        for train_index, test_index in skf:
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0, dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save=0, get_top_words=0)
            if full_iter != 1:
                break
        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)
        return master_report

    def save_obj(self, obj, name):
        """Pickle `obj` (protocol 2) to `<model_path><name>.pkl`."""
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, protocol=2)

    def saveModel(self):
        """Pickle the model, the vectoriser and the feature selector into the model path."""
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")

    def plot(self):
        '''
        beta (Just plotting the model) (Not working)
        '''
        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')
def linearSVR(train, trainLable, testData):
    """Fit a default LinearSVR on (train, trainLable) and return its predictions for testData."""
    model = LinearSVR()
    # fit() returns the estimator itself, so the calls can be chained.
    return model.fit(train, trainLable).predict(testData)
# Choose C by cross-validation over C = 2**-10 ... 2**29.
best_cv_score = -1e+30
for log2c in np.arange(-10, 30, 1):
    # Float base: NumPy rejects an integer base raised to a negative NumPy
    # integer exponent, which is what the first ten grid points would be.
    c_candidate = 2.0 ** log2c
    clf = LinearSVR(C=c_candidate, epsilon=0.0001)
    # cross_val_score fits clones of `clf` itself, so no separate fit on the
    # full data is needed here.
    # cv=sample_num: leave-one-out, presumably (sample_num looks like the
    # number of samples -- confirm).
    # NOTE(review): recent scikit-learn releases name this scorer
    # 'neg_mean_squared_error'; kept as-is because the installed version is
    # not visible from here.
    cv_score = cross_val_score(cv=sample_num, estimator=clf,
                               X=x_input_minmax, y=y_input,
                               scoring='mean_squared_error').mean()
    print(cv_score)
    if cv_score > best_cv_score:
        best_cv_score = cv_score
        bestc = c_candidate

# Predict using the selected parameter.
clf = LinearSVR(C=bestc, epsilon=0.0001)
clf.fit(x_input_minmax, y_input)
y_pred = clf.predict(x_input_minmax)
# y_pred = y_scaler.inverse_transform(y_pred.reshape(-1,1))

# Plot target and prediction against one input column.
view_point = 5
plt.plot(x_input[:, view_point], y_input, 'bo-',
         x_input[:, view_point], y_pred, 'rs-')
plt.grid(True)
plt.legend(['y', 'y_pred'])
plt.show()
combined = np.append(X, np.matrix(Y).T, axis=1) np.random.shuffle(combined) tail_size = -1 * size last_column = X.shape[1] training_labels = combined[:tail_size, last_column] training_data = combined[:tail_size, :-2] test_data = combined[tail_size:, :-2] actual_labels = combined[tail_size:, last_column] return training_data, np.ravel(training_labels), test_data, np.ravel(actual_labels) training = open('author_features') NO_TRAINING_SAMPLES = 6000 NO_OF_AUTHORS = 10000 matrix = dok_matrix((NO_TRAINING_SAMPLES, NO_OF_AUTHORS), dtype=np.int) for line in training.readlines(): values = line.rstrip().split() matrix[int(values[0]), int(values[1])] = 1 labels_file = open('year_training_labels') labels = [int(x) for x in labels_file.readline().rstrip().split()] training_matrix = matrix[:4498] training_data, training_labels, test_data, actual_labels = sample(training_matrix, labels) classifier = LinearSVR() classifier.fit(training_data, training_labels) output = classifier.predict(test_data) for index, predicted in enumerate(output): print '%s %s' % (predicted, actual_labels[index]) print metrics.explained_variance_score(actual_labels, output)
# NOTE(review): the statements down to `linsvr_test[cname] = tpreds` use the
# fold index `i`, so they presumably run once per fold inside a loop whose
# header is outside this chunk; the two to_csv calls look like they follow it.
print("----------- Fold %d -----------------------" % i)
print("--------------------------------------------")

# Rows whose Id is listed in column `i` of fold_ids form the validation set;
# all remaining rows are used for training.
val_id = fold_ids.ix[:, i].dropna()
idx = train["Id"].isin(list(val_id))
trainingSet = train[~idx]
validationSet = train[idx]

tr_X = np.matrix(trainingSet[feature_names])
tr_Y = np.array(trainingSet["Response"])
val_X = np.matrix(validationSet[feature_names])
val_Y = np.array(validationSet["Response"])

regm = LinearSVR(C=0.06, epsilon=0.45, tol=1e-5, dual=True, verbose=True,
                 random_state=133)
regm.fit(tr_X, tr_Y)

# Out-of-fold predictions, accumulated across folds.
preds = regm.predict(val_X)
df = pd.DataFrame({"Id": validationSet["Id"],
                   "ground_truth": validationSet["Response"],
                   "linsvr_preds": preds})
linsvr_val = linsvr_val.append(df, ignore_index=True)

# Test-set predictions of this fold's model, one column per fold.
tpreds = regm.predict(test_X)
# repr() is the portable spelling of the Python 2 backtick operator.
cname = "Fold" + repr(i)
linsvr_test[cname] = tpreds

linsvr_val.to_csv("ensemble2/linsvr_val.csv")
linsvr_test.to_csv("ensemble2/linsvr_test.csv")
# NOTE(review): the part down to the three prints indexes with `test` and
# uses X1/Y1, so it presumably sits inside a cross-validation loop whose
# header is outside this chunk.
X2 = X_train_reduced[test]
Y2 = Y_train_raw[test]

## Train Classifiers on fold
rdg_clf = Ridge(alpha=0.5)
rdg_clf.fit(X1, Y1)
lso_clf = Lasso(alpha=0.6257)
lso_clf.fit(X1, Y1)
svr_clf = LinearSVR(C=1e3)
svr_clf.fit(X1, Y1)

## Score Classifiers on fold
rdg_clf_score = rdg_clf.score(X2, Y2)
lso_clf_score = lso_clf.score(X2, Y2)
svr_clf_score = svr_clf.score(X2, Y2)

# Pre-formatted single-argument print: same output on Python 2 and 3.
print("{} {}".format("Ridge: ", rdg_clf_score))
print("{} {}".format("Lasso: ", lso_clf_score))
# The label says RBF, but the scored estimator is the LinearSVR above.
print("{} {}".format("SVR_RBF: ", svr_clf_score))

## Train final Classifiers
# clf = Ridge(alpha=.5)
# LinearSVR has no `gamma` parameter (that belongs to kernel SVR); passing it
# raised TypeError before any fitting happened.
clf = LinearSVR(C=1e3)
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
np.savetxt("prediction.csv", Y_predicted, fmt="%.5f", delimiter=",")
# Second regressor with a narrower epsilon tube; svm_reg1 is created before
# this chunk.
svm_reg2 = LinearSVR(epsilon=0.5)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)


def find_support_vectors(svm_reg, X, y):
    """Return the indices of samples whose absolute residual is at least svm_reg.epsilon."""
    residuals = np.abs(y - svm_reg.predict(X))
    return np.argwhere(residuals >= svm_reg.epsilon)


# Attach the indices to the estimators so the plotting helper can read them.
svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)

# Prediction of the first model at a single point x1 = 1.
eps_x1 = 1
eps_y_pred = svm_reg1.predict([[eps_x1]])


def plot_svm_regression(svm_reg, X, y, axes):
    """Draw the fitted line, its +/- epsilon band, the highlighted
    `support_` samples and the data on the current figure.

    axes is [x_min, x_max, y_min, y_max] as accepted by plt.axis.
    """
    grid = np.linspace(axes[0], axes[1], 100).reshape(100, 1)
    fitted = svm_reg.predict(grid)
    margin = svm_reg.epsilon
    plt.plot(grid, fitted, "k-", linewidth=2, label=r"$\hat{y}$")
    # Upper and lower edge of the epsilon-insensitive band.
    for shift in (margin, -margin):
        plt.plot(grid, fitted + shift, "k--")
    marked = svm_reg.support_
    plt.scatter(X[marked], y[marked], s=180, facecolors="#FFAAAA")
    plt.plot(X, y, "bo")
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.legend(loc="upper left", fontsize=18)
    plt.axis(axes)


plt.figure(figsize=(9, 4))
# Collect four feature columns and one target column from every CSV row.
# `data`, `target` and `data_file` are set up before this chunk.
for row in csv.reader(data_file):
    data += [[row[col] for col in (0, 4, 6, 10)]]
    target += [row[9]]

# The first row is skipped (presumably a header -- confirm) before cleaning.
data, target = Lin_clean_data(data[1:], target[1:], 2)

# Train on the leading rows and test on the following 20 % of `point` rows.
# NOTE(review): the train slices stop at point-1 while the test slices start
# at point, so the row at index point-1 is used by neither -- confirm.
point = 2000
test_stop = point + int(point * 0.2)
X_train, y_train = data[:point - 1], target[:point - 1]
X_test, y_test = data[point:test_stop], target[point:test_stop]

svr = LinearSVR(C=0.1)
svr_model = svr.fit(X_train, y_train)
lin = svr.predict(X_train)
lin_test = svr.predict(X_test)
lin, lin_test = data_normalize(y_train, y_test, lin, lin_test)

# Score and mean absolute error on both splits.
print("Train score : ", score(y_train, lin))
print("Train average error : ", sum(abs(y_train - lin)) / float(len(y_train)))
print("Fit score : ", score(y_test, lin_test))
print("Fit average error : ", sum(abs(y_test - lin_test)) / float(len(y_test)))

# Plot predictions against targets and save the figure.
figure1 = plt.figure(1, figsize=[20, 10])
draw_pic(range(len(X_train)), range(len(X_test)), lin, lin_test, y_train,
         y_test, label='lin', figure=figure1)
figure1.savefig("C:/Users/sean/Desktop/SVR_DATA/linSVR.png", dpi=300, format="png")
plt.close(1)