def load_regressor_SVR(X_train, y_train):
    """Rebuild an SVR from persisted hyper-parameters and fit it.

    Parameters
    ----------
    X_train, y_train : training features / targets, passed straight to fit().

    Returns
    -------
    SVR : a freshly fitted regressor configured with the persisted params.
    """
    # NOTE(review): the pickle is unpacked into set_params, so it must hold a
    # dict of hyper-parameters, not a fitted estimator (despite the name) —
    # confirm against the code that writes regressor_SVR.pkl.
    best_estimator_svr = joblib.load('./built_models/regressor_SVR.pkl')
    regressor_svr = SVR()
    regressor_svr.set_params(**best_estimator_svr)
    # print() call form works on Python 2 and 3 (was a py2-only statement).
    print(regressor_svr)
    regressor_svr.fit(X_train, y_train)
    return regressor_svr
def svr(x_train, x_test, y_train, y_test, model_processing_params):
    """Grid-search an SVR over params from tuning/regression/svr.json, then
    persist the winning model and its metrics under ./models/regression/svr.

    Side effects: prints metrics, writes model.dump, model_stats.json and
    model_processing_params.json.
    """
    with open("tuning/regression/svr.json") as svr_params_file:
        file_data = json.load(svr_params_file)
    param_grid = file_data["param_grid"]

    svr_instance = SVR()
    best_score = 0
    best_params = None
    first_iter = True
    for g in ParameterGrid(param_grid):
        svr_instance.set_params(**g)
        svr_instance.fit(x_train, y_train)
        new_score = svr_instance.score(x_test, y_test)
        # save if best
        if first_iter:
            best_score = new_score
            best_params = g
            first_iter = False
        elif new_score > best_score:
            best_score = new_score
            best_params = g

    # BUG FIX: the loop leaves svr_instance configured and fitted with the
    # *last* grid point, not the best one, yet it was then evaluated and
    # dumped as if it were the winner.  Re-fit with the best parameters
    # before computing final metrics and persisting.
    svr_instance.set_params(**best_params)
    svr_instance.fit(x_train, y_train)

    all_x = np.vstack((x_train, x_test))
    all_y = np.append(y_train, y_test)
    r2_score = svr_instance.score(all_x, all_y)
    y_test_predict = svr_instance.predict(x_test)
    mse = mean_squared_error(y_test, y_test_predict)

    print("\nSVR")
    print("MSE: " + str(mse))
    print("R2 score: " + str(r2_score))
    print("Best Score:" + str(best_score))
    print("Best params:")
    print(best_params)

    if not path.exists("./models/regression/svr"):
        makedirs("./models/regression/svr")
    dump(svr_instance, "./models/regression/svr/model.dump")
    model_stats = {
        "mse": mse,
        "r2_score": r2_score,
        "best_score": best_score,
        "best_params": best_params
    }
    with open("./models/regression/svr/model_stats.json", "w") as outfile:
        json.dump(model_stats, outfile, indent=4)
    with open("./models/regression/svr/model_processing_params.json",
              "w") as outfile:
        json.dump(model_processing_params, outfile, indent=4)
def svr_cv(cv_outer, data):
    """Nested cross-validation of an RBF SVR.

    For each outer fold, a RandomizedSearchCV (3-fold inner CV, MAE scoring)
    picks hyper-parameters, the model is refit on the outer training split,
    and per-fold MAE / RMSE / MedAE / R2 are recorded.

    Returns (MAE_list, RMSE_list, MedAE_list, R2_list, best_params_list).
    """
    mae_scores = []
    rmse_scores = []
    medae_scores = []
    r2_scores = []
    chosen_params = []

    for fold_train, fold_test in cv_outer:
        # Columns 6+ are features, column 3 is the target.
        X_train = data.iloc[fold_train, 6:].values
        y_train = data.iloc[fold_train, 3].values
        X_test = data.iloc[fold_test, 6:].values
        y_test = data.iloc[fold_test, 3].values

        inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)
        model = SVR(kernel='rbf')
        search_space = {
            'C': np.arange(100, 250, 10),
            'epsilon': [0.0001, 0.001, 0.01],
            'gamma': [0.005, 0.006, 0.007, 0.008, 0.009, 0.01]
        }
        search = RandomizedSearchCV(model,
                                    search_space,
                                    cv=inner_cv,
                                    scoring='neg_mean_absolute_error',
                                    verbose=0,
                                    n_jobs=-1,
                                    n_iter=100,
                                    refit=False,
                                    random_state=0)
        search.fit(X_train, y_train)
        chosen_params.append(search.best_params_)

        # refit=False above, so refit manually with the winning params.
        model.set_params(**search.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae_scores.append(MAE(y_test, y_pred))
        rmse_scores.append(mean_squared_error(y_test, y_pred, squared=False))
        medae_scores.append(median_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    return mae_scores, rmse_scores, medae_scores, r2_scores, chosen_params
def reg_objective(hyperparams):
    """Hyperopt objective: validation RMSE of an SVR with `hyperparams`.

    Reads module-level X_train / y_train / X_val / y_val.  Returns the dict
    shape hyperopt expects: {'loss': <rmse>, 'status': STATUS_OK}.
    """
    # set_params returns the estimator itself, so the calls chain.
    estimator = SVR().set_params(**hyperparams)
    fitted = estimator.fit(X_train, y_train)
    residuals = fitted.predict(X_val) - y_val
    rmse = np.sqrt(np.mean(residuals ** 2))
    # Degenerate configurations can yield NaN — penalize heavily instead.
    if np.isnan(rmse):
        rmse = 1e8
    return {'loss': rmse, 'status': STATUS_OK}
def svr_from_config(config):
    """Reconstruct a fitted SVR from a serialized config dict.

    Parameters
    ----------
    config : dict with keys 'params' (hyper-parameters for set_params),
        'attributes' (learned attribute values, lists for arrays) and
        'attributes_types' (per-attribute numpy dtype names).

    Returns
    -------
    SVR : estimator with hyper-parameters and learned attributes restored.
    """
    m = SVR()
    m.set_params(**config['params'])
    for attr, v in config['attributes'].items():
        # dtype falls back to float64 when not recorded for this attribute.
        dtype = config['attributes_types'].get(attr, 'float64')
        if isinstance(v, list):
            v = np.array(v, dtype=dtype)
        # Idiomatic setattr() instead of calling __setattr__ directly.
        setattr(m, attr, v)
    return m
def update_event(self, input_called=-1):
    """Node-editor callback: when exec input 0 fires, fit an SVR and emit it.

    Inputs: 1 = optional hyper-parameter dict, 2 = X, 3 = y.
    Output: fitted regressor on data output 1, then exec output 0 fires.
    """
    if input_called == 0:
        regr = SVR()
        # Identity comparison with None (was '!= None').
        if self.input(1) is not None:
            regr.set_params(**self.input(1))
        X = self.input(2)
        y = self.input(3)
        regr.fit(X, y)
        self.set_output_val(1, regr)
        self.exec_output(0)
def train_and_save_final_model(X, y, X_train, y_train, params,
                               save_model_file_path, test_data):
    """Fit an SVR with `params` and pickle it to <save_model_file_path>svr.sav.

    When test_data is None the model trains on the train split only;
    otherwise it trains on the full (X, y) dataset.
    """
    svr = SVR()
    svr.set_params(**params)
    # Identity comparison with None (was '== None').
    if test_data is None:
        svr.fit(X_train, y_train)
    else:
        svr.fit(X, y)
    # save model — context manager ensures the file handle is closed
    # (the original pickle.dump(svr, open(...)) leaked the handle).
    model_file_path = save_model_file_path + 'svr.sav'
    with open(model_file_path, 'wb') as model_file:
        pickle.dump(svr, model_file)
def SVM_regression(X_train, y_train, X_test, params):
    """SVR prediction with three tuning modes, selected by the module-level
    `hyperparameters` flag: 'RandomGridSearch', 'GridSearch' or 'Custom'.

    Returns (predicted, validation_score).
    """
    # Stack train and test so both get standardized with the same statistics,
    # then split again (the test sample is the final row).
    pooled = np.vstack((X_train, X_test))
    pooled = preprocessing.scale(pooled)
    X_train = pooled[:-1, :]
    X_test = pooled[-1:, :]

    if hyperparameters == 'RandomGridSearch':
        # Random search over the grid with 3-fold cross-validation.
        grid = {'C': [0.001, 0.01, 0.1, 1, 10],
                'epsilon': [0.1, 0.4, 0.7, 1.0]}
        estimator = SVR(kernel='linear', gamma='scale')
        optimizer = RandomizedSearchCV(estimator, grid, n_iter=5, cv=3,
                                       iid='deprecated',
                                       scoring='neg_mean_absolute_error')
        optimizer.fit(X_train, np.ravel(y_train))
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'GridSearch':
        # Exhaustive search over the same grid, 3-fold cross-validation.
        grid = {'C': [0.001, 0.01, 0.1, 1, 10],
                'epsilon': [0.1, 0.4, 0.7, 1.0]}
        estimator = SVR(kernel='linear', gamma='scale')
        optimizer = GridSearchCV(estimator, grid, cv=3, iid='deprecated',
                                 scoring='neg_mean_absolute_error')
        optimizer.fit(X_train, np.ravel(y_train))
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'Custom':
        # Caller-supplied parameters: score them by 3-fold cross-validation,
        # then train on all the data.
        estimator = SVR()
        estimator.set_params(**params)
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(estimator=estimator, X=X_train,
                                           y=np.ravel(y_train), cv=fold,
                                           scoring='neg_mean_absolute_error')
        # NOTE(review): the final fit/predict are read as part of this branch
        # (the original comment before them says "train on all the data") —
        # confirm against the original indentation.
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)
    return (predicted, validation_score)
class SVM():
    """Uniform facade over sklearn's SVC / SVR.

    task='cls' wraps a classifier (SVC); task='prd' wraps a regressor (SVR).
    All methods delegate straight to the wrapped estimator.
    """
    def __init__(self, task='cls', **kwargs):
        if task == 'cls':
            self.svm = SVC(**kwargs)
            self._name = 'SVC'
        elif task == 'prd':
            self.svm = SVR(**kwargs)
            self._name = 'SVR'

    def decision_function(self, X):
        '''
        X (n_samples, n_features)
        return: X (n_samples, n_classes * (n_classes-1) / 2)
        '''
        # Only meaningful for the classifier; implicitly returns None for SVR.
        if self._name == 'SVC':
            return self.svm.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        '''
        X (n_samples, n_features)
        y (n_samples,)
        sample_weight (n_samples,)
        '''
        return self.svm.fit(X, y, sample_weight=sample_weight)

    def get_params(self, deep=True):
        # Delegated so this wrapper remains grid-search compatible.
        return self.svm.get_params(deep=deep)

    def predict(self, X):
        # Predicted labels (SVC) or values (SVR) for X.
        return self.svm.predict(X)

    def score(self, X, y, sample_weight=None):
        '''
        X (n_samples, n_features)
        y (n_samples,) or (n_samples, n_outputs)
        sample_weight (n_samples,), default=None
        '''
        return self.svm.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        '''
        **params dict
        '''
        return self.svm.set_params(**params)
class Baseline:
    # GIST-feature + linear-SVR baseline for predicting distance to a
    # destination type from street images.  Python 2 code (print statements,
    # xrange, integer division).

    def __init__(self, city, dest_name):
        # city / dest_name select the data directory and label column.
        self.city = city
        self.dest_name = dest_name
        print 'Baseline implementation for {:s} : {:s}'.format(
            self.city, self.dest_name)
        # Column index of each destination in the label files.
        # NOTE(review): 'gas_station' and 'high_school' both map to 3 while
        # index 2 is unused — 'gas_station' was probably meant to be 2;
        # confirm against the label-file layout.
        dest_to_idx = {
            'bofa': 0,
            'church': 1,
            'gas_station': 3,
            'high_school': 3,
            'mcdonalds': 4
        }
        self.idx = dest_to_idx[self.dest_name]
        self.base_dir = osp.join('../data/dataset', city)
        self.train_label_filename = osp.join(self.base_dir, 'distance',
                                             'train_labels.txt')
        self.train_im_list_filename = osp.join(self.base_dir, 'distance',
                                               'train_im_list.txt')
        self.test_label_filename = osp.join(self.base_dir, 'distance',
                                            'test_labels.txt')
        self.test_im_list_filename = osp.join(self.base_dir, 'distance',
                                              'test_im_list.txt')
        # Linear SVR; shrinking off and a large kernel cache for speed.
        self.svr = SVR(kernel='linear',
                       shrinking=False,
                       cache_size=10000,
                       verbose=True)
        # self.svr = LinearSVR(verbose=1)

    def collect_train_data_parallel(self):
        # Build self.train_X / self.train_y by extracting GIST features for
        # every training image with a process pool.
        with open(self.train_im_list_filename, 'r') as train_f_im,\
                open(self.train_label_filename, 'r') as train_f_label:
            train_im_names = [
                osp.join('../data/dataset', l.rstrip().split(' ')[0])
                for l in train_f_im
            ]
            train_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in train_f_label
            ]
        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(train_im_names[0])
        gist_features = ge.extract_gist(im)
        self.train_X = np.zeros((len(train_im_names), gist_features.shape[0]),
                                dtype=np.float)
        self.train_y = np.asarray(train_labels)

        # parallel feature extraction!
        print 'Collecting training data'
        pool = Pool(initializer=pool_init, initargs=(256, 256))
        # py2 integer division: 4 chunks of work per worker batch.
        chunksize = len(train_im_names) / 4
        for idx, feat in enumerate(
                pool.imap(gist_wrapper, train_im_names, chunksize)):
            self.train_X[idx, :] = feat
        pool.close()
        pool.join()

    def collect_train_data_serial(self):
        # Same output as collect_train_data_parallel, but reads precomputed
        # GIST features from an LMDB instead of recomputing them.
        with open(self.train_im_list_filename, 'r') as train_f_im,\
                open(self.train_label_filename, 'r') as train_f_label:
            train_im_names = [
                osp.join('../data/dataset', l.rstrip().split(' ')[0])
                for l in train_f_im
            ]
            train_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in train_f_label
            ]
        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(train_im_names[0])
        gist_features = ge.extract_gist(im)
        self.train_X = np.zeros((len(train_im_names), gist_features.shape[0]),
                                dtype=np.float)
        self.train_y = np.asarray(train_labels)

        db = lmdb.open('../data/dataset/gist', map_size=int(1e12),
                       readonly=True)
        txn = db.begin()
        # serial feature extraction!
        print 'Collecting training data'
        for idx, im_name in enumerate(train_im_names):
            if idx % 100 == 0:
                print 'Image {:d} / {:d}'.format(idx, len(train_im_names))
            key = get_key(im_name)
            # Stored features are raw float bytes keyed by image name.
            self.train_X[idx, :] = np.fromstring(txn.get(key))

    def collect_test_data_parallel(self):
        # Test-set counterpart of collect_train_data_parallel.
        with open(self.test_im_list_filename, 'r') as test_f_im,\
                open(self.test_label_filename, 'r') as test_f_label:
            test_im_names = [
                osp.join('../data/dataset', l.rstrip().split(' ')[0])
                for l in test_f_im
            ]
            test_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in test_f_label
            ]
        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(test_im_names[0])
        gist_features = ge.extract_gist(im)
        self.test_X = np.zeros((len(test_im_names), gist_features.shape[0]),
                               dtype=np.float)
        self.test_y = np.asarray(test_labels)

        # parallel feature extraction!
        print 'Collecting testing data'
        pool = Pool(initializer=pool_init, initargs=(256, 256))
        chunksize = len(test_im_names) / 4
        for idx, feat in enumerate(
                pool.imap(gist_wrapper, test_im_names, chunksize)):
            self.test_X[idx, :] = feat
        pool.close()
        pool.join()

    def collect_test_data_serial(self):
        # Test-set counterpart of collect_train_data_serial (LMDB lookup).
        with open(self.test_im_list_filename, 'r') as test_f_im,\
                open(self.test_label_filename, 'r') as test_f_label:
            test_im_names = [
                osp.join('../data/dataset', l.rstrip().split(' ')[0])
                for l in test_f_im
            ]
            test_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in test_f_label
            ]
        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(test_im_names[0])
        gist_features = ge.extract_gist(im)
        self.test_X = np.zeros((len(test_im_names), gist_features.shape[0]),
                               dtype=np.float)
        self.test_y = np.asarray(test_labels)

        db = lmdb.open('../data/dataset/gist', map_size=int(1e12),
                       readonly=True)
        txn = db.begin()
        # serial feature extraction!
        print 'Collecting testing data'
        for idx, im_name in enumerate(test_im_names):
            if idx % 100 == 0:
                print 'Image {:d} / {:d}'.format(idx, len(test_im_names))
            key = get_key(im_name)
            self.test_X[idx, :] = np.fromstring(txn.get(key))

    def train(self, C=1.0, calc_loss=False):
        # Fit the SVR at regularization C; optionally report test-set loss.
        print 'Training with C = {:f}'.format(C)
        p = self.svr.get_params()
        p['C'] = C
        self.svr.set_params(**p)
        self.svr.fit(self.train_X, self.train_y)
        loss = 0
        if calc_loss:
            test_y_pred = self.svr.predict(self.test_X)
            # Euclidean norm of the residual vector (not MSE).
            loss = np.linalg.norm(test_y_pred - self.test_y)
            # score = self.svr.score(self.test_X, self.test_y)
            print 'Loss = {:f}'.format(loss)
        return loss

    def cross_validate(self):
        # Sweep C over 10^-2 .. 10^4 and report the C with the lowest loss.
        C = np.power(10.0, xrange(-2, 5))
        losses = np.array([self.train(c, calc_loss=True) for c in C])
        idx = np.argmin(losses)
        print 'Best C = {:f}'.format(C[idx])

    def save_current_model(self):
        # Persist the fitted SVR next to the label files.
        model_filename = osp.join(self.base_dir, 'distance',
                                  '{:s}.pkl'.format(self.dest_name))
        joblib.dump(self.svr, model_filename)
        print model_filename, 'saved'
## train ## set model # lasso lasso = Lasso(normalize=True) # ridge ridge = Ridge(normalize=True) # elasticnet elasticnet = ElasticNet(normalize=True) # SVR regression svr = SVR() svr.set_params(C=0.045, epsilon=0.06, kernel='linear') # Gradient Boosting Regressor gbr = GradientBoostingRegressor(n_estimators=6000, learning_rate=0.01, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=42) #Random Forest Regressor randomForest = RandomForestRegressor(n_estimators=1200, max_depth=15, min_samples_split=5,
def svm_regression(X_train, y_train, X_test, params, use_cv: bool = True):
    """SVR prediction with optional hyper-parameter tuning.

    With use_cv=False the model is fit directly (params optional) and scored
    by MAE on its own training data.  Otherwise the module-level
    `hyperparameters` flag picks 'RandomGridSearch', 'GridSearch' or 'Custom'.

    Returns (predicted, validation_score).
    """
    # Stack train and test so both get standardized with the same statistics,
    # then split again (the test sample is the final row).
    pooled = np.vstack((X_train, X_test))
    pooled = preprocessing.scale(pooled)
    X_train = pooled[:-1, :]
    X_test = pooled[-1:, :]

    # Not enough points for cross validation: fit directly.
    if use_cv is False:
        model = SVR() if params is None else SVR(**params)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        # Score on the training data itself.
        train_predicted = model.predict(X_train)
        validation_score = mean_absolute_error(np.ravel(y_train),
                                               np.ravel(train_predicted))
        return predicted, validation_score

    if hyperparameters == 'RandomGridSearch':
        # Random grid search with 3-fold cross-validation.
        grid = {'C': [0.001, 0.01, 0.1, 1, 10],
                'epsilon': [0.1, 0.4, 0.7, 1.0]}
        estimator = SVR(kernel='linear', gamma='scale')
        optimizer = RandomizedSearchCV(estimator, grid, n_iter=5, cv=3,
                                       iid='deprecated',
                                       scoring='neg_mean_absolute_error')
        optimizer.fit(X_train, np.ravel(y_train))
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'GridSearch':
        # Exhaustive search over the same grid.
        grid = {'C': [0.001, 0.01, 0.1, 1, 10],
                'epsilon': [0.1, 0.4, 0.7, 1.0]}
        estimator = SVR(kernel='linear', gamma='scale')
        optimizer = GridSearchCV(estimator, grid, cv=3, iid='deprecated',
                                 scoring='neg_mean_absolute_error')
        optimizer.fit(X_train, np.ravel(y_train))
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'Custom':
        # Caller-supplied parameters: score by cross-validation, then train
        # on all the data.
        estimator = SVR()
        estimator.set_params(**params)
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(estimator=estimator, X=X_train,
                                           y=np.ravel(y_train), cv=fold,
                                           scoring='neg_mean_absolute_error')
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)
    return predicted, validation_score
# Scale train (fit) and holdout (transform only) with the same scaler.
X = scaler.fit_transform(X)
Xt = scaler.transform(Xt)

##############################################################
# stacking result = svr + α * rfr + β * gbr
# tune α and β with cross validation
##############################################################
scores = dict()
# NOTE(review): sklearn's old `cross_validation` module API; each *.p pickle
# must hold a dict of tuned hyper-parameters (it is unpacked into set_params).
# The open() handles passed to pickle.load are never closed.
skf = cross_validation.StratifiedKFold(Y, n_folds=3)
for train_index, test_index in skf:
    X1, X2 = X[train_index], X[test_index]
    Y1, Y2 = Y[train_index], Y[test_index]
    # predict with SVR
    svr = SVR()
    svr.set_params(**pickle.load(open("svr.p", "rb")))
    svr.fit(X1, Y1)
    Y_svr = svr.predict(X2)
    # predict with RF
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.set_params(**pickle.load(open("rfr.p", "rb")))
    rfr.fit(X1, Y1)
    Y_rfr = rfr.predict(X2)
    # predict with GBT
    gbr = GradientBoostingRegressor(n_estimators=3000)
    gbr.set_params(**pickle.load(open("gbr.p", "rb")))
    gbr.fit(X1, Y1)
    # NOTE(review): the α/β combination step continues past this chunk.
    Y_gbr = gbr.predict(X2)
def standard_experiment(train_df, test_df, feature_names, args):
    # End-to-end train/evaluate/save pipeline (Python 2: print statements,
    # cPickle, DataFrame.as_matrix).  Trains the regressor named by
    # args.classifier ('svr' / 'rf' / 'elasticnet', or 'baseline' for a
    # post-order heuristic), reports NDCG and MSE on train and test, and
    # optionally persists model, feature importances, predictions and
    # results under args.savename.
    train_df['set'] = "train"  # annotate
    test_df['set'] = "test"  # annotate
    # clip training set, if necessary
    if (0 < args.limit_data < len(train_df)):
        print "Clipping training set to %d comments" % args.limit_data
        train_df = train_df[:args.limit_data]
    # Split into X, y for regression
    target = args.target
    train_X = train_df.filter(feature_names).as_matrix().astype(
        np.float)  # training data
    train_y = train_df.filter([target]).as_matrix().astype(
        np.float)  # training labels
    test_X = test_df.filter(feature_names).as_matrix().astype(
        np.float)  # test data
    test_y = test_df.filter([target
                             ]).as_matrix().astype(np.float)  # ground truth
    # For compatibility, make 1D
    train_y = train_y.reshape((-1, ))
    test_y = test_y.reshape((-1, ))
    print "Training set: %d examples" % (train_X.shape[0], )
    print "Test set: %d examples" % (test_X.shape[0], )
    print "Selected %d features" % (len(feature_names), )
    print 'Features: %s' % (' '.join(feature_names))
    ##
    # Preprocessing: scale data, keep SVM happy
    scaler = preprocessing.StandardScaler()
    train_X = scaler.fit_transform(
        train_X)  # faster than fit, transform separately
    test_X = scaler.transform(test_X)
    if args.classifier != 'baseline':
        if args.stock_params:
            # Fixed, pre-tuned parameters from STANDARD_PARAMS.
            if args.classifier == 'svr':
                print "Initializing SVR model"
                clf = SVR(**STANDARD_PARAMS['svr'])
            elif args.classifier == 'rf':
                print "Initializing RandomForestRegressor model, seed=%d" % args.rfseed
                clf = RandomForestRegressor(random_state=args.rfseed,
                                            **STANDARD_PARAMS['rf'])
            elif args.classifier == 'elasticnet':
                print "Initializing ElasticNet model"
                clf = ElasticNet(max_iter=10000,
                                 **STANDARD_PARAMS['elasticnet'])
            else:
                raise ValueError("Invalid classifier '%s' specified."
                                 % args.classifier)
        else:
            ##
            # Run Grid Search / 10xv on training/dev set
            start = time.time()
            print "== Finding optimal classifier using Grid Search =="
            params, clf = train_optimal_classifier(train_X, train_y,
                                                   classifier=args.classifier,
                                                   rfseed=args.rfseed,
                                                   quickmode=args.quickmode)
            print "Optimal parameters: " + json.dumps(params, indent=4)
            if hasattr(clf, "support_vectors_"):
                print 'Number of support vectors: %d' % len(
                    clf.support_vectors_)
            print "Took %.2f minutes to train" % ((time.time() - start) / 60.0)
        # Re-seed (if applicable) and refit on the full training set; params
        # is refreshed from the estimator either way.
        # NOTE(review): indentation of these three statements reconstructed
        # from a collapsed source — confirm they sit at this level.
        if hasattr(clf, 'random_state'):
            clf.set_params(random_state=args.rfseed)
        clf.fit(train_X, train_y)
        params = clf.get_params()
    ##
    # Set up evaluation function
    if args.ndcg_weight == 'target':
        favfunc = evaluation.fav_target  # score weighting
    else:
        favfunc = evaluation.fav_linear  # rank weighting
    max_K = 20
    # result_label is assigned below; the lambda only reads it when called
    # (late binding), after the assignment.
    eval_func = lambda data: evaluation.ndcg(data,
                                             max_K,
                                             target=args.ndcg_target,
                                             result_label=result_label,
                                             fav_func=favfunc)
    ##
    # Predict scores for training set
    result_label = "pred_%s" % args.target  # e.g. pred_score
    if args.classifier != 'baseline':
        train_pred = clf.predict(train_X)
    else:  # baseline: post order
        train_pred = -1 * train_df['position_rank']
    train_df[result_label] = train_pred
    print 'Performance on training data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_train = eval_func(train_df)
    ndcg_train = eval_func(
        train_df[train_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_train, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(train_y, train_pred)
    ##
    # Predict scores for test set
    if args.classifier != 'baseline':
        test_pred = clf.predict(test_X)
    else:  # baseline: post order
        test_pred = -1 * test_df['position_rank']
    test_df[result_label] = test_pred
    print 'Performance on test data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_test = eval_func(test_df)
    ndcg_test = eval_func(
        test_df[test_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_test, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred)
    ##
    # Save model to disk
    if args.savename and (args.classifier != 'baseline'):
        import cPickle as pickle
        saveas = args.savename + ".model.pkl"
        print "== Saving model as %s ==" % saveas
        with open(saveas, 'w') as f:
            pickle.dump(clf, f)
    ##
    # Get feature importance, if possible
    if args.savename and (args.classifier != 'baseline'):
        feature_importances = get_feature_importance(
            clf, args.classifier, feature_names=feature_names, sorted=True)
        saveas = args.savename + ".topfeatures.txt"
        print "== Recording top features to %s ==" % saveas
        # np.savetxt(saveas, feature_importances)
        # with open(saveas, 'w') as f:
        #     json.dump(feature_importances, f, indent=2)
        with open(saveas, 'w') as f:
            # Left-pad names to the longest so the values column lines up.
            maxlen = max([len(fname) for fname in feature_importances[0]])
            f.write("# Model: %s\n" % args.classifier)
            f.write("# Params: %s\n" % json.dumps(params))
            for fname, val in zip(*feature_importances):
                f.write("%s %.06f\n" %
                        (fname.ljust(maxlen), val))
            f.flush()
    ##
    # Save data to HDF5
    if args.savename:
        # Save score predictions
        fields = [
            "self_id", "parent_id", 'cid', 'sid', 'set', args.target,
            result_label
        ]
        if not args.ndcg_target in fields:
            fields.append(args.ndcg_target)
        saveas = args.savename + ".scores.h5"
        print "== Saving raw predictions as %s ==" % saveas
        outdf = pd.concat([train_df[fields], test_df[fields]],
                          ignore_index=True)
        outdf.to_hdf(saveas, 'data')
        if args.savefull:
            # Concatenate train, test
            df = pd.concat([train_df, test_df], ignore_index=True)
            print "== Exporting data to HDF5 =="
            saveas = args.savename + ".data.h5"
            df.to_hdf(saveas, "data")
            print " [saved as %s]" % saveas
        # Save NDCG calculations
        dd = {
            'k': range(1, max_K + 1),
            'method': [args.ndcg_weight] * max_K,
            'ndcg_train': ndcg_train,
            'ndcg_test': ndcg_test
        }
        resdf = pd.DataFrame(dd)
        saveas = args.savename + ".results.csv"
        print "== Saving results to %s ==" % saveas
        resdf.to_csv(saveas)
# Tiny SVR smoke test on a 1-D linear dataset.  The earlier X/y assignments
# are dead — both are overwritten below before first use.
y = range(10)  # np.random.randn(n_samples)
#X = np.random.randn(n_samples, n_features)
#y = [
#    [1, 38],
#    [2, 59],
#    [3, 14],
#]
y = [1, 0, 0, 1, 1, 1, 1, 0, 0, 0]
X = [
    [1, 24],
    [3, 48],
    [3, 63],
    [1, 12],
    [1, 27],
    [1, 31],
    [1, 18],
    [3, 50],
    [3, 73],
    [3, 82],
]
y = [i for i in range(1, 10)]
X = [[i] for i in range(1, 10)]
# print() call form works on Python 2 and 3 (originals were py2-only
# print statements; output is unchanged for single arguments).
print(y)
print(X)
clf = SVR(kernel='linear')  #, C=1.0, epsilon=0.2)
print(clf.fit(X, y))
print(clf.predict([[i] for i in range(10)]))
print(clf.set_params(kernel='rbf'))
# Ridge scatter plot for BRAAK A, then a linear-SVR sweep over C with the
# test-set MSE plotted on a log axis.  Notebook-style cells.
pyplot.scatter(y_norm_test, pred_ridgeA_best, color='blue')
plt.xlabel('Test Values')
plt.ylabel('Predicted Values')
plt.title('Ridge Regression Scatter Plot Test Set (BRAAK A)')
pyplot.show()

#Linear Support Vector Regression for BRAAK12
from sklearn.svm import SVR

C = np.logspace(start=-5, stop=0, num=50)  #make this a smaller range
#epsilon = [2,4]
print(C)
# Bare expression — a notebook cell echo; has no effect in a script.
C
svr_lin_A = SVR(kernel='linear')  #try rbf
MSE = []
# Refit at each C and record the test-set MSE.
for a in C:
    svr_lin_A.set_params(C=a)
    svr_lin_A.fit(X_norm_train, y_norm_train)
    pred_lin_A = svr_lin_A.predict(X_norm_test)
    MSEtemp = mean_squared_error(y_norm_test, pred_lin_A)
    MSE.append(MSEtemp)
# NOTE(review): plt.gca() creates a figure that is immediately orphaned by
# the plt.subplots() call on the next line.
ax = plt.gca()
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(C, MSE)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('Regularisation Parameter (C)')
plt.ylabel('Mean Squared Error (MSE)')
plt.title('Linear Support Vector Regression Test Set (BRAAK A)')
plt.show()
# RBF SVR on the white-wine dataset: columns 0-10 are features, column 11 is
# the target.  Baseline fit at gamma=0.5, then an iterative gamma search.
clf = SVR(kernel='rbf', epsilon=epsilon, C=3)
unchanged = 0
droplist = []
# Candidate gammas: powers of two from 2^-15 to 2^2.
gamma_search = [2**x for x in range(-15, 3)]
wdcopy = whitedat.copy()
# Fixed random_state so the 67/33 split is reproducible.
train_set = wdcopy.sample(frac=0.67, random_state=0)
test_set = wdcopy.drop(train_set.index)
X_train = train_set[train_set.columns[0:11]]
y_train = train_set[train_set.columns[11]]
X_test = test_set[test_set.columns[0:11]]
y_test = test_set[test_set.columns[11]]
clf.set_params(gamma=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
L2 = mean_squared_error(y_test, y_pred)
print(L2)
# NOTE(review): this loop body is truncated at the chunk boundary — the
# gamma-selection / termination logic continues beyond this view.
while True:
    # First, search for optimal gamma
    wdcopy = whitedat.copy()
    train_set = wdcopy.sample(frac=0.67, random_state=0)
    test_set = wdcopy.drop(train_set.index)
    X_train = train_set[train_set.columns[0:11]]
    y_train = train_set[train_set.columns[11]]
# Ridge scatter plot for MCI BRAAK56, then an RBF-SVR sweep over C with the
# test-set MSE plotted on a log axis.  Notebook-style cells.
pyplot.scatter(y_tmp_test_ridge, yhat_tmp_ridge, color='red')
plt.xlabel('Test Values (MCI BRAAK56)')
plt.ylabel('Predicted Values (MCI BRAAK56)')
plt.title(
    'Ridge Regression Scatter Plot MCI BRAAK 56 Test Correlation Dataset')
pyplot.show()

#RBF Support Vector Machine for MCI BRAAK 12
#C = np.logspace(start = -5, stop = 0, num = 70 ) #this is a good value for MCI BRAAK56 but not for MCI 12
C = np.logspace(start=-3, stop=5, num=40)
svr_mci_rbf = SVR(kernel='rbf', gamma='scale')
MSE = []
# Refit at each C and record the test-set MSE.
for a in C:
    svr_mci_rbf.set_params(C=a)
    svr_mci_rbf.fit(X_norm_train, y_norm_mci_train)
    pred_mci_rbf = svr_mci_rbf.predict(X_norm_test)
    MSEtemp = mean_squared_error(y_norm_mci_test, pred_mci_rbf)
    MSE.append(MSEtemp)
ax = plt.gca()
ax.plot(C, MSE)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('Regularisation Parameter C')
plt.ylabel('Mean Squared Error')
plt.title('RBF Support Vector Regression MCI Braak 56')
plt.show()

#grid search for RBF SVR
# Sweep the (C, epsilon) cartesian product and visualize |R^2| on the
# training data as bubble sizes.  Scores on training data, so this measures
# fit, not generalization.
model = SVR()
clist = [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e10, 1e15]
epsilonlist = [0, .5, 1, 5, 10, 20, 30, 40, 50]
# cartesian() yields every (C, epsilon) pair.
hplist = cartesian((clist, epsilonlist))
#hp_dict = {'C': paramlist1, 'epsilon': paramlist2}
#hp_list = list(ParameterGrid(hp_dict))

# loop over each set model hyperparameters
scores = []
for hp0 in hplist:
    model.set_params(C=hp0[0], epsilon=hp0[1])
    model.fit(X, y)
    score0 = abs(model.score(X, y))
    scores.append(score0)
    print(hp0[0], hp0[1], score0 * 200)
    # Bubble size proportional to the |R^2| score.
    plt.scatter(hp0[0], hp0[1], s=score0 * 200, c='b', alpha=0.3)
plot_setup(scales=['log', 'linear'], labels=['C', 'Epsilon'])
plt.plot()
'''
for hp in hp_list:
    print(hp)
    model.set_params(**hp)
    model.fit(X, y)
    print(model.score(X, y))
# Tune gamma for an SVR via a validation curve, then refit on a resampled
# subset and evaluate RMSLE batch-wise (the full dataset doesn't fit at once).
regressor = SVR(verbose=10)
gamma_range = np.logspace(-5, 2, 10)
# Pass param_name/param_range by keyword: positional strings were deprecated
# and later removed from validation_curve in scikit-learn.
train_scores, valid_scores = validation_curve(regressor,
                                              X_train,
                                              y_train,
                                              param_name="gamma",
                                              param_range=gamma_range,
                                              n_jobs=-1,
                                              scoring=rmsle_scorer)
# Average each gamma's fold scores into a single validation score.
valid_scores = [np.mean(s) for s in valid_scores]

# Take the gamma giving the highest validation score, and test it on test set
best_gamma = gamma_range[np.nanargmax(valid_scores)]
print("best gamma:", best_gamma)
regressor.set_params(gamma=best_gamma)
X_train, y_train = resample(X, y, n_samples=20000)
regressor.fit(X_train, y_train)
print("test")

# Since we can't load the whole dataset, do batch testing
batch_size = 5000
X_test, y_test = resample(X, y, n_samples=100000)
y_pred = np.ndarray((0, ))
for i in range(0, X_test.shape[0], batch_size):
    print(i)
    y_pred = np.hstack((y_pred, regressor.predict(X_test[i:i + batch_size])))
print("RMSLE =", root_mean_squared_log_error(y_test, y_pred))
class svReg(customRegressor):
    # SVR over imputed/encoded housing features.  The target is
    # log(SalePrice), standardized by pipeline_y; features go through a
    # ColumnTransformer + RobustScaler pipeline (pipeline_X).

    def __init__(self, in_df, zoning, utilities, frontage, qualPow):
        super(svReg, self).__init__()
        from lm_features import impute_shell
        ## Because we're currying in python now
        # impute_shell returns a configured imputation callable.
        self._imputeVals = impute_shell(frontage=frontage,
                                        zoning=zoning,
                                        utilities=utilities,
                                        qualPow=qualPow)
        tempDF = self._imputeVals(in_df.copy())
        self.X = tempDF.drop(columns=["SalePrice"]).copy()
        # Log-transform the target; kept 2-D for StandardScaler.
        self.y = np.log(tempDF.SalePrice.values.reshape(-1, 1))
        self.pipeline_X = self._make_pipe()
        self.pipeline_X.fit(self.X)
        self.pipeline_y = StandardScaler()
        self.pipeline_y.fit(self.y)

    def _rmOutliers(self, x, y):
        # NOTE(review): self.y is log-transformed in __init__, so (y > 4000)
        # can never hold and this mask is always all-False — no rows are
        # actually dropped.  Also only x is filtered while y is not, which
        # would misalign them if the mask ever fired.  Confirm intent.
        outliers = ((y > 4000) & (y < 5E5))
        out = x[~(outliers)]
        return out

    def _make_pipe(self):
        # Route each feature family to the appropriate imputation/encoding,
        # then robust-scale the assembled matrix.
        import svr_features as f
        nonePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value="None"),
            OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            PowerTransformer())
        regressionPipeline = ColumnTransformer(
            [("setNone", nonePipeline, f.fillNone),
             ("setZero", zeroPipeline, f.fillZeroCat),
             ("transformed", scalePipeline, f.fillZeroCont),
             ("dictImputed",
              make_pipeline(
                  self.dictImputer(f.imputeDict),
                  OneHotEncoder(drop="first")), list(f.imputeDict.keys())),
             ("bool", "passthrough", f.imputeBool),
             ("categoricalInts", "passthrough", f.cat_to_int),
             ("dropped", "drop", f.dropList)],
            remainder="drop")
        return make_pipeline(regressionPipeline, RobustScaler())

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        # Exhaustive hyper-parameter search on the transformed data; the
        # fitted search object is kept on self._gridSearchObject.
        self._searchSpace = params
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)
        self._gridSearchObject = GridSearchCV(SVR(),
                                              params,
                                              cv=cv,
                                              scoring="neg_mean_squared_error",
                                              n_jobs=njobs,
                                              verbose=verbose)
        self._gridSearchObject.fit(piped_X, piped_y)

    def fitModel(self, params):
        # Fit the final SVR with the chosen hyper-parameters.
        self.model = SVR()
        self._params = params
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)
        self.model.set_params(**params)
        self.model.fit(piped_X, piped_y)

    def getTrainRsquared(self):
        # R^2 of the fitted model on the transformed training data.
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.model.score(piped_X, piped_y)
# Notebook cells: build train/validation matrices (columns 6+ are features,
# column 3 is the target), fit one SVR per hyper-parameter set, then average
# the per-model predictions into a single ensemble estimate.

# In[23]:

X_train = train_data.iloc[:, 6:].values
y_train = train_data.iloc[:, 3].values
X_valid = valid_data.iloc[:, 6:].values
y_valid = valid_data.iloc[:, 3].values

# In[24]:

predictions = []
for param in model_params:
    # set_params returns the estimator itself, so the calls chain.
    model = SVR().set_params(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    predictions.append(y_pred)

# In[25]:

# Column-wise mean across models: one averaged prediction per sample.
y_hat = [sum(column) / len(column) for column in np.array(predictions).T]

# In[26]:
def fun_svm_fs(x, *args):
    """Optimizer objective for SVR hyper-parameter + feature selection.

    Parameters
    ----------
    x : sequence — positions 0..5 encode kernel index, degree, gamma, coef0,
        C and epsilon; any further entries are per-feature on/off scores
        (> 0.5 keeps the feature).
    *args : (X, y, flag, n_splits, random_seed); flag == 'eval' returns only
        the RMSE, otherwise a result dict with the fitted estimator.
    """
    X, y, flag, n_splits, random_seed = args
    clf = SVR(kernel='rbf', )
    n_samples, n_var = X.shape
    # Kernel decoded from the rounded first entry of x.
    kernel = {
        2: 'linear',
        3: 'poly',
        0: 'rbf',
        1: 'sigmoid',
        4: 'laplacian',
        5: 'chi2'
    }
    #p={'C':x[0], 'kernel':kernel[int(round(x[2]))], 'gamma':x[1]}
    p = {
        'kernel': kernel[int(round(x[0]))],
        'degree': int(round(x[1])),
        # Negative encoded gamma means "use sklearn's 'scale' heuristic".
        'gamma': 'scale' if x[2] < 0 else x[2],
        'coef0': x[3],
        'C': x[4],
        'epsilon': x[5],
        'max_iter': 4000,
    }
    clf.set_params(**p)
    n_param = len(p)
    if len(x) <= n_param:
        # No feature-selection genes: keep every feature.
        ft = np.array([1 for i in range(n_var)])
        ft = np.where(ft > 0.5)
    else:
        # NOTE(review): feature genes are read from x[2:], overlapping the
        # hyper-parameter positions above — confirm the intended offset.
        ft = np.array([1 if k > 0.5 else 0 for k in x[2::]])
        ft = np.where(ft > 0.5)
    n_splits = n_splits
    try:
        cv = KFold(n_splits=n_splits, shuffle=True,
                   random_state=int(random_seed))
        y_p = cross_val_predict(clf, X, y, cv=cv, n_jobs=1)
        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        r4 = -r2_score(y_p, y)
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # still propagate; failed configurations get a huge penalty.
        # NOTE(review): r2/r3/r4 stay undefined on this path, so a non-'eval'
        # flag after a failure raises NameError below — confirm intent.
        y_p = [None]
        r = 1e12
    if flag == 'eval':
        return r
    else:
        # Refit on the selected feature subset and return full diagnostics.
        clf.fit(X[:, ft].squeeze(), y)
        return {
            'Y_TRUE': y,
            'Y_PRED': y_p,
            'EST_PARAMS': p,
            'PARAMS': x,
            'EST_NAME': 'SVM',
            'ESTIMATOR': clf,
            'ACTIVE_VAR': ft,
            'DATA': X,
            'SEED': random_seed,
            'ERROR_TRAIN': {
                'RMSE': r,
                'MAPE': r2,
                'RRMSE': r3,
                'R2_SCORE': r4
            }
        }
def predict_COVID_part2(train_df, train_labels_df, test_feature):
    # Daily-case forecaster: three regime-specific SVRs (increasing /
    # decreasing / constant windows of the case curve) whose predictions
    # feed a fourth, combining SVR; makePrediction() produces the final
    # output from all four models.
    # NOTE(review): DataFrame.drop(['col'], 1) with a positional axis is
    # removed in pandas >= 2.0; df2 is also a slice of train_df, so the
    # column assignments below trigger chained-assignment warnings.
    numberOFDaysToStartFrom = 50
    df2 = train_df[['dailly_cases']]
    casesCol = 'dailly_cases'
    casesList = []
    pastCase = 16
    # Build 16 lag columns 'dailly_cases-1' .. 'dailly_cases-16', filled
    # from day numberOFDaysToStartFrom onward.
    for index in range(1, pastCase + 1):
        newColName = casesCol + '-' + str(index)
        casesList.append(newColName)
        df2[newColName] = np.nan
        for rowInd in range(numberOFDaysToStartFrom, len(df2)):
            df2.loc[rowInd, newColName] = int(df2.loc[rowInd - index,
                                                      casesCol])
    # Hand-chosen day windows for the three regimes.
    dataFrameIncreasing = df2[50:80]
    dataFrameDecreasing = df2[81:137]
    dataFrameConstant = df2[138:]
    svrModelIncreasing = SVR()
    svrModelDecreasing = SVR()
    svrModelConstant = SVR()
    # All three regime models share identical hyper-parameters.
    svrModelIncreasing.set_params(
        **{
            'kernel': 'rbf',
            'degree': 1,
            'C': 9500,
            'gamma': 'scale',
            'coef0': 0.0,
            'tol': 0.001,
            'epsilon': 110
        })
    svrModelDecreasing.set_params(
        **{
            'kernel': 'rbf',
            'degree': 1,
            'C': 9500,
            'gamma': 'scale',
            'coef0': 0.0,
            'tol': 0.001,
            'epsilon': 110
        })
    svrModelConstant.set_params(
        **{
            'kernel': 'rbf',
            'degree': 1,
            'C': 9500,
            'gamma': 'scale',
            'coef0': 0.0,
            'tol': 0.001,
            'epsilon': 110
        })
    # Per-regime training matrices: lag features only, labels from the
    # matching window of train_labels_df.
    xTrainIncreasing = dataFrameIncreasing.drop(['dailly_cases'], 1)
    yTrainIncreasing = train_labels_df.iloc[50:80]
    yTrainIncreasing = yTrainIncreasing.drop(['day'], 1)
    xTrainDecreasing = dataFrameDecreasing.drop(['dailly_cases'], 1)
    yTrainDecreasing = train_labels_df.iloc[81:137]
    yTrainDecreasing = yTrainDecreasing.drop(['day'], 1)
    xTrainConstant = dataFrameConstant.drop(['dailly_cases'], 1)
    yTrainConstant = train_labels_df.iloc[138:]
    yTrainConstant = yTrainConstant.drop(['day'], 1)
    svrModelIncreasing.fit(xTrainIncreasing, yTrainIncreasing)
    svrModelDecreasing.fit(xTrainDecreasing, yTrainDecreasing)
    svrModelConstant.fit(xTrainConstant, yTrainConstant)
    # Every regime model predicts over the whole usable range; those three
    # prediction streams become the features of the combining model.
    testingForSeperateModels = df2.drop(['dailly_cases'], 1)
    testingForSeperateModels = testingForSeperateModels[
        numberOFDaysToStartFrom:]
    increasingModelPrediction = svrModelIncreasing.predict(
        testingForSeperateModels)
    decreasingModelPrediction = svrModelDecreasing.predict(
        testingForSeperateModels)
    constantModelPrediction = svrModelConstant.predict(
        testingForSeperateModels)
    combinedData = []
    # Floor each prediction to whole case counts before stacking.
    for index in range(len(increasingModelPrediction)):
        newArray = []
        newArray.append(math.floor(increasingModelPrediction[index]))
        newArray.append(math.floor(decreasingModelPrediction[index]))
        newArray.append(math.floor(constantModelPrediction[index]))
        combinedData.append(newArray)
    xTrainCombinedModel = pd.DataFrame(combinedData,
                                       columns=[
                                           'increasingModelPrediction',
                                           'decreasingModelPrediction',
                                           'constantModelPrediction'
                                       ])
    yTrainCombinedModel = train_labels_df.iloc[numberOFDaysToStartFrom:]
    yTrainCombinedModel = yTrainCombinedModel.drop(['day'], 1)
    svrModelCombined = SVR()
    # Same hyper-parameters again for the combining model.
    svrModelCombined.set_params(
        **{
            'kernel': 'rbf',
            'degree': 1,
            'C': 9500,
            'gamma': 'scale',
            'coef0': 0.0,
            'tol': 0.001,
            'epsilon': 110
        })
    svrModelCombined.fit(xTrainCombinedModel, yTrainCombinedModel)
    dataColumns = casesList
    finalPrediction = makePrediction(svrModelIncreasing, svrModelDecreasing,
                                     svrModelConstant, svrModelCombined,
                                     dataColumns, test_feature)
    return finalPrediction