def linearSVR(data):
    X = data.drop(["id", "date", "price", "long", "lat", "zipcode",
                   "yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10,
                                           random_state=42)
    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)
    print("r2-score for LinearSVR: %f" % r2_score(y_test, y_predict))
def linearSVR(X, c_param, norm=2):
    if norm == 2:
        XX = normalizeL2(X)
    else:
        XX = X  # fall back to the raw features so XX is always defined
    T = X.shape[0]  # temporal length
    # epsilon is "-p" in C's liblinear and tol is "-e"
    clf = LinearSVR(C=c_param, dual=False, loss='squared_epsilon_insensitive',
                    epsilon=0.1, tol=0.001, verbose=False)
    clf.fit(XX, np.linspace(1, T, T))
    return clf.coef_
def train(self, trainSet):
    pntNum = trainSet.meanShape.shape[0]
    treeNum = int(self.maxTreeNum / pntNum)

    ### Train the random forests
    begTime = time.time()
    for i in range(pntNum):
        rf = RandForest(treeDepth=self.treeDepth,
                        treeNum=treeNum,
                        feaNum=self.feaNum,
                        radius=self.radius,
                        binNum=self.binNum,
                        feaRange=self.feaRange)
        rf.train(trainSet, i)
        self.rfs.append(rf)
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tRandom Forest : %f mins" % elapse)

    ### Extract the local binary features
    begTime = time.time()
    feas = self.genFeaOnTrainset(trainSet)
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tExtract LBFs : %f mins" % elapse)

    ### Global regression
    begTime = time.time()
    y = trainSet.residuals
    y = y.reshape(y.shape[0], y.shape[1] * y.shape[2])
    for i in range(pntNum * 2):
        ### TODO Show the training result
        reg = LinearSVR(epsilon=0.0,
                        C=1.0 / feas.shape[0],
                        loss='squared_epsilon_insensitive',
                        fit_intercept=True)
        reg.fit(feas, y[:, i])
        self.regs.append(reg)
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tGlobal Regression : %f mins" % elapse)

    ### Update the initshapes
    begTime = time.time()
    for i in range(pntNum):
        regX = self.regs[2 * i]
        regY = self.regs[2 * i + 1]
        x = regX.predict(feas)
        y = regY.predict(feas)
        delta = NP.squeeze(NP.dstack((x, y)))
        delta = Affine.transPntsForwardWithDiffT(delta, trainSet.ms2reals)
        delta = NP.multiply(delta, trainSet.bndBoxs[:, [2, 3]])
        trainSet.initShapes[:, i, :] = trainSet.initShapes[:, i, :] + delta
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tUpdate Shape : %f mins" % elapse)
def main(train_file, model_file):
    train_x, train_y = load_trainingData(train_file)
    #LR = LinearRegression(normalize=True)
    #LR = Ridge(alpha=0.5)
    #LR = SVR(C=1.0, epsilon=0.2, verbose=True)
    LR = LinearSVR(verbose=1, epsilon=0.1)
    logging("training model...")
    starttime = datetime.now()
    LR.fit(train_x, train_y)
    logging("training model, elapsed time:%s" % str(datetime.now() - starttime))
    logging("saving model")
    joblib.dump(LR, model_file)
def GlobalRegression(self, lbf, shape_residual):
    m = K
    n, f = lbf.shape
    # prepare linear regression X, Y
    X = lbf
    Y = shape_residual / img_o_width
    # parallel: each target coordinate is fit independently
    for i in range(landmark_n * 2):
        reg = LinearSVR(epsilon=0.0, C=1.0 / n,
                        loss='squared_epsilon_insensitive',
                        fit_intercept=True)
        reg.fit(X, Y[:, i])
        self.w[i] = reg.coef_
    self.w = self.w * img_o_width
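# The "# parallel" note above marks independent iterations: each of the
# landmark_n * 2 target coordinates gets its own LinearSVR. A minimal sketch
# of parallelizing that loop with joblib; X, Y, n, and landmark_n are assumed
# to be the variables defined above, and fit_one is our own helper name.
from joblib import Parallel, delayed
from sklearn.svm import LinearSVR

def fit_one(X, y_col, n):
    # one independent fit per target coordinate
    reg = LinearSVR(epsilon=0.0, C=1.0 / n,
                    loss='squared_epsilon_insensitive', fit_intercept=True)
    reg.fit(X, y_col)
    return reg.coef_

# coefs[i] corresponds to self.w[i] in the loop above
coefs = Parallel(n_jobs=-1)(
    delayed(fit_one)(X, Y[:, i], n) for i in range(landmark_n * 2))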
class SVRR(object):

    def __init__(self, C):
        self.regression = LinearSVR(C=C)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.regression.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.regression.predict(xs)
        return ys
def globalRegress(self, posSet, negSet):
    self.feaDim = self.getFeaDim()

    ### Extract the local binary features
    begTime = time.time()
    posFeas = self.genFeaOnTrainset(posSet)
    negFeas = self.genFeaOnTrainset(negSet)
    t = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tExtract LBFs : %f mins" % t)

    ### Global regression
    begTime = time.time()
    y = posSet.residuals
    y = y.reshape(y.shape[0], y.shape[1] * y.shape[2])
    for i in range(posSet.pntNum * 2):
        ### TODO Show the training result
        reg = LinearSVR(epsilon=0.0,
                        C=1.0 / posFeas.shape[0],
                        loss='squared_epsilon_insensitive',
                        fit_intercept=True)
        reg.fit(posFeas, y[:, i])
        self.globalReg.append(reg)
    t = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tGlobal Regression : %f mins" % t)

    ### Update the initshapes
    begTime = time.time()
    for i in range(posSet.pntNum):
        regX = self.globalReg[2 * i]
        regY = self.globalReg[2 * i + 1]

        x = regX.predict(posFeas)
        y = regY.predict(posFeas)
        delta = NP.squeeze(NP.dstack((x, y)))
        delta = NP.multiply(delta, posSet.winSize)
        posSet.initShapes[:, i, :] = posSet.initShapes[:, i, :] + delta

        x = regX.predict(negFeas)
        y = regY.predict(negFeas)
        delta = NP.squeeze(NP.dstack((x, y)))
        delta = NP.multiply(delta, negSet.winSize)
        negSet.initShapes[:, i, :] = negSet.initShapes[:, i, :] + delta
    t = getTimeByStamp(begTime, time.time(), 'min')
    self.applyPntOffsetIntoTree()
    print("\t\tUpdate Shape : %f mins" % t)
def meta_model_fit(X_train, y_train, svm_hardness, fit_intercept,
                   number_of_threads, regressor_type="LinearSVR"):
    """
    Trains a meta-labeler for predicting the number of labels for each user.

    Based on: Tang, L., Rajan, S., & Narayanan, V. K. (2009, April).
    Large scale multi-label classification via metalabeler. In Proceedings
    of the 18th international conference on World Wide Web (pp. 211-220). ACM.
    """
    if regressor_type == "LinearSVR":
        # liblinear recommends solving the primal problem (dual=False) when
        # n_samples > n_features, and the dual problem otherwise
        if X_train.shape[0] > X_train.shape[1]:
            dual = False
        else:
            dual = True
        model = LinearSVR(C=svm_hardness, random_state=0, dual=dual,
                          fit_intercept=fit_intercept)

        y_train_meta = y_train.sum(axis=1)

        model.fit(X_train, y_train_meta)
    else:
        print("Invalid regressor type.")
        raise RuntimeError
    return model
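# Once the meta-labeler is trained, its predicted label count is typically
# used to cut off a ranked list of per-label scores. A minimal sketch under
# our own assumptions: label_scores is an (n_samples, n_labels) score matrix
# produced elsewhere, and top_k_labels is a hypothetical helper.
import numpy as np

def top_k_labels(label_scores, meta_model, X):
    # predict how many labels each row should get, then keep that many
    # highest-scoring labels per row
    counts = np.rint(meta_model.predict(X)).astype(int)
    counts = np.clip(counts, 1, label_scores.shape[1])
    ranked = np.argsort(-label_scores, axis=1)
    return [ranked[i, :counts[i]] for i in range(label_scores.shape[0])]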
def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Constructs a linear support vector regression model from input dataframes.

    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    # a fixed integer random_state makes the fit reproducible
    clf = LinearSVR(random_state=1, dual=False, epsilon=0,
                    loss='squared_epsilon_insensitive')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/svm_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
    return
class LinearSVRPermuteCoef:
    # Wraps LinearSVR and records the extreme coefficients of every fit in
    # the module-level coeffs_state dict (assumed to be defined elsewhere).

    def __init__(self, **kwargs):
        self.model = LinearSVR(**kwargs)

    def fit(self, X, y):
        self.model.fit(X, y)
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_

        def add_coef(arr, fn):
            arr.append(fn(self.coef_))

        add_coef(coeffs_state['max'], np.max)
        add_coef(coeffs_state['min'], np.min)
        return self

    def get_params(self, deep=True):
        return self.model.get_params(deep)

    def set_params(self, **kwargs):
        self.model.set_params(**kwargs)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y, sample_weight=None):
        if sample_weight is not None:
            return self.model.score(X, y, sample_weight)
        else:
            return self.model.score(X, y)

    @staticmethod
    def permute_min_coefs():
        return coeffs_state['min']

    @staticmethod
    def permute_max_coefs():
        return coeffs_state['max']

    @staticmethod
    def reset_perm_coefs():
        coeffs_state['min'] = []
        coeffs_state['max'] = []
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = LinearSVR(C=25.0, dual=False, epsilon=0.001,
                              loss="squared_epsilon_insensitive", tol=1e-05)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
gamma, C = hyperparams[i]
plt.title(r"$\gamma = {}, C = {}$".format(gamma, C), fontsize=16)

#save_fig("moons_rbf_svc_plot")
plt.show()

#%% SVR Regression
np.random.seed(42)
m = 50
X = 2 * np.random.rand(m, 1)
y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(X, y)

svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)
svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)

def find_support_vectors(svm_reg, X, y):
    # LinearSVR does not expose support_ the way SVR does, so recover the
    # samples lying outside the epsilon-insensitive tube
    y_pred = svm_reg.predict(X)
    off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
    return np.argwhere(off_margin)

svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
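# The same helper can mark the second model's off-tube samples; this extra
# line is our own, mirroring the call above.
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)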
target_col = col[2]
features = col[3:len(col)]

X = data[features].values
y = data[target_col].values
y = np.log1p(y)
y = np.reshape(y, (-1, 1))

###############################################################################
# Model configuration
base = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=LinearSVR(C=0.01, dual=True, epsilon=0.001,
                                          loss="epsilon_insensitive",
                                          tol=0.1)),
    MaxAbsScaler(),
    StackingEstimator(estimator=RidgeCV()),
    Normalizer(norm="l2"),
    StackingEstimator(estimator=LinearSVR(C=0.5, dual=False, epsilon=0.1,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.1)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False,
                                                    max_features=0.4,
                                                    min_samples_leaf=2,
                                                    min_samples_split=4,
                                                    n_estimators=100)),
    MinMaxScaler(),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=LinearSVR(C=5.0, dual=True, epsilon=0.1,
                                          loss="epsilon_insensitive",
                                          tol=0.0001)),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=SGDRegressor()),
    RobustScaler(),
    StackingEstimator(estimator=LinearSVR(C=15.0, dual=True, epsilon=0.01,
                                          loss="epsilon_insensitive",
                                          tol=0.1)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.75, tol=0.001)),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.1, max_depth=1,
                                             min_child_weight=6,
                                             n_estimators=100, nthread=1,
                                             objective="reg:squarederror",
                                             subsample=0.6500000000000001)),
    MinMaxScaler(),
    combined = np.append(X, np.matrix(Y).T, axis=1)
    np.random.shuffle(combined)
    tail_size = -1 * size
    last_column = X.shape[1]
    training_labels = combined[:tail_size, last_column]
    training_data = combined[:tail_size, :-2]
    test_data = combined[tail_size:, :-2]
    actual_labels = combined[tail_size:, last_column]
    return (training_data, np.ravel(training_labels),
            test_data, np.ravel(actual_labels))

training = open('author_features')
NO_TRAINING_SAMPLES = 6000
NO_OF_AUTHORS = 10000
matrix = dok_matrix((NO_TRAINING_SAMPLES, NO_OF_AUTHORS), dtype=int)
for line in training.readlines():
    values = line.rstrip().split()
    matrix[int(values[0]), int(values[1])] = 1

labels_file = open('year_training_labels')
labels = [int(x) for x in labels_file.readline().rstrip().split()]
training_matrix = matrix[:4498]
training_data, training_labels, test_data, actual_labels = sample(
    training_matrix, labels)

classifier = LinearSVR()
classifier.fit(training_data, training_labels)
output = classifier.predict(test_data)
for index, predicted in enumerate(output):
    print('%s %s' % (predicted, actual_labels[index]))
print(metrics.explained_variance_score(actual_labels, output))
with open("C:/Users/sean/Desktop/SVR_DATA/edwademd.csv","rb") as data_file: data,target = [],[] for row in csv.reader(data_file): data += [[row[0],row[4],row[6],row[10]]] target += [row[9]] data,target = Lin_clean_data(data[1:],target[1:],2) point = 2000 X_train = data[:point-1] X_test = data[point:point+int(point*0.2)] y_train = target[:point-1] y_test = target[point:point+int(point*0.2)] svr = LinearSVR(C=0.1) svr_model = svr.fit(X_train,y_train) lin = svr.predict(X_train) lin_test = svr.predict(X_test) lin,lin_test = data_normalize(y_train,y_test,lin,lin_test) print("Train score : ",score(y_train,lin)) print("Train average error : ",sum(abs(y_train-lin)) / float(len(y_train))) print("Fit score : ",score(y_test,lin_test)) print("Fit average error : ",sum(abs(y_test-lin_test)) / float(len(y_test))) figure1 = plt.figure(1,figsize=[20,10]) draw_pic(range(len(X_train)),range(len(X_test)),lin,lin_test,y_train,y_test,label='lin',figure=figure1) figure1.savefig("C:/Users/sean/Desktop/SVR_DATA/linSVR.png",dpi=300,format="png")
def linearSVR(train, trainLabel, testData):
    clf = LinearSVR()
    clf.fit(train, trainLabel)
    predict = clf.predict(testData)
    return predict
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
import proc_temperature as pt

lr = LinearRegression(fit_intercept=False)
rf = RandomForestRegressor()
svr = LinearSVR()

import Train.split_by_season as ss

# Energy in bottle
tot_en = 2212  # in MJ, 45kg * 1.856 * 26
base_cons = 4.5  # 9 MJ burner burning for half an hour every day
heater_max_config = 15  # MJ, can be 25 as well
heater_duration = 4  # hours
heater_energy_cons = heater_max_config * heater_duration
hot_water_max_config = 125  # MJ/hour for 16 L/min hot water; 199 MJ/hour for 26 L/min
hot_water_duration = 5  # minutes
hw_energy_cons = hot_water_max_config * hot_water_duration / 60
max_cons = hw_energy_cons + heater_energy_cons + base_cons
# temperature below which heater and hot water start pushing consumption up the slope
alpha = 12
# temperature at which max_cons is being consumed at the house
beta = 6
gamma = (max_cons - base_cons) / (alpha - beta)

class MLE:
    def __init__(self, binsize, cols):
        self.binsize = binsize
import numpy as np
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    LinearSVR(C=0.0001, dual=False, epsilon=0.1,
              loss="squared_epsilon_insensitive", tol=0.0001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def dict_method_reg():
    dict_method = {}
    # 1st part

    """1 SVR"""
    me1 = SVR(kernel='rbf', gamma='auto', degree=3, tol=1e-3, epsilon=0.1,
              shrinking=False, max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{'C': [1, 0.75, 0.5, 0.25, 0.1],
                    'epsilon': [0.01, 0.001, 0.0001]}]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})

    """2 BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                        copy_X=True, fit_intercept=True, lambda_1=1e-06,
                        lambda_2=1e-06, n_iter=300, normalize=False,
                        tol=0.01, verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{'alpha_1': [1e-07, 1e-06, 1e-05],
                    'alpha_2': [1e-07, 1e-05, 1e-03]}]
    dict_method.update({'BayR-set': [me2, cv2, scoring2, param_grid2]})

    """3 SGDR L2"""
    me3 = SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
                       fit_intercept=True, l1_ratio=0.15,
                       learning_rate='invscaling', loss='squared_loss',
                       max_iter=1000, penalty='l2', power_t=0.25,
                       random_state=0, shuffle=True, tol=0.01, verbose=0,
                       warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05]}]
    dict_method.update({'SGDRL2-set': [me3, cv3, scoring3, param_grid3]})

    """4 KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform',
                                        algorithm='auto', leaf_size=30, p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{'n_neighbors': [3, 4, 5, 6]}]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})

    """5 KernelRidge"""
    kernel = 1.0 * RBF(1.0)
    me5 = kernel_ridge.KernelRidge(alpha=1, kernel=kernel, gamma="scale",
                                   degree=3, coef0=1, kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001]}]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})

    """6 GPR"""
    # kernel = 1.0 * RBF(1.0)
    kernel = Matern(length_scale=0.1, nu=0.5)
    me6 = gaussian_process.GaussianProcessRegressor(
        kernel=kernel, alpha=1e-10, optimizer='fmin_l_bfgs_b',
        n_restarts_optimizer=10, normalize_y=False, copy_X_train=True,
        random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'alpha': [1e-11, 1e-10, 1e-9, 1e-8, 1e-7]}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part

    """7 RFR"""
    me7 = ensemble.RandomForestRegressor(
        n_estimators=100, max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0,
        max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, bootstrap=True, oob_score=False,
        random_state=None, verbose=0, warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})

    """8 GBR"""
    me8 = ensemble.GradientBoostingRegressor(
        loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
        criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0.,
        min_impurity_split=None, init=None, random_state=None,
        max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
        warm_start=False, presort='auto')
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    """9 AdaBR"""
    dt = DecisionTreeRegressor(criterion="mae", splitter="best",
                               max_features=None, max_depth=3,
                               min_samples_split=2)
    me9 = AdaBoostRegressor(dt, n_estimators=100, learning_rate=1,
                            loss='square', random_state=0)
    cv9 = 5
    scoring9 = 'r2'
    param_grid9 = [{'n_estimators': [50, 120, 100, 200]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})

    """10 TreeR"""
    me10 = DecisionTreeRegressor(
        criterion='mse', splitter='best', max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=0,
        max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, presort=False)
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{'max_depth': [3, 4, 5, 6],
                     'min_samples_split': [2, 3, 4]}]
    dict_method.update({'TreeC-em': [me10, cv10, scoring10, param_grid10]})

    """11 ElasticNet"""
    me11 = ElasticNet(alpha=1.0, l1_ratio=0.7, fit_intercept=True,
                      normalize=False, precompute=False, max_iter=1000,
                      copy_X=True, tol=0.0001, warm_start=False,
                      positive=False, random_state=None)
    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                     'l1_ratio': [0.3, 0.5, 0.8]}]
    dict_method.update({"ElasticNet-L1": [me11, cv11, scoring11, param_grid11]})

    """12 Lasso"""
    me12 = Lasso(alpha=1.0, fit_intercept=True, normalize=False,
                 precompute=False, copy_X=True, max_iter=1000, tol=0.001,
                 warm_start=False, positive=False, random_state=None)
    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [{'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05,
                               0.1, 0.5, 1, 10, 100, 1000]}]
    dict_method.update({"Lasso-L1": [me12, cv12, scoring12, param_grid12]})

    """13 SGDR L1"""
    me13 = SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
                        fit_intercept=True, l1_ratio=0.15,
                        learning_rate='invscaling', loss='squared_loss',
                        max_iter=1000, penalty='l1', power_t=0.25,
                        random_state=0, shuffle=True, tol=0.01, verbose=0,
                        warm_start=False)
    cv13 = 5
    scoring13 = 'r2'
    param_grid13 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001,
                               1e-5, 1e-6, 1e-7],
                     "epsilon": [0.1, 0.2, 1]}]
    dict_method.update({'SGDR-L1': [me13, cv13, scoring13, param_grid13]})

    """14 LinearSVR"""
    me14 = LinearSVR(epsilon=0.0, tol=1e-4, C=1.0,
                     loss='epsilon_insensitive', fit_intercept=True,
                     intercept_scaling=1., dual=True, verbose=0,
                     random_state=3, max_iter=1000)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{'C': [10, 6, 5, 3, 2.5, 1, 0.75, 0.5, 0.25, 0.1],
                     'epsilon': [0.0, 0.1]}]
    dict_method.update({"LinearSVR-set": [me14, cv14, scoring14, param_grid14]})

    return dict_method
st = preprocessing.MinMaxScaler()
x = st.fit_transform(x)

method = ["SVR-set", "AdaBR-em", 'GBR-em', "LinearSVR-set"]
result = score_muti(x, y, me="reg", paras=True, method_name=method,
                    shrink=1, str_name=False, param_grid=None)

from sklearn.model_selection import cross_val_predict

pre_y = cross_val_predict(
    SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.001,
        gamma='auto', kernel='rbf', max_iter=2000, shrinking=False,
        tol=0.001, verbose=False),
    x, y,
) - cut

lin = LinearSVR(C=10, dual=True, epsilon=0.0, fit_intercept=True,
                intercept_scaling=1.0, loss='epsilon_insensitive',
                max_iter=1000, random_state=3, tol=0.0001, verbose=0)
lin.fit(x, y)
pre_y2 = lin.predict(x) - cut

print(result[0])
print(result[1][-1].coef_)
print(result[1][-1].intercept_)

coef = lin.coef_
inter = lin.intercept_
data_max_ = st.data_max_
data_min_ = st.data_min_
data_range = st.data_range_
# with data with a very low signal-to-noise ratio, as one would expect from
# financial data. It is also a very fast algorithm (liblinear is heavily
# optimized).
# We will do a grid search with 5-fold GroupKFold cross-validation. As
# mentioned earlier, because returns are not independent of Feature_7, we
# have to group our cross-validation in order to avoid data leakage and
# hence overestimation of the CV performance*.
#
# Ideally, we should optimize using a loss function suitable for optimizing
# Weighted Mean Absolute Error (which is non-differentiable at 0). We did
# not prioritize this, and we still got reasonable results in the model
# scoring.
#
# (*) See: https://stats.stackexchange.com/questions/95797/how-to-split-the-dataset-for-cross-validation-learning-curve-and-final-evaluat
# and http://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf

# %%
print('Building model...')

# Define initial model
model = LinearSVR(epsilon=0.0, C=0.0005,
                  loss='squared_epsilon_insensitive',
                  random_state=0)  # 1727.860

# Define model pipeline for multi-output regression
multi_out_reg = MultiOutputRegressor(model)
model_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor_X), ('multioutreg', multi_out_reg)])
estimator = TransformedTargetRegressor(regressor=model_pipeline,
                                       transformer=preprocessor_Y)

def WA(a, axis, weight):
    # Adapted from function_base.py
    a = np.asanyarray(a)
    wgt = np.asanyarray(weight)
    wgt = np.broadcast_to(wgt, (a.ndim - 1) * (1,) + wgt.shape)
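# The grouped grid search described in the comments above could look like
# this minimal sketch. The grouping rationale (Feature_7) comes from the
# text; the param_grid values and the X, y, groups variables are our own
# illustrative assumptions, and plain MAE stands in for the weighted version.
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.svm import LinearSVR

# one group per distinct Feature_7 value, so correlated rows never end up
# on both sides of a train/validation split
cv = GroupKFold(n_splits=5)
param_grid = {'C': [1e-4, 5e-4, 1e-3, 1e-2]}  # illustrative values

search = GridSearchCV(
    LinearSVR(epsilon=0.0, loss='squared_epsilon_insensitive',
              random_state=0),
    param_grid, cv=cv, scoring='neg_mean_absolute_error')
search.fit(X, y, groups=groups)
print(search.best_params_)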
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR, LinearSVC
# sklearn.cross_validation has been removed; use model_selection instead
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Because of the previous lessons, these examples skip normalization and
# result analysis. First, compare LinearSVR with LinearRegression on the
# same dataset.
data = pd.read_csv("./Folds5x2_pp.csv", header=0, encoding="gbk")
X = data[['AT', 'V', 'AP', 'RH']]
y = data['PE']  # a 1-D Series avoids a DataConversionWarning when fitting

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=10)  # split into training and test sets

svr_Linear = LinearSVR(random_state=0)
svr_Linear.fit(X_train, y_train)
print("SVR_score:", svr_Linear.score(X_train, y_train))

liner = LinearRegression()
liner.fit(X_train, y_train)
print("Linearmodel_score:", liner.score(X_train, y_train))
# In this example, LinearRegression turns out to fit better.

# Next, compare LinearSVC with LogisticRegression on the same dataset.
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine_dataset = pd.read_csv(URL, header=None)
wine_dataset.columns = [
    'class label', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9',
    'F10', 'F11', 'F12', 'F13'
]
X, y = wine_dataset.iloc[:, 1:].values, wine_dataset.iloc[:, 0].values
ax.set_xlim([2000, 2020.5])
ax.set_ylim([-7500, 60000])
ax.set_xlabel("Year")
ax.set_ylabel("Photos")
fig.savefig(path + "images/svrprediction_cluster" + str(c) + ".jpg")

###########################################################
### PREDICTING VALUES WITH LINEAR SUPPORT VECTOR REGRESSION
###########################################################
# http://scikit-learn.org/stable/auto_examples/plot_kernel_ridge_regression.html

linsvr = LinearSVR(epsilon=0.0, tol=1e-4, C=1.0, loss='epsilon_insensitive')
param_grid = {'epsilon': [0.0, 0.1, 0.2, 0.3, 0.4],
              'C': [1, 10, 100, 1000],
              'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive']}
linsvr_grid = GridSearchCV(linsvr, param_grid, cv=6, n_jobs=-1)
y_linsvr = linsvr_grid.fit(explanatory_df, response_series)
linbest_estimator = linsvr_grid.best_estimator_

print("Best epsilon: %s" % linbest_estimator.epsilon)
print("Best C: %s" % linbest_estimator.C)
print("Best Loss Function: %s" % linbest_estimator.loss)
print("R-squared: %f" % linsvr_grid.score(explanatory_df, response_series))

# Create dataframe of number of points in each cluster per year
def QuickML_Ensembling(X_train, y_train, X_test, y_test='',
                       modeltype='Regression', Boosting_Flag=False,
                       scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set
    (numerics only).
    """
    start_time = time.time()
    seed = 99
    FOLDS = 5
    model_dict = {}
    model_tuples = []
    if len(X_train) <= 100000 and X_train.shape[1] < 50:
        NUMS = 100
    else:
        try:
            X_train = X_train.sample(frac=0.30, random_state=99)
            y_train = y_train[X_train.index]
        except:
            pass
        NUMS = 200
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        #scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        # shuffle must be True for random_state to take effect in KFold
        scv = KFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostRegressor(
                base_estimator=DecisionTreeRegressor(
                    random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        else:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        if Boosting_Flag is None:
            model6 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Bagging_Regressor', model6))
        elif not Boosting_Flag:
            model6 = LinearSVR()
            model_tuples.append(('Linear_SVR', model6))
        else:
            model6 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        model7 = KNeighborsRegressor(n_neighbors=8)
        model_tuples.append(('KNN_Regressor', model7))
        if Boosting_Flag is None:
            #### If Boosting_Flag is True, a Boosting model is present,
            ### so choose a different kind of regressor here.
            model8 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model8))
        elif not Boosting_Flag:
            #### If Boosting_Flag is True, a Boosting model is present,
            ### so choose a different kind of regressor here.
            model8 = AdaBoostRegressor(
                base_estimator=DecisionTreeRegressor(
                    random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model8))
        else:
            model8 = RandomForestRegressor(bootstrap=False, max_depth=10,
                                           max_features='auto',
                                           min_samples_leaf=2,
                                           n_estimators=200,
                                           random_state=99)
            model_tuples.append(('RF_Regressor', model8))
    else:
        if scoring == '':
            scoring = 'accuracy'
        num_classes = len(np.unique(y_test))
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(
                    random_state=seed, max_depth=1, min_samples_leaf=2),
                n_estimators=NUMS, random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LinearDiscriminantAnalysis()
            model_tuples.append(('Linear_Discriminant', model5))
        else:
            model5 = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10, 100],
                                          solver='liblinear',
                                          random_state=seed)
            model_tuples.append(('Logistic_Regression_CV', model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        elif not Boosting_Flag:
            model6 = LinearSVC()
            model_tuples.append(('Linear_SVC', model6))
        else:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        if modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        model_tuples.append(('Naive_Bayes', model7))
        if Boosting_Flag is None:
            #### If Boosting_Flag is True, a Boosting model is present,
            ### so choose a different kind of classifier here.
            model8 = RandomForestClassifier(bootstrap=False, max_depth=10,
                                            max_features='auto',
                                            min_samples_leaf=2,
                                            n_estimators=200,
                                            random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
        elif not Boosting_Flag:
            #### If Boosting_Flag is True, a Boosting model is present,
            ### so choose a different kind of classifier here.
            sgd_best_model = SGDClassifier(alpha=1e-06, loss='log',
                                           max_iter=1000, penalty='l2',
                                           learning_rate='constant',
                                           eta0=.1, random_state=3,
                                           tol=None)
            model8 = OneVsRestClassifier(sgd_best_model)
            model_tuples.append(('One_vs_Rest_Classifier', model8))
        else:
            model8 = RandomForestClassifier(bootstrap=False, max_depth=10,
                                            max_features='auto',
                                            min_samples_leaf=2,
                                            n_estimators=200,
                                            random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
    model_dict = dict(model_tuples)
    models, results = run_ensemble_models(model_dict, X_train, y_train,
                                          X_test, y_test, scoring, modeltype)
    return models, results
print "\n--------------------------------------------" print "----------- Fold %d -----------------------" %i print "--------------------------------------------" val_id = fold_ids.ix[:, i].dropna() idx = train["Id"].isin(list(val_id)) trainingSet = train[~idx] validationSet = train[idx] tr_X = np.matrix(trainingSet[feature_names]) tr_Y = np.array(trainingSet["Response"]) val_X = np.matrix(validationSet[feature_names]) val_Y = np.array(validationSet["Response"]) regm = LinearSVR(C = 0.06, epsilon = 0.45, tol = 1e-5, dual = True, verbose = True, random_state = 133) regm.fit(tr_X, tr_Y) preds = regm.predict(val_X) df = pd.DataFrame(dict({"Id" : validationSet["Id"], "ground_truth" : validationSet["Response"], "linsvr_preds" : preds})) linsvr_val = linsvr_val.append(df, ignore_index = True) tpreds = regm.predict(test_X) cname = "Fold" + `i` linsvr_test[cname] = tpreds linsvr_val.to_csv("ensemble2/linsvr_val.csv") linsvr_test.to_csv("ensemble2/linsvr_test.csv")
X_2020 = weightedRunStats2020[[
    'weightWRC', 'weightPA', 'weightH', 'weightAB', 'weightRBI', 'weightG',
    'weight2B'
]].values
y_pred = regressor.predict(X_2020)

# In[241]:

y_pred_list = []  # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])
weightedRunStats2020['runsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
#                                                     random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2020)
weightedRunStats2020['linearSVRRuns'] = y_pred

regr = SVR()
regr.fit(X, Y)
y_pred = regr.predict(X_2020)
weightedRunStats2020['svrRuns'] = y_pred

weightedRunStats2020 = weightedRunStats2020.sort_values(by=['linearSVRRuns'],
                                                        ascending=False)
print(weightedRunStats2020)
def trainModel(self, Model="default"):
    if Model == "default":
        self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',
                                 dual=False, tol=1e-3)
    else:
        self.mlModel = Model
    self.mlModel.fit(self.X_train, self.y_train)
                                              max_depth=1,
                                              min_child_weight=3,
                                              n_estimators=100, n_jobs=1,
                                              objective="reg:squarederror",
                                              subsample=0.9500000000000001,
                                              verbosity=0)),
    MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(C=25.0, dual=True, epsilon=0.01,
                                          loss="epsilon_insensitive",
                                          tol=0.0001)),
    FeatureAgglomeration(affinity="l2", linkage="average"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=LinearSVR(C=20.0, dual=True, epsilon=0.1,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.1)),
    RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
col_dict = defaultdict(list)
tran_dict = {}
for idx, feature_name in enumerate(feature_names):
    # get the part before the equals sign, if there is one
    short_name = re.findall('[^=]*', feature_name)[0]
    col_dict[short_name].append(idx)
    pidx = use_cols.index(short_name)
    if predictors[pidx].norm_type in transformer_map:
        tran_dict[use_cols[pidx]] = transformer_map[predictors[pidx].norm_type]
        X[:, idx] = tran_dict[use_cols[pidx]].fit_transform(
            X[:, idx].reshape(-1, 1)).squeeze()
dict_vect.tran_dict = tran_dict

#%% COMPILE LIST OF MODELS TO COMPARE
Lin_est = Ridge()
svr_est = LinearSVR(epsilon=0)

max_depth = 16
min_samples_leaf = 50
min_samples_split = 100
n_trees = 100  # 100
RF_est = RandomForestRegressor(n_estimators=n_trees, max_depth=max_depth,
                               min_samples_leaf=min_samples_leaf,
                               min_samples_split=min_samples_split,
                               n_jobs=-1)
GBR_est = GradientBoostingRegressor(learning_rate=0.1, n_estimators=n_trees,
                                    min_samples_leaf=min_samples_leaf,
                                    min_samples_split=min_samples_split,
                                    max_depth=2)

#%% Run CV grid search if desired.
                        sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.5348482165317705
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(FunctionTransformer(copy), FunctionTransformer(copy))),
    SelectPercentile(score_func=f_regression, percentile=89),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=SGDRegressor(alpha=0.0, eta0=0.01,
                                             fit_intercept=True,
                                             l1_ratio=1.0,
                                             learning_rate="constant",
                                             loss="squared_loss",
                                             penalty="elasticnet",
                                             power_t=50.0)),
    MaxAbsScaler(),
    MaxAbsScaler(),
    LinearSVR(C=0.5, dual=True, epsilon=1.0, loss="epsilon_insensitive",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
cat_vars = ['DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
            'StoreType', 'Assortment', 'CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
            'Promo2SinceYear', 'PromoInterval', 'Day', 'Month', 'Year']
num_vars = ['Open', 'Store', 'CompetitionDistance', 'ratio1', 'ratio2']

X_trn, X_val = train_test_split(train, test_size=0.012, random_state=10)
print('Training Stage 1 Models')

# train svm
svm1 = LinearSVR(verbose=True)
svm1.fit(X_trn[cat_vars + num_vars], X_trn['Sales'])
svm1_feature = svm1.predict(train[cat_vars + num_vars])
preds = svm1.predict(X_val[cat_vars + num_vars])
print('svm ', (np.mean(((np.exp(preds) - np.exp(X_val['Sales'])) /
                        (np.exp(X_val['Sales']) + 1)) ** 2)) ** 0.5)

# train xgb
dtrain = xgb.DMatrix(X_trn[cat_vars + num_vars], X_trn['Sales'])
dvalid = xgb.DMatrix(X_val[cat_vars + num_vars], X_val['Sales'])
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
num_boost_round = 50
params1 = {"objective": "reg:linear", "booster": "gbtree", "eta": 0.5,
           "max_depth": 2, "subsample": 0.5, "colsample_bytree": 0.4,
           "nthread": 4, "silent": 1, "seed": 1301}
def SVMRClassifier(training_data_X, training_data_y, vocab, word_vocab,
                   svmr_type):
    if svmr_type == 'linearsvr':
        pos_vectors = CountVectorizer(vocabulary=vocab, analyzer='word',
                                      ngram_range=(1, 5),
                                      tokenizer=lambda x: x.split(' '),
                                      lowercase=False)
        text_vectors = CountVectorizer(analyzer='word', ngram_range=(1, 1),
                                       tokenizer=lambda x: x.split(' '),
                                       lowercase=True)
        classifier = LinearSVR(max_iter=100000)
        parameters = [{
            'C': [0.1, 1, 10],
            'epsilon': [0, 0.1, 1],
            'loss': ('epsilon_insensitive', 'squared_epsilon_insensitive')
        }]
        fclassifier = GridSearchCV(classifier, parameters, cv=5, n_jobs=7)

        feature_cat_1 = FeatureUnion([
            ('POS', Pipeline([
                ('selector', ItemSelector(key='POS')),
                ('vectorizer', pos_vectors),
                ('tf', TfidfTransformer(norm='l2', use_idf=True)),
            ])),
            ('text', Pipeline([
                ('selector', ItemSelector(key='text_norm')),
                ('vectorizer', text_vectors),
                ('tf', TfidfTransformer(norm='l2', use_idf=True)),
            ])),
            ('gf', Pipeline([
                ('selector', ItemSelector(key='gf')),
                ('toarray', FunctionTransformer(returnNumpyMatrix,
                                                validate=False)),
                ('tf', TfidfTransformer(norm='l2', use_idf=True)),
            ])),
            ('fa', Pipeline([
                ('selector', ItemSelector(key='fa')),
                ('toarray', FunctionTransformer(returnNumpyMatrix,
                                                validate=False)),
                ('tf', TfidfTransformer(norm='l2', use_idf=True)),
            ])),
            ('diag_act', Pipeline([
                ('selector', ItemSelector(key='diag_act')),
                ('toarray', FunctionTransformer(returnNumpyMatrix,
                                                validate=False)),
                ('tf', TfidfTransformer(norm='l2', use_idf=True)),
            ])),
        ])

        feature_cat_2 = FeatureUnion([
            ('word_count', Pipeline([
                ('selector', ItemSelector(key='word_count')),
                ('toarray', FunctionTransformer(returnNumpyArray,
                                                validate=False)),
            ])),
            ('f_measure', Pipeline([
                ('selector', ItemSelector(key='f_measure')),
                ('toarray', FunctionTransformer(returnNumpyArray,
                                                validate=False)),
            ])),
        ])

        text_clf = Pipeline([
            ('features', FeatureUnion([
                ('pipeline', Pipeline([
                    ('features', feature_cat_1),
                ])),
                ('pipeline2', Pipeline([
                    ('features', feature_cat_2),
                    ('scaler', MinMaxScaler()),
                ])),
            ])),
            ('clf', fclassifier),
        ])

        text_clf.fit(training_data_X, training_data_y)
    return text_clf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

def StandardLinearSVR(C=10.0, epsilon=0.01):
    return Pipeline([
        ("std_scaler", StandardScaler()),
        ("linearSVR", LinearSVR(C=C, epsilon=epsilon))
    ])
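# A quick usage sketch of the factory above; the toy X, y data are our own.
import numpy as np

X = np.random.rand(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(100)

reg = StandardLinearSVR(C=1.0, epsilon=0.1)
reg.fit(X, y)
print(reg.score(X, y))  # R^2 on the training data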
y = notas['linguagem_codigo']

# Training the model:
# select some rows to "teach" the model and others to "test" its quality
x_treino, x_teste, y_treino, y_teste = train_test_split(
    x, y, random_state=326784)  # split the sample into training and test rows
# random_state is another way to fix the choice of random terms;
# seeding globally is less reliable, since one method may call another that
# uses random without following that seed
print(f'Training data (x and y): x = {x_treino.shape} and y = {y_treino.shape}')
#print(x_treino.shape)
print(f'Test data (x and y): x = {x_teste.shape} and y = {y_teste.shape}')

# Create the artificial intelligence (AI) model
print('Building and training the AI model')
a = time.process_time()
# modelo = SVR()  # non-linear model (very heavy)

print('Model - LinearSVR')
modelo_svrl = LinearSVR(max_iter=1000)  # support vector machine (SVM)
modelo_svrl = modelo_svrl.fit(x_treino, y_treino)  # .fit - train (learn the rules, or try to)
predicoes_svrl = modelo_svrl.predict(x_teste)  # .predict - values estimated by the AI
# plot.figure(figsize=(10, 10))
# sns.scatterplot(x=y_teste, y=(predicoes_svrl - y_teste))  # plot the gap between predicted and actual
# plot.show()
qualidade_svrl = mean_squared_error(y_teste, predicoes_svrl)
del modelo_svrl, predicoes_svrl
print(f'Elapsed time: {time.process_time() - a} s')

print('Model - SVR')  # very heavy
a = time.process_time()
modelo_svr = SVR()
modelo_svr = modelo_svr.fit(x_treino, y_treino)
predicoes_svr = modelo_svr.predict(x_teste)
        print('Classifier {}, fold {}, run {}'.format(
            classifier_name, fold, run_delay))
        print('Probably caused by a locked thread')
        return 'Timeout_{}_{}_{}'.format(classifier_name, fold, run_delay)
    except Exception as e:
        print('Exception {} occurred during threading of:'.format(e))
        print('Classifier {}, fold {}, run {}'.format(classifier_name, fold,
                                                      run_delay))
        return 'Caught exception: {}, Classifier: {}, fold: {}, run: {}'.format(
            e, classifier_name, fold, run_delay)


if __name__ == "__main__":
    # the classifiers
    classifiers = [
        LinearSVR(),
        #RandomForestRegressor(),
    ]
    # their parameters, should be in the correct order
    classifiers_params = [
        {'C': 0.25},
        #{'n_estimators': 100},
    ]
    # number of genes selected per run
    n_genes = 250
    # number of folds per classifier
    n_folds = 2500
    # number of threads
build_housing(
    AdaBoostRegressor(DecisionTreeRegressor(random_state=13,
                                            min_samples_leaf=5),
                      random_state=13, n_estimators=17),
    "AdaBoostHousing")
build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors=True)
build_housing(
    MLPRegressor(activation="tanh", hidden_layer_sizes=(26,),
                 solver="lbfgs", random_state=13, tol=0.001, max_iter=1000),
    "MLPHousing")
build_housing(SGDRegressor(random_state=13), "SGDHousing")
build_housing(SVR(), "SVRHousing")
build_housing(LinearSVR(random_state=13), "LinearSVRHousing")
build_housing(NuSVR(), "NuSVRHousing")

#
# Anomaly detection
#

def build_iforest_housing_anomaly(iforest, name, **kwargs):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    customize(iforest, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
print("Data shape after feature creation: ", df_data.shape) print("Columns: ", " ".join(list(df_data.columns))) print('------------------------------------------') if args.regression: y = df_data['mmse'].values else: y = df_data['target'].values X = df_data.drop(['id', 'target'], axis=1) # build classification model if args.regression: lr_1 = LinearRegression(fit_intercept=True) svm_10 = LinearSVR(C=10, fit_intercept=True, random_state=123, max_iter=10000) svm_100 = LinearSVR(C=100, fit_intercept=True, random_state=123, max_iter=10000) xgb = xgb.XGBRegressor(max_depth=10, subsample=0.8, n_estimators=50, colsample_bytree=0.8, learning_rate=1, nthread=8) rfc = RandomForestRegressor(random_state=123, n_estimators=50, max_depth=5) learners = [('xgb', xgb), ('rfc', rfc), ('svm_10', svm_10),
plt.show()

###################################################################### Plot the tree regressor vs. test outputs
plt.figure(figsize=(10, 6))
testTreeTargetHandle, = plt.plot(day, testTargets / 1000000,
                                 label='Target values')
testTreeOutputHandle, = plt.plot(day, treeTestingOutputs / 1000000,
                                 label='Decision tree', linestyle='dotted')
plt.xlabel('Day')
plt.ylabel(r'Incoming Solar Energy [$MJ / m^2$]')
plt.title('Comparison of Decision Tree Test Targets and Outputs')
plt.legend(handles=[testTreeTargetHandle, testTreeOutputHandle])
plt.show()

# INITIALIZE
from sklearn.svm import LinearSVR

svm_clf = LinearSVR(C=0.6, loss='squared_epsilon_insensitive')
svm_clf.fit(scaledTrainingInputs, np.ravel(scaledTrainingTargets))

# PREDICT the training outputs and the test outputs
scaledTrainingOutputs = svm_clf.predict(scaledTrainingInputs)
scaledTestOutputs = svm_clf.predict(scaledTestInputs)
# scalers expect 2-D input, so reshape before inverting the scaling
trainingOutputs = tScaler.inverse_transform(
    scaledTrainingOutputs.reshape(-1, 1)).ravel()
testOutputs = tScaler.inverse_transform(
    scaledTestOutputs.reshape(-1, 1)).ravel()

# Calculate and display training and test root mean square error (RMSE);
# divide by 1e6 for MJ/m^2
trainingsvmRMSE = np.sqrt(np.sum((trainingOutputs - trainingTargets[:, 0]) ** 2)
                          / len(trainingOutputs)) / 1000000
testsvmRMSE = np.sqrt(np.sum((testOutputs - testTargets[:, 0]) ** 2)
                      / len(testOutputs)) / 1000000

#### PLOTTING
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=50, n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)),
    MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(C=25.0, dual=True, epsilon=0.1,
                                          loss="epsilon_insensitive",
                                          tol=0.0001)),
    FeatureAgglomeration(affinity="l2", linkage="average"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False,
                                                    max_features=0.8,
                                                    min_samples_leaf=19,
                                                    min_samples_split=10,
                                                    n_estimators=400)),
    ZeroCount(),
    FeatureAgglomeration(affinity="l2", linkage="complete"),
    StackingEstimator(estimator=RidgeCV()),
    RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import pandas as pd
from sklearn.svm import LinearSVR
import matplotlib.pyplot as plt

inputfile = './datasave/new_reg_data_GM11.csv'  # path of the grey-prediction (GM(1,1)) output
data = pd.read_csv(inputfile)  # read the data
data.index = range(1994, 2016)

feature = ['x1', 'x4', 'x5', 'x6', 'x7', 'x8']
data_train = data.loc[range(1994, 2014)].copy()  # fit on the pre-2014 data
data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean) / data_std  # standardize the data
x_train = data_train[feature].values  # feature data
y_train = data_train['y'].values  # target data

linearsvr = LinearSVR()  # build the LinearSVR model
linearsvr.fit(x_train, y_train)
x = ((data[feature] - data_mean[feature]) /
     data_std[feature]).values  # predict, then undo the standardization
data['y_pred'] = linearsvr.predict(x) * data_std['y'] + data_mean['y']

# where the SVR predictions are saved
outputfile = './datasave/new_reg_data_GM11_revenue.csv'
data.to_csv(outputfile)
print('Actual and predicted values:', data[['y', 'y_pred']])

p = data[['y', 'y_pred']].plot(style=['b-o', 'r-*'])
p.set_ylim(0, 2500)
p.set_xlim(1993, 2016)
plt.show()
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: -17.62015561497372
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=make_pipeline(
            StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.55,
                                                     tol=0.001)),
            LassoLarsCV(normalize=True)
        )),
        FunctionTransformer(copy)
    ),
    LinearSVR(C=5.0, dual=True, epsilon=1.0, loss="epsilon_insensitive",
              tol=0.1)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
test_dataA.drop(['嗜碱细胞%'], axis=1, inplace=True)  # drop the basophil-% column

# fill empty values in each column with the column mean
for i in train_data.columns:
    train_data[i].fillna(train_data[i].mean(), inplace=True)
for i in test_dataA:
    test_dataA[i].fillna(test_dataA[i].mean(), inplace=True)
print(train_data.info())

train_data_y = train_data['血糖']  # '血糖' = blood glucose, the target
train_data.drop(['血糖'], axis=1, inplace=True)
print(test_dataA.info())

# standardize
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data.astype(float))
test_dataA = scaler.transform(test_dataA.astype(float))

# build the model
lin_svr = LinearSVR(random_state=42, max_iter=5000)
lin_svr.fit(train_data, train_data_y)
test_features_labels = lin_svr.predict(test_dataA)

# evaluate the model (test_labels is assumed to be loaded elsewhere)
mse = mean_squared_error(test_labels, test_features_labels)
print(mse)
print(np.sqrt(mse))

# use RandomizedSearchCV
param_distributions = {
    'gamma': reciprocal(0.001, 0.1),  # scipy's reciprocal takes (a, b), not a list
    'C': uniform(1, 10)
}
rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10,
svr_score_test = svr.score(smr_test.feature_matrix, smr_test.labels)
print('SVR precision test: {}'.format(svr_score_test))
# plot_learning_curve(svr, 'SVR Curve', smr_train.feature_matrix,
#                     smr_train.labels, n_jobs=4)
print('')

lsvc = LinearSVC()
print('LinearSVC config:')
print(lsvc.get_params())
lsvc.fit(smr_train.feature_matrix, smr_train.labels)
lsvc_score_train = lsvc.score(smr_train.feature_matrix, smr_train.labels)
print('LinearSVC precision train: {}'.format(lsvc_score_train))
lsvc_score_test = lsvc.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVC precision test: {}'.format(lsvc_score_test))
print('')

lsvr = LinearSVR()
print('LinearSVR config:')
print(lsvr.get_params())  # was svc.get_params(), a copy-paste slip
lsvr.fit(smr_train.feature_matrix, smr_train.labels)
lsvr_score_train = lsvr.score(smr_train.feature_matrix, smr_train.labels)
print('LinearSVR precision train: {}'.format(lsvr_score_train))
lsvr_score_test = lsvr.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVR precision test: {}'.format(lsvr_score_test))
print('')

nusvc = NuSVC()
print('NuSVC config:')
print(nusvc.get_params())
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVC precision train: {}'.format(nusvc_score_train))
    NEIGHBOR = 400  # pick some neighbors to compute the eigenvalues
    randidx = np.random.permutation(data.shape[0])[:SAMPLE]
    knbrs = NearestNeighbors(n_neighbors=NEIGHBOR,
                             algorithm='ball_tree').fit(data)

    sing_vals = []
    for idx in randidx:
        dist, ind = knbrs.kneighbors(data[idx:idx + 1])
        nbrs = data[ind[0, 1:]]
        u, s, v = np.linalg.svd(nbrs - nbrs.mean(axis=0))
        s /= s.max()
        sing_vals.append(s)
    sing_vals = np.array(sing_vals).mean(axis=0)
    return sing_vals

# Train an SVR (sklearn.svm.SVR with its default RBF kernel)
npzfile = np.load('large_data.npz')
X = npzfile['X']
y = npzfile['y']
# we already normalize these values in gen.py
# X /= X.max(axis=0, keepdims=True)
svr = SVR(C=1)
svr.fit(X, y)
joblib.dump(svr, 'model.sav')

# In[ ]:
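# For a truly linear fit, LinearSVR could be swapped in; a minimal sketch of
# our own, reusing the X, y arrays loaded above.
from sklearn.svm import LinearSVR
import joblib

lin_svr = LinearSVR(C=1, max_iter=10000)  # liblinear-based, scales well in n
lin_svr.fit(X, y)
joblib.dump(lin_svr, 'linear_model.sav')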
## Create K folds (sklearn.model_selection API)
k_fold = KFold(n_splits=10)

for train, test in k_fold.split(X_train_reduced):
    X1 = X_train_reduced[train]
    Y1 = Y_train_raw[train]
    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]

    ## Train Classifiers on fold
    rdg_clf = Ridge(alpha=0.5)
    rdg_clf.fit(X1, Y1)
    lso_clf = Lasso(alpha=0.6257)
    lso_clf.fit(X1, Y1)
    svr_clf = LinearSVR(C=1e3)
    svr_clf.fit(X1, Y1)

    ## Score Classifiers on fold
    rdg_clf_score = rdg_clf.score(X2, Y2)
    lso_clf_score = lso_clf.score(X2, Y2)
    svr_clf_score = svr_clf.score(X2, Y2)
    print("Ridge: ", rdg_clf_score)
    print("Lasso: ", lso_clf_score)
    print("LinearSVR: ", svr_clf_score)

## Train final Classifiers
# clf = Ridge(alpha=.5)
clf = LinearSVR(C=1e3)  # LinearSVR has no gamma parameter
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.read_csv("finalenc.csv")
y = df['price']
X = df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1)

regr = make_pipeline(StandardScaler(), LinearSVR(random_state=0, tol=1e-03))
reg = LinearRegression().fit(X_train, y_train)
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

plt.figure()
plt.plot(range(100000))
plt.scatter(y_test, y_pred, alpha=0.4, c='red',
            label='Ground Truth vs Predicted')
plt.savefig('SVR.png')
(x_input, y_input) = get_training_data(feature_lin_lambda=feature_lin_lambda,
                                       feature_lin_var=feature_lin_var,
                                       data_exp=data_exp)

# Normalize the features.
x_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x_input_minmax = x_scaler.fit_transform(x_input)
### Should y be normalized too? There is no clear reason to, but it does
### affect the results!
# y_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
# y_input_minmax = y_scaler.fit_transform(y_input.reshape(-1, 1))
# y_input_minmax = y_input_minmax.reshape((len(y_input_minmax)))

# Choose C by cross-validation.
best_cv_score = -1e+30
for log2c in np.arange(-10, 30, 1):
    clf = LinearSVR(C=2 ** log2c, epsilon=0.0001)
    clf.fit(x_input_minmax, y_input)
    # leave-one-out CV; 'neg_mean_squared_error' replaces the old
    # 'mean_squared_error' scoring name
    cv_score = cross_val_score(cv=sample_num, estimator=clf,
                               X=x_input_minmax, y=y_input,
                               scoring='neg_mean_squared_error').mean()
    print(cv_score)
    if cv_score > best_cv_score:
        best_cv_score = cv_score
        bestc = 2 ** log2c

# Predict with the selected parameter.
clf = LinearSVR(C=bestc, epsilon=0.0001)
clf.fit(x_input_minmax, y_input)
y_pred = clf.predict(x_input_minmax)
# y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1))

view_point = 5
class TextLearner(object):

    def __init__(self, data_path, model_path="./", name=""):
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []  # not only train but general purpose too
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = None

    def addModelDetails(self, model_p, name=""):
        self.name = name
        self.model_path = model_p

    def load_data(self, TrTe=0):  # TrTe => 0-Train 1-Test
        # returns the dimensions of vectors
        with open(self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None

    def process(self, text, default=0):
        if default == 0:
            text = text.strip().lower().encode("utf-8")
        else:
            text = self.F.process(text)
        return text

    def loadXY(self, TrTe=0, feature_index=0, label_index=1):
        # TrTe => 0-Train 1-Test
        if TrTe == 0:
            for i in self.DesignMatrix:
                self.X_train.append(self.process(i[feature_index]))
                self.y_train.append(i[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)
        elif TrTe == 1:
            for i in self.TestMatrix:
                self.X_test.append(self.process(i[feature_index]))
                self.y_test.append(i[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)

    def featurizeXY(self, only_train=1):  # Extracts Features
        sw = ['a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
              'be', 'been', 'being', 'but', 'by', 'can', 'could', 'did',
              'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have',
              'in', 'into', 'is', "isn't", 'it', "it'd", "it'll", "it's",
              'its', 'of', 'on', 'or', 'that', "that's", 'thats', 'the',
              'there', "there's", 'theres', 'these', 'this', 'those', 'to',
              'under', 'until', 'up', 'were', 'will', 'with', 'would']
        self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                          stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            self.X_test = self.vectorizer.transform(self.X_test)

    def reduceDimension(self, only_train=1, percent=50):
        # Reduce dimensions / select the best features
        n_samples, n_features = self.X_train.shape
        k = int(n_features * (percent / 100))
        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        self.feature_names = [self.feature_names[i]
                              for i in self.chi2.get_support(indices=True)]
        self.feature_names = np.asarray(self.feature_names)
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)

    def trainModel(self, Model="default"):
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',
                                     dual=False, tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)

    def testModel(self, approx=1):  # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))
        if approx == 1:
            ### convert real-valued results to binary for scoring
            temp = []
            for y in self.y_pred:
                if y > 0.0:
                    temp.append(1.0)
                else:
                    temp.append(-1.0)
            self.y_pred = temp
        return metrics.accuracy_score(self.y_test, self.y_pred)

    def getReport(self, save=1, get_top_words=0):  # returns report
        report = ""
        if get_top_words == 1:
            if hasattr(self.mlModel, 'coef_'):
                report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
                report += "\nDensity: " + str(density(self.mlModel.coef_))
                rank = np.argsort(self.mlModel.coef_[0])
                top10 = rank[-20:]
                bottom10 = rank[:20]
                report += "\n\nTop 10 keywords: "
                report += "\nPositive: " + (" ".join(self.feature_names[top10]))
                report += "\nNegative: " + (" ".join(self.feature_names[bottom10]))
        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(metrics.classification_report(
            self.y_test, self.y_pred, target_names=["Negative", "Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(metrics.confusion_matrix(
            self.y_test, self.y_pred)) + "\n\n"
        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)
        return report

    def crossVal(self, folds=5, dim_red=50, full_iter=0, save=1):
        # returns report
        # Caution: resets train and test X, y
        # sklearn.cross_validation was removed; this uses the
        # model_selection StratifiedKFold API instead
        skf = StratifiedKFold(n_splits=folds, shuffle=True)
        print(skf)
        master_report = ""
        X_copy = self.X_train
        y_copy = self.y_train
        for train_index, test_index in skf.split(X_copy, y_copy):
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0, dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save=0, get_top_words=0)
            if full_iter == 1:
                continue
            else:
                break
        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)
        return master_report

    def save_obj(self, obj, name):
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, protocol=2)

    def saveModel(self):  # saves in model path
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")

    def plot(self):
        '''beta (just plotting the model; not working)'''
        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min = self.X_train[:, 0].min() - 1
        x_max = self.X_train[:, 0].max() + 1
        y_min = self.X_train[:, 1].min() - 1
        y_max = self.X_train[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max] x [y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')
# Assumed imports; GetPreprocessor (a column-preprocessor factory) and
# BayesianOptEstimator (a Bayesian hyperparameter-search wrapper) are
# project-local helpers, not sklearn classes.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import (LinearRegression, Lasso, Ridge, ElasticNet,
                                  PassiveAggressiveRegressor)
from sklearn.svm import LinearSVR
from sklearn.kernel_approximation import RBFSampler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, AdaBoostRegressor)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


def GetRegModels(nums=None, cats=None, text=None,
                 scoring='neg_mean_squared_error', split_option=None,
                 split_type=None, max_train_time=3600):
    if nums is None and cats is None and text is None:
        return
    if nums is None:
        nums = []
    if cats is None:
        cats = []
    if text is None:
        text = []

    models = []

    """ Numeric preprocessing options """
    if len(nums) > 0:
        params_impute = [['pre__nums__imputer', 'categoric',
                          ['mean', 'median']]]
        params_scale = [['pre__nums__scaler', 'categoric',
                         ['MinMax', 'MaxAbs', 'Standard', 'Robust']]]
    else:
        params_impute = []
        params_scale = []

    """ Linear models """

    # Linear regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', LinearRegression())])
    params = []
    params += params_impute
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Linear Regression'
    models += [mod]

    # Lasso regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', Lasso(copy_X=False))])
    params = [['mod__alpha', 'exponential', [-5, 5]]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Lasso Regression'
    models += [mod]

    # Ridge regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', Ridge(copy_X=False))])
    params = [['mod__alpha', 'exponential', [-5, 5]]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Ridge Regression'
    models += [mod]

    # ElasticNet regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', ElasticNet(copy_X=False))])
    params = [['mod__l1_ratio', 'float', [1e-3, 1 - 1e-3]],
              ['mod__alpha', 'exponential', [-5, 5]]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'ElasticNet Regression'
    models += [mod]

    # # SGD Regression
    # mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
    #                 ('mod', SGDRegressor())])
    # params = [['mod__alpha', 'exponential', [-5, 5]],
    #           ['mod__penalty', 'categoric', ['l1', 'l2', 'elasticnet']],
    #           ['mod__l1_ratio', 'float', [1e-5, 1 - 1e-5]]]
    # params += params_impute + params_scale
    # mod = BayesianOptEstimator(mod, params, scoring, split_type,
    #                            split_option, max_train_time=max_train_time)
    # mod.name = 'SGD Regression'
    # models += [mod]

    # # Lasso LARS
    # mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
    #                 ('mod', LassoLars(max_iter=500))])
    # params = [['mod__alpha', 'exponential', [-5, 5]]]
    # params += params_impute + params_scale
    # mod = BayesianOptEstimator(mod, params, scoring, split_type,
    #                            split_option, max_train_time=max_train_time)
    # mod.name = 'LARS Regression'
    # models += [mod]

    # Passive Aggressive Regressor
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', PassiveAggressiveRegressor())])
    params = [['mod__C', 'exponential', [-5, 5]]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Passive Aggressive Regression'
    models += [mod]

    """ Support Vector Machines """

    # Linear SVM
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', LinearSVR(dual=False,
                                      loss='squared_epsilon_insensitive'))])
    params = [['mod__C', 'exponential', [-5, 5]]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Linear SVM'
    models += [mod]

    # Kernel SVM (RBF kernel approximation feeding a linear SVR)
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('krn', RBFSampler()),
                    ('mod', LinearSVR(dual=False,
                                      loss='squared_epsilon_insensitive'))])
    params = [['krn__gamma', 'exponential', [-10, 10]],
              ['krn__n_components', 'integer', [10, 200]],
              ['mod__C', 'exponential', [-5, 5]]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Kernel SVM'
    models += [mod]

    """ Tree based methods """

    # Decision Tree
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', DecisionTreeRegressor())])
    params = [['mod__criterion', 'categoric', ['mse', 'friedman_mse', 'mae']],
              ['mod__max_depth', 'integer', [1, 15]],
              ['mod__min_samples_split', 'integer', [2, 20]],
              ['mod__min_samples_leaf', 'integer', [1, 20]]]
    params += params_impute
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Decision Tree'
    models += [mod]

    # Random Forest
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', RandomForestRegressor())])
    params = [['mod__criterion', 'categoric', ['mse', 'friedman_mse', 'mae']],
              ['mod__max_depth', 'integer', [1, 15]],
              ['mod__min_samples_split', 'integer', [2, 20]],
              ['mod__min_samples_leaf', 'integer', [1, 20]]]
    params += params_impute
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Random Forest'
    models += [mod]

    # Extremely Random Forest
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', ExtraTreesRegressor())])
    params = [['mod__criterion', 'categoric', ['mse', 'friedman_mse', 'mae']],
              ['mod__max_depth', 'integer', [1, 15]],
              ['mod__min_samples_split', 'integer', [2, 20]],
              ['mod__min_samples_leaf', 'integer', [1, 20]]]
    params += params_impute
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Extra Trees'
    models += [mod]

    # Boosting needs dense data
    if len(text) == 0:
        # Gradient Boosted Trees
        mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                        ('mod', GradientBoostingRegressor())])
        params = [['mod__criterion', 'categoric',
                   ['mse', 'friedman_mse', 'mae']],
                  ['mod__max_depth', 'integer', [1, 5]],
                  ['mod__min_samples_split', 'integer', [2, 20]],
                  ['mod__min_samples_leaf', 'integer', [1, 20]],
                  ['mod__learning_rate', 'exponential', [-5, 0]],
                  ['mod__n_estimators', 'integer', [10, 50]]]
        params += params_impute
        mod = BayesianOptEstimator(mod, params, scoring, split_type,
                                   split_option, max_train_time=max_train_time)
        mod.name = 'Gradient Boosted Trees'
        models += [mod]

        # AdaBoost
        mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                        ('mod', AdaBoostRegressor())])
        params = [['mod__loss', 'categoric',
                   ['linear', 'square', 'exponential']],
                  ['mod__learning_rate', 'exponential', [-5, 5]],
                  ['mod__n_estimators', 'integer', [10, 50]]]
        params += params_impute
        mod = BayesianOptEstimator(mod, params, scoring, split_type,
                                   split_option, max_train_time=max_train_time)
        mod.name = 'AdaBoost'
        models += [mod]

        # XGBoost!
        mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                        ('mod', XGBRegressor())])
        params = [['mod__max_depth', 'integer', [1, 5]],
                  ['mod__learning_rate', 'exponential', [-5, 0]],
                  ['mod__n_estimators', 'integer', [10, 50]]]
        params += params_impute
        mod = BayesianOptEstimator(mod, params, scoring, split_type,
                                   split_option, max_train_time=max_train_time)
        mod.name = 'XGBoost'
        models += [mod]

    """ KNN """

    # KNN
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', KNeighborsRegressor())])
    params = [['mod__n_neighbors', 'integer', [1, 20]],
              ['mod__weights', 'categoric', ['uniform', 'distance']]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'K Nearest Neighbors'
    models += [mod]

    """ Neural Network """

    # NN
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', MLPRegressor(learning_rate_init=0.01))])
    params = [['mod__alpha', 'exponential', [-10, 10]],
              ['mod__hidden_layer_sizes', 'integer', [5, 50]]]
    params += params_impute + params_scale
    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,
                               max_train_time=max_train_time)
    mod.name = 'Neural Network'
    models += [mod]

    return {mod.name: mod for mod in models}
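Because the function returns its models keyed by display name, a caller can pull out and tune a single entry. The sketch below assumes BayesianOptEstimator exposes the usual fit/predict interface, which the snippet itself does not show; the column names and data are placeholders:

# Hypothetical usage: build the model zoo for three numeric columns and
# tune just the linear SVM entry.
import pandas as pd
from sklearn.datasets import make_regression

Xa, ya = make_regression(n_samples=100, n_features=3, random_state=0)
X = pd.DataFrame(Xa, columns=['sqft', 'beds', 'baths'])

models = GetRegModels(nums=['sqft', 'beds', 'baths'])
svm = models['Linear SVM']
svm.fit(X, ya)           # assumes BayesianOptEstimator wraps fit()
preds = svm.predict(X)   # ...and predict()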
def __init__(self, **kwargs):
    self.model = LinearSVR(**kwargs)
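This fragment is only the constructor of a thin wrapper class. A hedged sketch of what the enclosing class plausibly looks like, with fit and predict delegating to the wrapped estimator; the class name and method set are assumptions, not the original source:

# Hypothetical completion of the wrapper above; "SVRWrapper" and the
# delegated methods are assumptions.
from sklearn.svm import LinearSVR

class SVRWrapper(object):
    def __init__(self, **kwargs):
        self.model = LinearSVR(**kwargs)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)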
# Continued from earlier in the script: X_train, Y_height_train and
# Y_thick_train are defined above; numpy is imported as np and LinearSVR
# comes from sklearn.svm.
X_train_rank = np.linalg.matrix_rank(X_train)

#####################################################################
##            PARAMETER ESTIMATION WITH RAW DATA                   ##
#####################################################################

### Parameter estimation with least-squares linear matrix solver
beta_height, residual_height, rank, singu_value = np.linalg.lstsq(
    X_train, Y_height_train, rcond=None)
beta_thick, residual_thick, rank, singu_value = np.linalg.lstsq(
    X_train, Y_thick_train, rcond=None)

### Parameter estimation with SVR
svr_lin = LinearSVR(verbose=True)
svr_lin.fit(X_train, Y_height_train)
beta_height_svr = svr_lin.coef_

#####################################################################
##                   MONTE CARLO SIMULATION                        ##
#####################################################################

### Monte Carlo simulation
mu_vec = np.zeros((4, 1))
sigma_vec = np.ones((4, 1))

### Parameters -> number of samples and error percentage
num_samples_mc = 1000000
error_percentage = 0.2
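The simulation body is cut off here. Purely as an illustration, a coefficient-stability loop built from the parameters above could look like the following; the multiplicative-noise scheme is an assumption, not the original author's method:

# Hypothetical sketch: perturb the design matrix with Gaussian noise scaled
# by error_percentage and watch how the least-squares solution moves.
rng = np.random.default_rng(0)
betas = []
for _ in range(1000):  # far fewer draws than num_samples_mc, for brevity
    noise = error_percentage * rng.standard_normal(X_train.shape)
    X_noisy = X_train * (1.0 + noise)
    b, _, _, _ = np.linalg.lstsq(X_noisy, Y_height_train, rcond=None)
    betas.append(b)
beta_std = np.std(np.asarray(betas), axis=0)  # spread of each coefficient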
# dnn_reg and reg_eval are defined earlier in the script.
md = dnn_reg(X_train, y_train, X_test, y_test)
reg_eval(X_test, y_test, md)


### Lasso CV regression
def reg_eval2(y_test, model):
    # Note: relies on the global X_test defined above.
    y_pred = model.predict(X_test)
    print("evaluating the results for model:", model)
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R2:", r2_score(y_test, y_pred))
    print("EVS:", explained_variance_score(y_test, y_pred))


lasso = LassoCV(cv=5, random_state=0, max_iter=10000)
lasso.fit(X_train, y_train)
reg_eval2(y_test, lasso)

# ElasticNet regression
ela = ElasticNetCV(l1_ratio=0.8, normalize=True, max_iter=5000,
                   random_state=77)
ela.fit(X_train, y_train)
print("R square:", ela.score(X_test, y_test))
reg_eval2(y_test, ela)

# SVR regression
from sklearn.svm import LinearSVR

LSVR = LinearSVR(epsilon=0.1, random_state=0, tol=1e-5, max_iter=10000)
# scaler = RobustScaler()
# pipe = Pipeline(steps=[("scaling", scaler), ("rg", LSVR)])
LSVR.fit(X_train, y_train)
reg_eval2(y_test, LSVR)
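The two commented-out lines hint at the usual caveat that LinearSVR is sensitive to feature scale. Wiring up exactly the pipeline those comments describe (step names taken from the comments themselves):

# Sketch of the scaled variant hinted at in the comments above.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import LinearSVR

pipe = Pipeline(steps=[("scaling", RobustScaler()),
                       ("rg", LinearSVR(epsilon=0.1, random_state=0,
                                        tol=1e-5, max_iter=10000))])
pipe.fit(X_train, y_train)
reg_eval2(y_test, pipe)  # Pipeline exposes predict(), so reg_eval2 works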
import numpy as np
import pandas as pd
from src.model_validation import ModelValidation
from personal.chris_farr.mixed_stepwise_selection import MixedStepSelect
from src.data.data_ks_filtered import DataKSFiltered
from random import randint
from sklearn.svm import LinearSVR

# Setup
data_class = DataKSFiltered()
x_train, x_test, y_train, y_scaler = data_class.load_data()
validation = ModelValidation()
predictions_file = "personal/chris_farr/predictions.csv"
features_file = "personal/chris_farr/features.csv"

model = LinearSVR(random_state=0)

# TODO Run mixed select
for _ in range(100):
    starting_features = randint(5, int(len(x_train.columns) / 2))
    ms = MixedStepSelect(corr_threshold=.99, data_class=data_class,
                         n_start_feats=starting_features)
    ms.model = model
    ms.run(1000)
    in_features = ms.in_features
    # TODO Store results in CSV
    # TODO randomize attributes with each run

# Final scoring and storage
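One hedged way to act on the "Store results in CSV" TODO, reusing the features_file path defined above; the column layout is a hypothetical choice, not the project's actual schema:

# Hypothetical helper for the "Store results in CSV" TODO; column names
# are placeholders.
def store_features(run_id, in_features,
                   path="personal/chris_farr/features.csv"):
    rows = pd.DataFrame({"run": run_id, "feature": list(in_features)})
    rows.to_csv(path, mode="a", header=False, index=False)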
from sklearn.ensemble import VotingClassifier, VotingRegressor

# Assumed companion imports for this test excerpt.
import pytest
from sklearn.datasets import make_classification, make_regression
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              StackingClassifier, StackingRegressor)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC, LinearSVR


@pytest.mark.parametrize(
    "X, y, estimator",
    [(*make_classification(n_samples=10),
      StackingClassifier(estimators=[('lr', LogisticRegression()),
                                     ('svm', LinearSVC()),
                                     ('rf', RandomForestClassifier())])),
     (*make_classification(n_samples=10),
      VotingClassifier(estimators=[('lr', LogisticRegression()),
                                   ('svm', LinearSVC()),
                                   ('rf', RandomForestClassifier())])),
     (*make_regression(n_samples=10),
      StackingRegressor(estimators=[('lr', LinearRegression()),
                                    ('svm', LinearSVR()),
                                    ('rf', RandomForestRegressor())])),
     (*make_regression(n_samples=10),
      VotingRegressor(estimators=[('lr', LinearRegression()),
                                  ('svm', LinearSVR()),
                                  ('rf', RandomForestRegressor())]))],
    ids=['stacking-classifier', 'voting-classifier',
         'stacking-regressor', 'voting-regressor']
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, `named_estimators_` is consistent across all
    # ensemble classes and when using `set_params()`.

    # before fit
    assert 'svm' in estimator.named_estimators
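Outside the test harness, the same heterogeneous-ensemble pattern reads like this as a standalone example:

# Standalone illustration of the ensemble pattern exercised by the test.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR

X, y = make_regression(n_samples=100, random_state=0)
vr = VotingRegressor(estimators=[('lr', LinearRegression()),
                                 ('svm', LinearSVR(random_state=0)),
                                 ('rf', RandomForestRegressor(random_state=0))])
vr.fit(X, y)                        # averages the three fitted regressors
print(vr.named_estimators_['svm'])  # fitted sub-estimators exposed by name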
# The commented-out block below plotted a grid of kernel-SVM classifiers
# from earlier in the notebook; it is kept for reference.
# for i, svm_clf in enumerate(svm_clfs):
#     plt.subplot(222 + i)
#     plot_predictions(svm_clf, [-1.5, 2.5, -1, 1.5])
#     plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
#     gamma, C = hyperparams[i]
#     plt.title(r"$\gamma = {}, C={}$".format(gamma, C), fontsize=16)
# plt.show()

# `rnd` is assumed to be numpy's random module, i.e.
# "import numpy.random as rnd".
import numpy as np
import numpy.random as rnd
from sklearn.svm import LinearSVR

rnd.seed(42)
m = 50
X = 2 * rnd.rand(m, 1)
y = (4 + 3 * X + rnd.randn(m, 1)).ravel()

svm_reg1 = LinearSVR(epsilon=1.5)
svm_reg2 = LinearSVR(epsilon=0.5)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)


def find_support_vectors(svm_reg, X, y):
    # A point is "off the margin" (a support vector) when its residual
    # exceeds the width of the epsilon-insensitive tube.
    y_pred = svm_reg.predict(X)
    off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
    return np.argwhere(off_margin)


svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)

eps_x1 = 1
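These pieces are typically combined to visualize the epsilon-insensitive tube; a hedged plotting sketch follows (the styling choices are assumptions, and eps_x1 presumably positions an epsilon annotation in the full source):

# Hedged sketch: plot one regressor's fit, its epsilon tube, and the
# support vectors found above.
import matplotlib.pyplot as plt

x1s = np.linspace(0, 2, 100).reshape(-1, 1)
y_pred = svm_reg1.predict(x1s)
plt.plot(x1s, y_pred, "k-", label=r"$\hat{y}$")
plt.plot(x1s, y_pred + svm_reg1.epsilon, "k--")  # upper tube boundary
plt.plot(x1s, y_pred - svm_reg1.epsilon, "k--")  # lower tube boundary
plt.scatter(X[svm_reg1.support_], y[svm_reg1.support_], s=180,
            facecolors='#FFAAAA')                # highlight support vectors
plt.plot(X, y, "bo")
plt.legend()
plt.show()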