# Look-ahead horizon: 1% of the rows, rounded up.
predict_out = int(math.ceil(0.01 * len(data)))
# The label of a row is the value of predict_col predict_out rows later.
data['label'] = data[predict_col].shift(-predict_out)

#%% splitting the data into features X and labels y
# dropna removes every row with a missing value, which includes the trailing
# rows whose shifted label is NaN.
data.dropna(inplace=True)
# axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
X = np.array(data.drop(['label'], axis=1))
X = preprocessing.scale(X)
y = np.array(data['label'])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)

#%% training and trying different kernels for the SVM model
for k in ['linear', 'poly', 'rbf', 'sigmoid']:
    model = svm.SVR(kernel=k, gamma='scale')
    model.fit(X_train, y_train)
    # score() of a regressor is the R^2 on the held-out split.
    accuracy = model.score(X_test, y_test)
    print(k, accuracy)

#%% training a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print('accuracy ', accuracy)

#%% choosing the SVM linear kernel for best accuracy
model = svm.SVR(kernel='linear', gamma='scale')
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)

def modelSVM(X_train, y_train):
    """Fit a polynomial-kernel SVR on the given training data and return it."""
    # fit() returns the estimator itself, so the fitted model is returned.
    return svm.SVR(kernel="poly").fit(X_train, y_train)

def getData():
    """Answer a map query with listings as GeoJSON plus an optional analysis grid.

    Reads the bounding box (lat1, lng1, lat2, lng2), the canvas size (w, h),
    cell_size and the "analysis" / "heatmap" / "spread" switches from the
    request arguments, queries the RealEstate table inside the bounding box
    and returns a JSON string. Progress messages are pushed onto ``q``.

    * heatmap == "false" and analysis == "false": features only.
    * heatmap == "true": a Gaussian heat map over at most 100 random listings.
    * heatmap == "false" and analysis == "true": an RBF-SVR price
      interpolation trained on at most 80 random listings.

    Returns:
        The JSON-encoded FeatureCollection; when a grid analysis ran, the
        "analysis" key holds one rectangle per grid cell.
    """
    q.put("starting data query...")

    # The bounds stay strings, as before; they are passed unchanged to the
    # query filter and to remap().
    lat1 = str(request.args.get('lat1'))
    lng1 = str(request.args.get('lng1'))
    lat2 = str(request.args.get('lat2'))
    lng2 = str(request.args.get('lng2'))

    w = float(request.args.get('w'))
    h = float(request.args.get('h'))
    cell_size = float(request.args.get('cell_size'))

    analysis = request.args.get('analysis')
    heatmap = request.args.get('heatmap')

    # A missing, empty or non-integer spread falls back to the default.
    try:
        spread = int(request.args.get('spread'))
    except (TypeError, ValueError):
        spread = 12

    #CAPTURE ANY ADDITIONAL ARGUMENTS SENT FROM THE CLIENT HERE

    engine = create_engine(
        'sqlite:////var/www/mywebsite/mywebsite/database/datamining.db')
    Base.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    try:
        records = session.query(RealEstate).filter(
            RealEstate.latitude > lat1, RealEstate.latitude < lat2,
            RealEstate.longitude > lng1, RealEstate.longitude < lng2).all()
    finally:
        # Only already-loaded column values are read below, so the session
        # can be released as soon as the rows are fetched.
        session.close()

    #USE INFORMATION RECEIVED FROM CLIENT TO CONTROL
    #HOW MANY RECORDS ARE CONSIDERED IN THE ANALYSIS
    if heatmap == "true":
        random.shuffle(records)
        records = records[:100]
    if analysis == "true":
        random.shuffle(records)
        records = records[:80]

    numListings = len(records)

    # Price range used to normalize each listing; the seed values keep the
    # original bounds when there are no records.
    prices = [record.price for record in records]
    maxPrice = max([0] + prices)
    minPrice = min([1000000000] + prices)

    output = {"type": "FeatureCollection", "features": []}
    for record in records:
        output["features"].append({
            "type": "Feature",
            "id": record.id,
            "properties": {
                "name": record.title,
                "price": record.price,
                "priceNorm": remap(record.price, minPrice, maxPrice, 0, 1),
            },
            "geometry": {
                "type": "Point",
                "coordinates": [record.latitude, record.longitude],
            },
        })

    if heatmap == "false" and analysis == "false":
        q.put('idle')
        return json.dumps(output)

    output["analysis"] = []

    numW = int(math.floor(w / cell_size))
    numH = int(math.floor(h / cell_size))
    grid = [[0] * numW for _ in range(numH)]

    def _append_grid_cells(values):
        """Normalize the grid and append one rectangle per cell to the output."""
        values = normalizeArray(values)
        offsetLeft = (w - numW * cell_size) / 2.0
        offsetTop = (h - numH * cell_size) / 2.0
        for j in range(numH):
            for i in range(numW):
                output["analysis"].append({
                    'x': offsetLeft + i * cell_size,
                    'y': offsetTop + j * cell_size,
                    'width': cell_size - 1,
                    'height': cell_size - 1,
                    'value': values[j][i],
                })

    #USE CONDITIONAL ALONG WITH UI INFORMATION RECEIVED FROM THE CLIENT TO SWITCH
    #BETWEEN HEAT MAP AND INTERPOLATION ANALYSIS
    if heatmap == "true":
        ## HEAT MAP IMPLEMENTATION
        q.put('starting heatmap analysis...')

        #USE INFORMATION RECEIVED FROM CLIENT TO CONTROL SPREAD OF HEAT MAP
        if not 0 < spread < 20:
            spread = 12
            print("spread = defult value")
        # Float division: integer division would give sigma == 0 (and a
        # division by zero) for spread == 1 on Python 2.
        sigma = spread / 2.0

        for record in records:
            pos_x = int(remap(record.longitude, lng1, lng2, 0, numW))
            pos_y = int(remap(record.latitude, lat1, lat2, numH, 0))
            # Add a Gaussian bump, clipped to the grid, around the listing.
            for j in range(max(0, pos_y - spread), min(numH, pos_y + spread)):
                for i in range(max(0, pos_x - spread),
                               min(numW, pos_x + spread)):
                    distance = point_distance(i, j, pos_x, pos_y)
                    grid[j][i] += 2 * math.exp(
                        -(distance ** 2) / (2 * sigma ** 2))

        _append_grid_cells(grid)

        if analysis == "false":
            q.put('idle')
        if analysis == "true":
            q.put('cannot run both, run as heatmap')
        return json.dumps(output)

    ## MACHINE LEARNING IMPLEMENTATION
    if heatmap == "false" and analysis == "true":
        q.put('starting interpolation analysis...')

        X = np.asarray(
            [[record.latitude, record.longitude] for record in records],
            dtype='float')
        y = np.asarray([record.price for record in records], dtype='float')

        # First 70% of the (already shuffled) listings train the model, the
        # rest validate it.
        split_at = int(numListings * .7)
        X_train, X_val = X[:split_at], X[split_at:]
        y_train, y_val = y[:split_at], y[split_at:]

        # mean 0, variance 1 -- fitted on the training part only
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        # The validation features must go through the same scaling as the
        # training features before being fed to the model.
        X_val_scaled = scaler.transform(X_val)

        search_values = [.01, 1, 100, 10000, 1000000]
        mse_min = float('inf')
        model_best = None
        for C in search_values:
            for e in search_values:
                for g in search_values:
                    q.put("training model: C[" + str(C) + "], e[" + str(e) +
                          "], g[" + str(g) + "]")
                    model = svm.SVR(C=C, epsilon=e, gamma=g, kernel='rbf',
                                    cache_size=2000)
                    model.fit(X_train_scaled, y_train)

                    y_val_p = model.predict(X_val_scaled)
                    mse = float(np.mean((y_val_p - y_val) ** 2))

                    if model_best is None or mse < mse_min:
                        mse_min = mse
                        model_best = model
                        C_best = C
                        e_best = e
                        g_best = g

        q.put("best model: C[" + str(C_best) + "], e[" + str(e_best) +
              "], g[" + str(g_best) + "]")

        # Predict a price for the position of every grid cell.
        for j in range(numH):
            for i in range(numW):
                lat = remap(j, numH, 0, lat1, lat2)
                lng = remap(i, 0, numW, lng1, lng2)
                X_test = np.asarray([[lat, lng]], dtype='float')
                X_test_scaled = scaler.transform(X_test)
                grid[j][i] = float(model_best.predict(X_test_scaled)[0])

        _append_grid_cells(grid)

    q.put('idle')
    return json.dumps(output)

from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Display names, index-aligned with the estimators in `algorithm`.
algorithm_name = [
    'Regression(Lasso)', 'KNN', 'Decision Tree', 'SVM(Linear)', 'AdaBoost',
    'Random Forest'
]
algorithm = [
    linear_model.Lasso(alpha=0.00114115),
    KNeighborsRegressor(n_neighbors=31),
    DecisionTreeRegressor(max_depth=4),
    svm.SVR(kernel='linear', C=0.0918484848484),
    AdaBoostRegressor(n_estimators=10),
    RandomForestRegressor(n_estimators=13, max_depth=4),
]

for i in range(len(algorithm_name)):
    kfold = KFold(n_splits=5, shuffle=False)
    index = kfold.split(X=x, y=y)
    for train_index, val_index in index:
        starttime = datetime.datetime.now()
        algorithm[i].fit(x[train_index], y[train_index])  # train
        y_pred = algorithm[i].predict(x[val_index])  # predict
        # r2_score expects (y_true, y_pred); R^2 is not symmetric, so the
        # ground truth has to come first.
        accuracy1 = r2_score(list(y[val_index]), list(y_pred))
        y_pred = algorithm[i].predict(x_test)  # predict
        accuracy = r2_score(list(y_test), list(y_pred))
        endtime = datetime.datetime.now()
        # Total elapsed microseconds; timedelta.microseconds alone is only
        # the sub-second component and wraps around every second.
        time = int((endtime - starttime).total_seconds() * 1000000)

# The last 10% of the samples form the test split.
_split_at = int(.9 * n_samples)
target_train = target[:_split_at]
data_test = data[_split_at:]
target_test = target[_split_at:]

# Score table: every estimator is fitted on the training split and its
# score() on the test split is printed. Factories are used so that each
# estimator is only looked up and built right before it is evaluated.
print('# Classification scores:')
_candidates = [
    ('KNN', lambda: neighbors.KNeighborsClassifier()),
    ('linear_model.ElasticNet', lambda: linear_model.ElasticNet()),
    ('linear_model.ElasticNetCV', lambda: linear_model.ElasticNetCV()),
    ('linear_model.Lars', lambda: linear_model.Lars()),
    ('linear_model.Lasso', lambda: linear_model.Lasso()),
    ('linear_model.LassoCV', lambda: linear_model.LassoCV()),
    ('linear_model.LassoLars', lambda: linear_model.LassoLars()),
    ('linear_model.LassoLarsIC', lambda: linear_model.LassoLarsIC()),
    ('linear_model.LinearRegression',
     lambda: linear_model.LinearRegression()),
    ('linear_model.LogisticRegression',
     lambda: linear_model.LogisticRegression()),
    ('linear_model.OrthogonalMatchingPursuit',
     lambda: linear_model.OrthogonalMatchingPursuit()),
    ('linear_model.PassiveAggressiveClassifier',
     lambda: linear_model.PassiveAggressiveClassifier()),
    ('linear_model.PassiveAggressiveRegressor',
     lambda: linear_model.PassiveAggressiveRegressor()),
    ('linear_model.Perceptron', lambda: linear_model.Perceptron()),
    ('linear_model.Ridge', lambda: linear_model.Ridge()),
    ('linear_model.RidgeClassifier', lambda: linear_model.RidgeClassifier()),
    ('linear_model.RidgeClassifierCV',
     lambda: linear_model.RidgeClassifierCV()),
    ('linear_model.RidgeCV', lambda: linear_model.RidgeCV()),
    ('linear_model.SGDClassifier', lambda: linear_model.SGDClassifier()),
    ('linear_model.SGDRegressor', lambda: linear_model.SGDRegressor()),
    ('naive_bayes.MultinomialNB', lambda: naive_bayes.MultinomialNB()),
    ('lda.LDA', lambda: lda.LDA()),
    ('svm.SVR', lambda: svm.SVR()),
    ('svm.SVC', lambda: svm.SVC(kernel='linear')),
    ('svm.LinearSVC', lambda: svm.LinearSVC()),
]
for _label, _make in _candidates:
    _fitted = _make().fit(data_train, target_train)
    print((_label + ': %f') % _fitted.score(data_test, target_test))

import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt

# Demo: fit RBF, linear and polynomial SVR models to a noisy sine curve and
# evaluate each of them on a grid that extends past the training range.
if __name__ == "__main__":
    N = 50
    np.random.seed(0)  # fixed seed so the sample is reproducible
    x = np.sort(np.random.uniform(0, 6, N), axis=0)  # uniform() draws random real numbers from the range [x, y].
    y = 2*np.sin(x) + 0.1*np.random.randn(N)
    x = x.reshape(-1, 1)  # one feature column, one row per sample
    print('x =\n', x)
    print('y =\n', y)

    print('SVR - RBF')
    svr_rbf = svm.SVR(kernel='rbf', gamma=0.2, C=100)
    svr_rbf.fit(x, y)
    print('SVR - Linear')
    svr_linear = svm.SVR(kernel='linear', C=100)
    svr_linear.fit(x, y)
    print('SVR - Polynomial')
    svr_poly = svm.SVR(kernel='poly', degree=3, C=100)
    svr_poly.fit(x, y)
    print('Fit OK.')

    # Exercise: change the factor 1.1 to 1.5
    # The grid runs to 1.1 * max(x), i.e. 10% beyond the largest sample.
    x_test = np.linspace(x.min(), 1.1*x.max(), 100).reshape(-1, 1)
    y_rbf = svr_rbf.predict(x_test)
    y_linear = svr_linear.predict(x_test)
    y_poly = svr_poly.predict(x_test)

def svrPredictions(xTrain,yTrain,xTest,k):
    """Fit an SVR (C=2.0) with kernel ``k`` on the training data and return
    its predictions for ``xTest``."""
    regressor = svm.SVR(kernel=k, C=2.0)
    # fit() returns the estimator, so fitting and predicting can be chained.
    return regressor.fit(xTrain, yTrain).predict(xTest)

'epsilon': ( 1e-2, 1e-1, 1e0, 1e1, ), 'coef0': ( 0.0, 0.1, 0.2, ), }] # Exhaustive search over specified parameter values for the estimator # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html est = svm.SVR() gs = GridSearchCV(est, cv=10, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2', refit=True, pre_dispatch='n_jobs', error_score=np.nan, return_train_score=True) t0 = time.time() gs.fit(x_train, y_train) runtime = time.time() - t0 print("Training time: %.6f s" % runtime)

def baggingMySVM(trainX, trainY, train_prediction_start, testX, testY,
                 test_prediction_start, look_ahead, bag_size=47,
                 Nestimators=50, samp_size=0.95, sampleModels=50,
                 kernel='sigmoid'):
    """Train a bagged SVR ensemble and write raw and aggregated predictions to CSV.

    C is tuned with a randomized search over an exponential distribution,
    then ``Nestimators`` SVRs are bagged without bootstrap. Four CSV files
    are written into the working directory (``look_ahead`` is part of the
    file names):

    * ``*_SQ_Raw_results_*``: one prediction column per bagged estimator,
      followed by one 0/1 column (prediction > 0) per estimator.
    * ``*_SQ_results_*``: ``sampleModels`` columns, each the sum of
      ``bag_size`` randomly chosen estimator predictions, followed by one
      0/1 column (sum > 0) per sample.

    In every file the first three columns are the prediction start, the
    target and the 0/1 sign of the target.

    Args:
        trainX, trainY: training features and targets.
        train_prediction_start: value(s) stored in the first column of the
            training tables.
        testX, testY, test_prediction_start: the same for the test split.
        look_ahead: integer used in the output file names.
        bag_size: number of estimators summed per aggregated sample; must
            not exceed ``Nestimators``.
        Nestimators: number of bagged SVR estimators.
        samp_size: ``max_samples`` of the bagging ensemble.
        sampleModels: number of aggregated samples to draw.
        kernel: SVR kernel.

    Raises:
        ValueError: if ``bag_size`` is larger than ``Nestimators``.
    """
    if bag_size > Nestimators:
        # np.random.choice(..., replace=False) would raise the same error,
        # but only after the expensive model fitting.
        raise ValueError('bag_size must not exceed Nestimators')

    cRange = scipy.stats.expon(scale=5)
    parameter_dist = {'C': cRange}
    clf = RandomizedSearchCV(estimator=svm.SVR(kernel=kernel),
                             param_distributions=parameter_dist,
                             n_iter=50,
                             cv=10,
                             n_jobs=-1)
    clf.fit(trainX, trainY)
    best_C = clf.best_estimator_.C
    print('Best C:', best_C)

    svr = BaggingRegressor(svm.SVR(kernel=kernel, C=best_C),
                           n_estimators=Nestimators,
                           max_samples=samp_size,
                           bootstrap=False,
                           random_state=123)
    svr = svr.fit(trainX, trainY)

    def _column_names(n_models):
        """Header: 'dtStart' followed by 1 .. 2 * n_models + 2."""
        return ['dtStart'] + list(range(1, n_models * 2 + 3))

    def _result_table(X, Y, prediction_start, n_models):
        """Zero table with the (date, target, true label) columns filled in."""
        table = np.zeros((X.shape[0], n_models * 2 + 3))
        table[:, 0] = prediction_start
        table[:, 1] = Y
        table[:, 2] = np.asarray(Y) > 0
        return table

    # (date, trainY, true_lab, pred_labs...)
    # Aggregated tables are sized by sampleModels, raw tables by Nestimators;
    # previously both were indexed with the other's count, which only worked
    # when the two values were equal.
    trainRs = _result_table(trainX, trainY, train_prediction_start,
                            sampleModels)
    trainRs_raw = _result_table(trainX, trainY, train_prediction_start,
                                Nestimators)
    testRs = _result_table(testX, testY, test_prediction_start, sampleModels)
    testRs_raw = _result_table(testX, testY, test_prediction_start,
                               Nestimators)

    # Raw part: prediction of every single estimator and its sign label.
    for i, estimator in enumerate(svr.estimators_):
        trainRs_raw[:, i + 3] = estimator.predict(trainX)
        testRs_raw[:, i + 3] = estimator.predict(testX)
        trainRs_raw[:, i + Nestimators + 3] = trainRs_raw[:, i + 3] > 0
        testRs_raw[:, i + Nestimators + 3] = testRs_raw[:, i + 3] > 0

    # aggregating results!
    model_inds = list(range(3, Nestimators + 3))
    for i in range(sampleModels):
        index_modelstoUse = np.random.choice(model_inds,
                                             bag_size,
                                             replace=False)
        trainRs[:, i + 3] = np.sum(trainRs_raw[:, index_modelstoUse], axis=1)
        testRs[:, i + 3] = np.sum(testRs_raw[:, index_modelstoUse], axis=1)
        trainRs[:, i + sampleModels + 3] = trainRs[:, i + 3] > 0
        testRs[:, i + sampleModels + 3] = testRs[:, i + 3] > 0

    agg_columns = _column_names(sampleModels)
    raw_columns = _column_names(Nestimators)

    trainRs = pd.DataFrame(trainRs, columns=agg_columns)
    trainRs.to_csv('train_SQ_results_la%d.csv' % look_ahead, index=False)
    trainRs_raw = pd.DataFrame(trainRs_raw, columns=raw_columns)
    trainRs_raw.to_csv('train_SQ_Raw_results_la%d.csv' % look_ahead,
                       index=False)
    testRs = pd.DataFrame(testRs, columns=agg_columns)
    testRs.to_csv('test_SQ_results_la%d.csv' % look_ahead, index=False)
    testRs_raw = pd.DataFrame(testRs_raw, columns=raw_columns)
    testRs_raw.to_csv('test_SQ_Raw_results_la%d.csv' % look_ahead,
                      index=False)

def regressors(regrs):
    """Build the (unfitted) regressor selected by the key ``regrs``.

    Supported keys: 'lin', 'svm-lin', 'svm-poly', 'lasso', 'tree', 'forest',
    'xgbr', 'ada', 'nn', 'comb', 'tpot' and 'voting'. The 'lasso' pipeline
    reads its settings from the module-level ``params`` mapping.

    Args:
        regrs: key naming the regressor to build.

    Returns:
        The configured estimator.

    Raises:
        ValueError: if ``regrs`` is not one of the supported keys.
    """
    if (regrs == 'lin'):
        reg = LinearRegression(n_jobs=-1)
    elif (regrs == 'svm-lin'):
        reg = svm.SVR(kernel='linear', gamma='auto')
    elif (regrs == 'svm-poly'):
        reg = svm.SVR(kernel='poly', gamma='auto')
    elif (regrs == 'lasso'):
        reg = make_pipeline(
            PolynomialFeatures(params['deg_poly'], interaction_only=False),
            LassoCV(eps=params['lasso_eps'],
                    n_alphas=params['lasso_nalpha'],
                    max_iter=params['lasso_iter'],
                    normalize=False,
                    cv=5))
    elif (regrs == 'tree'):
        reg = DecisionTreeRegressor(random_state=24361)
    elif (regrs == 'forest'):
        reg = RandomForestRegressor(n_estimators=20,
                                    max_depth=2,
                                    min_samples_split=4,
                                    min_samples_leaf=1,
                                    random_state=24361,
                                    n_jobs=-1)
    elif (regrs == 'xgbr'):
        reg = XGBRegressor(learning_rate=0.10,
                           max_depth=2,
                           min_child_weight=1,
                           n_estimators=100,
                           subsample=0.25)
    elif (regrs == 'ada'):
        # AdaBoost on top of a small gradient-boosted tree model.
        xgbr = XGBRegressor(learning_rate=0.10,
                            max_depth=2,
                            min_child_weight=1,
                            n_estimators=100,
                            subsample=0.25,
                            random_state=24361)
        reg = AdaBoostRegressor(base_estimator=xgbr,
                                learning_rate=0.1,
                                loss='square',
                                n_estimators=100,
                                random_state=24361)
    elif (regrs == 'nn'):
        reg = MLPRegressor(hidden_layer_sizes=(32, 1),
                           activation='relu',
                           solver='adam',
                           random_state=24361)
    elif (regrs == 'comb'):
        # Stack of two boosted models, a tiny forest and an MLP; the forest
        # doubles as the meta regressor.
        xgbr = XGBRegressor(learning_rate=0.045,
                            max_depth=2,
                            min_child_weight=1,
                            n_estimators=100,
                            subsample=0.15,
                            n_jobs=-1)
        xgbr1 = XGBRegressor(learning_rate=0.035,
                             max_depth=3,
                             min_child_weight=1,
                             n_estimators=50,
                             subsample=0.15,
                             n_jobs=-1)
        frst = RandomForestRegressor(max_depth=2,
                                     max_leaf_nodes=2,
                                     n_estimators=3,
                                     n_jobs=-1)
        nn = MLPRegressor(hidden_layer_sizes=(32, 1),
                          activation='tanh',
                          solver='adam',
                          learning_rate_init=0.15)
        reg = StackingRegressor(regressors=[xgbr, xgbr1, frst, nn],
                                meta_regressor=frst)
    elif (regrs == 'tpot'):
        reg = TPOTRegressor(generations=10,
                            verbosity=2,
                            scoring='r2',
                            n_jobs=-1,
                            random_state=23)
    elif (regrs == 'voting'):
        # Regressors must be combined with VotingRegressor (averaged
        # predictions); a hard-voting VotingClassifier treats the targets as
        # class labels.
        from sklearn.ensemble import VotingRegressor
        frst = RandomForestRegressor(n_estimators=100,
                                     random_state=24361,
                                     n_jobs=-1)
        dtr = DecisionTreeRegressor(random_state=24361)
        reg = VotingRegressor(estimators=[('frst', frst), ('dtr', dtr)])
    else:
        # Previously an unknown key ended in an UnboundLocalError on return.
        raise ValueError('unknown regressor key: %r' % (regrs,))
    return (reg)

import pandas as pd
from sklearn import svm
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV


def rmse(y, y_pred):
    """Return the square root of mean_squared_error(y, y_pred)."""
    squared_error = mean_squared_error(y, y_pred)
    return squared_error**0.5


# Column 1 of train.csv is the target, columns 2.. are the features.
train_dataset = pd.read_csv('train.csv', header=0)
y_train = train_dataset.iloc[:, 1]
x_train = train_dataset.iloc[:, 2:]

svc = svm.SVR(kernel='linear')

# Two sub-grids: a linear kernel over C, and an RBF kernel over C and gamma.
param_grid = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], gamma=[0.001, 0.0001], kernel=['rbf']),
]

# RMSE is an error, so the scorer is flagged as lower-is-better.
rmse_scorer = make_scorer(rmse, greater_is_better=False)
model = GridSearchCV(estimator=svc,
                     param_grid=param_grid,
                     scoring=rmse_scorer,
                     cv=3)

plt.plot(xx, yy_down, 'k--') plt.plot(xx, yy_up, 'k--') plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80, facecolors='none') plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired) plt.axis('tight') plt.show() ''' #回归 from sklearn import svm X = [[0, 0], [2, 2]] y = [0.5, 2.5] clf = svm.SVR() clf.fit(X, y) svm.SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) print(clf.predict([[1, 1]])) from sklearn import datasets iris = datasets.load_iris() X = iris.data print(X.shape) y = iris.target print(y.shape) X = X[y != 0, :2] y = y[y != 0] print(X.shape)

# Classification
# X=[[0,0],[1,1]]
# y=[0,1]
# clf=svm.SVC()
# clf.fit(X,y)
# res_predicted=clf.predict([[2.,2.]])
# # get the support vectors
# res_support_vector=clf.support_vectors_
# # get the indices of the support vectors
# res_support=clf.support_
# # get the number of support vectors of each class
# res_nSupport=clf.n_support_
# print(res_nSupport)

# Multi-class classification
# X=[[0],[1],[2],[3]]
# Y=[0,1,2,3]
# clf=svm.SVC(decision_function_shape='ovo')
# clf.fit(X,Y)
# dec=clf.decision_function([[1]])
# print(dec.shape[1])

# Regression: fit a default SVR on two points and predict the midpoint.
from sklearn import svm

X = [[0, 0], [2, 2]]
y = [0.5, 2.5]
clf = svm.SVR().fit(X, y)  # fit() returns the estimator itself
res = clf.predict([[1, 1]])
print(res)

rows = datasetX.shape[0]
#print(rows)
# Share of the data used for training: the first 70% of the rows.
end = int(round(rows * 0.7, 0))
trainX, trainY = datasetX[0:end], datasetY[0:end]
testX, testY = datasetX[end:rows], datasetY[end:rows]

print("_______________________\n SVM - SVR \n_______________________")
clf = svm.SVR(kernel='rbf',
              C=100000,
              gamma=1.9,
              degree=3,
              tol=1e-9,
              shrinking=True,
              cache_size=500,
              max_iter=-1,
              verbose=True)

# normalization
#X_normalized = preprocessing.normalize(trainX, norm='l2')
#X_test_normalized = preprocessing.normalize(testX, norm='l2')

# Optionally normalize both feature sets, depending on the `normalize` flag.
if normalize:
    trainX, testX = (preprocessing.normalize(trainX),
                     preprocessing.normalize(testX))

# Per-fold metric accumulators for the linear-regression model.
MAE_train_LR = []
MSE_train_LR = []
RMSE_train_LR = []
APE_test_LR = []
MAE_test_LR = []
MSE_test_LR = []
RMSE_test_LR = []

i = 0
# One iteration per split produced by `loo` over the rows of `scaled`.
# NOTE(review): `loo` is presumably a leave-one-out splitter -- confirm.
for train_index, test_index in loo.split(scaled):
    trainSet = scaled[train_index]
    testSet = scaled[test_index]
    # Columns 0..3 are the features, the last column is the target.
    train_X, train_y = trainSet[:, 0:4], trainSet[:, -1]
    test_X, test_y = testSet[:, 0:4], testSet[:, -1]

    # Fit the three models on the same training split.
    clf_SVR = svm.SVR(kernel='rbf', C=1000, gamma=15).fit(train_X, train_y)
    #clf_SVR = svm.SVR(kernel='linear',C=20).fit(train_X,train_y)
    #clf_SVR = svm.SVR(kernel='poly',C=1000, degree=3).fit(train_X,train_y)
    clf_RFR = RandomForestRegressor().fit(train_X, train_y)
    #clf_RFR = RandomForestRegressor(n_estimators=100,max_features=2).fit(train_X,train_y)
    clf_LR = linear_model.LinearRegression().fit(train_X, train_y)

    #joblib.dump(clf_SVR, '../results/SVR_train_model_'+str(i+1)+'.m')
    #joblib.dump(clf_RFR, '../results/RFR_train_model_'+str(i+1)+'.m')
    #joblib.dump(clf_LR, '../results/LR_train_model_'+str(i+1)+'.m')

    #inverse dataset of SVR
    # SVR predictions on both the training and the held-out rows.
    train_pred_SVR = clf_SVR.predict(train_X)
    test_pred_SVR = clf_SVR.predict(test_X)

# Fraction of the samples used for training.
divisao = 0.75

# Shuffle features and targets with the same permutation.
embaralhar = np.random.permutation(amostras)
x = x[embaralhar]
y = y[embaralhar]

x_treino = x[:int(amostras*divisao)]
y_treino = y[:int(amostras*divisao)]
x_teste = x[int(amostras*divisao):]
y_teste = y[int(amostras*divisao):]

parametros_svr = {'kernel':('linear','poly','sigmoid','rbf'),'C':[1,2,3,4,5]}
svr = svm.SVR()
clf = GridSearchCV(svr,parametros_svr, n_jobs=10)
# best_params_ only exists after the search has been fitted.
clf.fit(x_treino, y_treino)
print(clf.best_params_)

# Final model: a linear-kernel SVR evaluated on the held-out samples.
clf = svm.SVR(kernel = 'linear')
clf.fit(x_treino, y_treino)
predicao = clf.predict(x_teste)
mse = metrics.mean_squared_error(y_teste, predicao)
r2 = metrics.r2_score(y_teste, predicao)

classification(xgboost.XGBClassifier(**XGBOOST_PARAMS)), classification_binary(xgboost.XGBClassifier(**XGBOOST_PARAMS)), # XGBoost (Large Trees) regression_random(xgboost.XGBRegressor(**XGBOOST_PARAMS_LARGE)), classification_random(xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)), classification_binary_random( xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)), # Linear SVM regression(svm.LinearSVR(random_state=RANDOM_SEED)), classification(svm.LinearSVC(random_state=RANDOM_SEED)), classification_binary(svm.LinearSVC(random_state=RANDOM_SEED)), # SVM regression(svm.SVR(kernel="rbf")), regression(svm.NuSVR(kernel="rbf")), classification_binary(svm.SVC(kernel="rbf", **SVC_PARAMS)), classification_binary(svm.SVC(kernel="linear", **SVC_PARAMS)), classification_binary(svm.SVC(kernel="poly", degree=2, **SVC_PARAMS)), classification_binary(svm.SVC(kernel="sigmoid", **SVC_PARAMS)), classification_binary(svm.NuSVC(kernel="rbf", **SVC_PARAMS)), classification(svm.SVC(kernel="rbf", **SVC_PARAMS)), classification(svm.NuSVC(kernel="rbf", **SVC_PARAMS)), # Linear Regression regression(linear_model.LinearRegression()), regression(linear_model.HuberRegressor()), regression(linear_model.ElasticNet(random_state=RANDOM_SEED)), regression(linear_model.ElasticNetCV(random_state=RANDOM_SEED)), regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),

print(X_test.shape)
print(X_test)
print('Training set - y')
print(y_train.shape)
print(y_train)
print('Test set - y')
print(y_test.shape)
print(y_test)

# Cast all four splits to double precision before training.
X_train, y_train, X_test, y_test = (
    split.astype('double')
    for split in (X_train, y_train, X_test, y_test))

print('Started Model training')
clf = svm.SVR(kernel=kernel)
clf.fit(X_train, y_train)
print(clf)
print('Model trained')

# Predictions for the test split.
pred = clf.predict(X_test)
#print('Here is the mean squared error -')
#print(mean_squared_error(pred, y_test))

fig = plt.figure()
#print(X.reshape(1, -1))
#print(y)
plt.scatter(X.reshape(1, -1),
            y.ravel(),
            label='original data',
            color='blue')
#plt.scatter(val_x, val_y, color='pink', label='mean data')
#plt.plot(means, y_compressed, color='red', label='connected mean data')

def attack_svr(self, server, predictor_name, kernel_type, attack_type, dimension, query_budget, dataset=None, roundsize=5):
    """Build a local copy of a remote SVR predictor by querying it.

    Args:
        server, predictor_name: passed unchanged to
            ``self.client.poll_server`` for every query.
        kernel_type: kernel of the target model; "quadratic" is mapped to a
            degree-2 polynomial kernel.
        attack_type: "retraining", "adaptive retraining" or "extraction".
        dimension: number of input features (used by "extraction").
        query_budget: number of queries the attack may use.
        dataset: samples to query; required by both retraining attacks,
            optional for "extraction".
        roundsize: number of queries per round of the adaptive attack.

    Returns:
        A fitted ``svm.SVR`` / ``svm.NuSVR`` for the retraining attacks, or
        a mock model exposing ``predict(arr)`` for "extraction".

    Raises:
        ValueError: data set missing or too small, budget/round size
            mismatch, unsupported kernel or unknown attack type.
        IndexError: the adaptive attack ran out of pooled samples.
    """
    # Only "extraction" may run without a data set. The original guard
    # called len(dataset) even when dataset was None, which crashed the
    # extraction attack with a TypeError.
    if dataset is None:
        too_small = attack_type != "extraction"
    else:
        too_small = len(dataset) < 2
    if too_small:
        print("[!] Dataset too small")
        print("[*] Aborting attack...")
        raise ValueError
    if dataset is not None and not isinstance(dataset, list):
        dataset = dataset.tolist()

    if attack_type == "retraining":
        # Label a random sample with the remote predictor and fit on it.
        X = []
        y = []
        for datum in random.sample(dataset, query_budget):
            b = self.client.poll_server(server, predictor_name, [datum])
            X.append(datum)
            y.append(b)
        if kernel_type == "quadratic":
            my_model = svm.SVR(kernel="poly", degree=2)
        else:
            my_model = svm.SVR(kernel=kernel_type)
        my_model.fit(X, numpy.ravel(y))
        return my_model
    elif attack_type == "adaptive retraining":
        if len(dataset) >= query_budget > roundsize:
            pool = random.sample(dataset, query_budget)
            X = []
            y = []
            n = roundsize
            t = math.ceil(query_budget / n)
            for i in range(0, n):
                # Initial training data for a basic start to train upon
                a = pool.pop(0)
                b = self.client.poll_server(server, predictor_name, [a])
                X.append(a)
                y.append(b)
            if kernel_type == "quadratic":
                my_model = svm.NuSVR(kernel="poly", degree=2)
            else:
                my_model = svm.NuSVR(kernel=kernel_type)
            for i in range(0, t - 1):  # perform t rounds minus the initial round.
                my_model.fit(X, numpy.ravel(y))
                if len(my_model.support_vectors_) == 0:
                    print("[!] NO SUPPORTVECTORS IN ROUND", i)
                    print("[*] Adding another round of random samples")
                    for q in range(0, n):
                        # Without support vectors there is nothing to
                        # measure distance against; fall back to another
                        # round of random samples.
                        if len(pool) == 0:
                            print("[!] Error: Not enough data")
                            raise IndexError
                        a = pool.pop(0)
                        b = self.client.poll_server(server, predictor_name, [a])
                        X.append(a)
                        y.append(b)
                    continue
                print("Training Round", i, " of ", t-1)
                pool, samples = self.get_furthest_samples(pool, my_model.support_vectors_, kernel_type, my_model.coef0, my_model.get_params()["gamma"], my_model.get_params()["C"], n, my_model.dual_coef_)
                for j in samples:
                    X.append(j)
                    y.append(self.client.poll_server(server, predictor_name, [j]))
            my_model.fit(X, numpy.ravel(y))
            return my_model
        else:
            print("[!] Error: either not enough data in data set, or query budget not bigger than round size.")
            print("[*] Aborting attack...")
            raise ValueError
    elif attack_type == "extraction":
        if kernel_type == "quadratic":
            # NOTE: KEEP IN MIND, IN THE IMPLEMENTATION THE VECTOR INDICES START AT 0, INSTEAD OF 1
            # Also DIMENSION - 1 is max index, not dimension itself.
            d_ = self.nCr(dimension, 2) + 2*dimension + 1  # d := Projection dimension
            if d_ > query_budget:
                print("[!] Error: This algorithm will need", d_ ," queries.")
                raise ValueError
            w_ = [0] * d_  # extracted weight vectors
            null_vector = [0] * dimension
            b_ = self.client.poll_server(server, predictor_name, [null_vector])[0]  # b' = w_d c +b

            # Query +e_i and -e_i for every axis: the half-sum of the two
            # responses is stored as the squared-term weight, the
            # half-difference as the linear-term weight.
            for dim in range(dimension):
                v_p = dim * [0] + [1] + (dimension - 1 - dim) * [0]
                v_n = dim * [0] + [-1] + (dimension - 1 - dim) * [0]
                f_v_p = self.client.poll_server(server, predictor_name, [v_p])[0] - b_
                f_v_n = self.client.poll_server(server, predictor_name, [v_n])[0] - b_
                w_[dimension - dim + 1 - 2] = (f_v_p + f_v_n) / 2
                w_[d_ - dim - 2] = (f_v_p - f_v_n) / 2

            class QuadraticMockModel:
                """Evaluates w . phi(x) + b with the extracted parameters."""

                def __init__(self, d__, w__, b__):
                    self.dim = d__
                    self.w = w__
                    self.b = b__

                def phi(self, x__):
                    """Feature map: squares, sqrt(2)-scaled cross terms,
                    linear terms and a trailing 0, all in reversed index
                    order."""
                    vec = []
                    for i__ in x__[::-1]:
                        vec.append(i__**2)
                    for i__ in reversed(range(len(x__))):
                        for j__ in reversed(range(i__)):
                            vec.append(math.sqrt(2)*x__[i__]*x__[j__])
                    for i__ in x__[::-1]:
                        vec.append(i__)
                    vec.append(0)
                    return vec

                def predict(self, arr):
                    rv = []
                    for v__ in arr:
                        val = numpy.dot(self.w, self.phi(v__)) + self.b
                        rv.append(val)
                    return rv

            if dimension <= 2:
                return QuadraticMockModel(d_, w_, b_)

            # Cross terms: query e_i + e_j and subtract the already known
            # squared, linear and bias contributions.
            for dim_i in range(dimension):
                for dim_j in range(dim_i + 1, dimension):
                    v = dimension*[0]
                    v[dim_i], v[dim_j] = 1, 1
                    f_v = self.client.poll_server(server, predictor_name, [v])[0]
                    r = self.r_index(dim_i + 1, dim_j + 1, dimension) - 1
                    w_[r] = (f_v - w_[dimension - dim_i + 1 - 2] - w_[dimension - dim_j + 1 - 2] - w_[d_ - dim_i - 2] - w_[d_ - dim_j - 2] - b_) / math.sqrt(2)
            print("[+] w' extrahiert:", w_)
            return QuadraticMockModel(d_, w_, b_)

        if kernel_type != "linear":
            print("[!] Error: Unsupported Kernel for extraction attack.")
            raise ValueError

        # Linear model: f(0) is the bias and f(e_j) - bias is weight j.
        d = [0] * dimension
        b = self.client.poll_server(server, predictor_name, [d])[0]
        w = []
        for j in range(0, dimension):
            x = j * [0] + [1] + (dimension - 1 - j) * [0]
            w.append(self.client.poll_server(server, predictor_name, [x])[0]-b)
        print("[+] Model parameters have been successfully extracted")
        print("[*] weight (w):", w)
        print("[*] bias (b):", b)
        print("[*] Building mock model...")

        class LinearMockModel:
            """Evaluates w . x + b with the extracted parameters."""

            def __init__(self, d__, w__, b__):
                self.dim = d__
                self.w = w__
                self.b = b__

            def predict(self, arr):
                rv = []
                for v__ in arr:
                    val = numpy.dot(self.w, v__) + self.b
                    rv.append(val)
                return rv

        return LinearMockModel(dimension, w, b)
    else:
        print("[!] Error: unknown attack type for svr")
        print("[*] Aborting attack...")
        raise ValueError

# Put the result into a color plot Z = Z.reshape(XX.shape) plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired) plt.contour(XX, YY, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'], levels=[-.5, 0, .5]) plt.title(kernel) plt.show() ''' # Find the optimized model from sklearn.svm import SVR import numpy as np parameters = {'kernel': ('linear', 'rbf','poly'), 'C':[1.5, 10],'gamma': [1e-7, 1e-4],'epsilon':[0.1,0.2,0.5,0.3]} svr = svm.SVR() clf = GridSearchCV(svr, parameters) clf.fit(X_train, y_train) clf.best_params_ print "Best estimator found by grid search:",clf.best_estimator_ print "Best parameters found by grid search:",clf.best_params_ print clf.best_score_ clf_best = linear_model.LogisticRegression(**clf.best_params_) y_pred = clf_best.fit(X_train, y_train).predict(X_test) score = clf_best.score(X_test, y_test) # out prediction accuracy print 'Accuracy:', score # output confusion matrix

from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

os.chdir(r'D:\desktop\data mining\ML\LinearReg_ML')
df = pd.read_csv('HousePrices.csv')
df = df.head(50000)  # work on the first 50000 rows only

X = df.drop(['Prices'], axis=1)
y = df.Prices

# Standardize the features, keeping the column names.
sc = StandardScaler()
X_scaled = pd.DataFrame(sc.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=1/3, random_state=0)

gamma = 'scale'

# Grid search over kernel and C for an SVR, scored on the held-out split.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svr = svm.SVR()
gr = GridSearchCV(svr, parameters, cv=5)
gr.fit(X_train, y_train)
gr.score(X_test,y_test)

#***********************************************************
# Grid search for a random forest.
param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},]
forest_reg = RandomForestRegressor()
grd = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
# Fit on the training split; fitting on the test split and then scoring on
# the same data would report an over-optimistic score.
grd.fit(X_train, y_train)
grd.score(X_test, y_test)

def build_surrogate(self):
    """ Build thrust and SFC surrogates from a sweep of the pyCycle model.

        The model stored in self.model is run at every (Mach, altitude) pair
        in self.evaluation_mach_alt and every throttle setting in
        self.evaluation_throttles. The resulting table is de-duplicated,
        normalized, and fitted with the regressor chosen by
        self.surrogate_type:
        -Gaussian Processes ('gaussian')
        -KNN                ('knn')
        -SVR                ('svr')
        -Linear regression  ('linear')

        Assumptions:
        None

        Source:
        N/A

        Inputs:
        None (everything is read from self)

        Outputs:
        self.sfc_surrogate        fitted regressor for SFC
        self.thrust_surrogate     fitted regressor for thrust
        self.altitude_input_scale
        self.thrust_input_scale
        self.sfc_input_scale
        self.thrust_anchor_scale  (only when self.thrust_anchor is not None)
        self.sfc_anchor_scale     (only when self.sfc_anchor is not None)

        Side effects:
        Removes 'model' from self; prints progress and the data table;
        writes "pyCycle_deck.csv" when self.save_deck is truthy.

        Raises:
        NotImplementedError if self.surrogate_type is not one of the above.

        Properties Used:
        self.model, self.evaluation_throttles, self.evaluation_mach_alt,
        self.save_deck, self.surrogate_type, self.thrust_anchor,
        self.thrust_anchor_conditions, self.sfc_anchor,
        self.sfc_anchor_conditions
    """

    # NOTE(review): the source text had lost its indentation; the nesting
    # below is the reading under which every statement is meaningful.
    # Confirm against the upstream file.

    # unpack
    pycycle_problem = self.model

    pycycle_problem.set_solver_print(level=-1)
    pycycle_problem.set_solver_print(level=2, depth=0)

    # Extract the data
    # Create lists that will turn into arrays
    Altitudes = []
    Machs = []
    PCs = []
    Thrust = []
    TSFC = []

    # if we added fc.dTS this would handle the deltaISA

    # Multiplying by 1. gives a new array, so the flips below operate on
    # this local name rather than on self.evaluation_throttles.
    throttles = self.evaluation_throttles*1.

    for MN, alt in self.evaluation_mach_alt:

        print('***'*10)
        print(f'* MN: {MN}, alt: {alt}')
        print('***'*10)
        pycycle_problem['OD_full_pwr.fc.MN'] = MN
        pycycle_problem['OD_full_pwr.fc.alt'] = alt
        pycycle_problem['OD_part_pwr.fc.MN'] = MN
        pycycle_problem['OD_part_pwr.fc.alt'] = alt

        for PC in throttles:
            print(f'## PC = {PC}')
            pycycle_problem['OD_part_pwr.PC'] = PC
            pycycle_problem.run_model()
            # Save to our list for SUAVE
            Altitudes.append(alt)
            Machs.append(MN)
            PCs.append(PC)
            TSFC.append(pycycle_problem['OD_part_pwr.perf.TSFC'][0])
            Thrust.append(pycycle_problem['OD_part_pwr.perf.Fn'][0])

        # Reverse the sweep direction for the next flight condition.
        # Presumably this lets the next run start near the previous throttle
        # setting - TODO confirm.
        throttles = np.flip(throttles)

    # Now setup into column vectors (one row per evaluated point)
    Altitudes = np.atleast_2d(np.array(Altitudes)).T * Units.feet
    Mach = np.atleast_2d(np.array(Machs)).T
    Throttle = np.atleast_2d(np.array(PCs)).T
    thr = np.atleast_2d(np.array(Thrust)).T * Units.lbf
    sfc = np.atleast_2d(np.array(TSFC)).T * Units['lbm/hr/lbf'] # lbm/hr/lbf converted to (kg/N/s)

    # Once we have the data the model must be deleted because pycycle models can't be deepcopied
    self.pop('model')

    # Concatenate all together and things will start to look like the propulsor surrogate soon
    # Column order: altitude, Mach, throttle, thrust, SFC
    my_data = np.concatenate([Altitudes,Mach,Throttle,thr,sfc],axis=1)

    if self.save_deck :
        # Write an engine deck
        np.savetxt("pyCycle_deck.csv", my_data, delimiter=",")

    print(my_data)

    # Clean up to remove redundant lines: view each row as one opaque void
    # item so np.unique compares whole rows, then keep the first occurrence
    # of each distinct row.
    b = np.ascontiguousarray(my_data).view(np.dtype((np.void, my_data.dtype.itemsize * my_data.shape[1])))
    _, idx = np.unique(b, return_index=True)

    my_data = my_data[idx]

    xy = my_data[:,:3] # Altitude, Mach, Throttle
    thr = np.transpose(np.atleast_2d(my_data[:,3])) # Thrust
    sfc = np.transpose(np.atleast_2d(my_data[:,4])) # SFC

    # The scale factors are the column maxima, taken before normalizing.
    self.altitude_input_scale = np.max(xy[:,0])
    self.thrust_input_scale = np.max(thr)
    self.sfc_input_scale = np.max(sfc)

    # normalize for better surrogate performance
    # (only altitude is scaled among the inputs; Mach and throttle are left as is)
    xy[:,0] /= self.altitude_input_scale
    thr /= self.thrust_input_scale
    sfc /= self.sfc_input_scale

    # Pick the type of process
    if self.surrogate_type == 'gaussian':
        gp_kernel = Matern()
        regr_sfc = gaussian_process.GaussianProcessRegressor(kernel=gp_kernel)
        regr_thr = gaussian_process.GaussianProcessRegressor(kernel=gp_kernel)
        thr_surrogate = regr_thr.fit(xy, thr)
        sfc_surrogate = regr_sfc.fit(xy, sfc)

    elif self.surrogate_type == 'knn':
        regr_sfc = neighbors.KNeighborsRegressor(n_neighbors=1,weights='distance')
        regr_thr = neighbors.KNeighborsRegressor(n_neighbors=1,weights='distance')
        sfc_surrogate = regr_sfc.fit(xy, sfc)
        thr_surrogate = regr_thr.fit(xy, thr)

    elif self.surrogate_type == 'svr':
        regr_thr = svm.SVR(C=500.)
        regr_sfc = svm.SVR(C=500.)
        sfc_surrogate = regr_sfc.fit(xy, sfc)
        thr_surrogate = regr_thr.fit(xy, thr)

    elif self.surrogate_type == 'linear':
        regr_thr = linear_model.LinearRegression()
        regr_sfc = linear_model.LinearRegression()
        sfc_surrogate = regr_sfc.fit(xy, sfc)
        thr_surrogate = regr_thr.fit(xy, thr)

    else:
        raise NotImplementedError('Selected surrogate method has not been implemented')

    # Anchoring: ratio of the requested anchor value to the surrogate's
    # (de-normalized) prediction at the anchor conditions.
    if self.thrust_anchor is not None:
        # Work on a copy so the stored anchor conditions are not modified.
        cons = deepcopy(self.thrust_anchor_conditions)
        # Altitude must be scaled the same way as the training inputs.
        cons[0,0] /= self.altitude_input_scale
        base_thrust_at_anchor = thr_surrogate.predict(cons)
        self.thrust_anchor_scale = self.thrust_anchor/(base_thrust_at_anchor*self.thrust_input_scale)

    if self.sfc_anchor is not None:
        cons = deepcopy(self.sfc_anchor_conditions)
        cons[0,0] /= self.altitude_input_scale
        base_sfc_at_anchor = sfc_surrogate.predict(cons)
        self.sfc_anchor_scale = self.sfc_anchor/(base_sfc_at_anchor*self.sfc_input_scale)

    # Save the output
    self.sfc_surrogate = sfc_surrogate
    self.thrust_surrogate = thr_surrogate

# NOTE(review): this first statement is the tail of a file-reading loop whose
# header lies outside the visible chunk; it strips the trailing newline
# character and splits the record on commas.
data.append(line[:-1].split(','))

# Transpose so that each row of `data` holds one column of the input file.
data = np.array(data).T

encoders, x = [], []
last_row = len(data) - 1
for row, column in enumerate(data):
    # Columns whose first value is all digits use DigitEncoder; every other
    # column is label-encoded.
    encoder = DigitEncoder() if column[0].isdigit() else sp.LabelEncoder()
    encoded = encoder.fit_transform(column)
    if row < last_row:
        x.append(encoded)   # feature column
    else:
        y = encoded         # the final column is the regression target
    # Keep every encoder (target included) so new samples can be encoded.
    encoders.append(encoder)
x = np.array(x).T

train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=5)
model = svm.SVR(kernel='rbf', C=10, epsilon=0.2)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print(sm.r2_score(test_y, pred_test_y))

# Encode one new sample with the already-fitted feature encoders and predict.
data = [['Tuesday', '13:35', 'San Francisco', 'yes']]
data = np.array(data).T
x = []
# zip stops after the four feature columns, leaving the target encoder unused.
for encoder, column in zip(encoders, data):
    x.append(encoder.transform(column))
x = np.array(x).T
pred_y = model.predict(x)
# predict() returns a one-element array; index it explicitly because calling
# int() on an array with ndim > 0 is deprecated since NumPy 1.25.
print(int(pred_y[0]))

# ---------------------------------------------------------------------------
# Linear SVR: fit, score on the held-out split, forecast the rows in x_small.
# ---------------------------------------------------------------------------
model1 = svm.LinearSVR()
model1.fit(x_train, y_train)
confidence1 = model1.score(x_test, y_test)
predict_1 = model1.predict(x_small)
dataset['Predict_Linear'] = np.nan
print('Score for Linear Reg: :',confidence1)
print('\n')
# Append one new row per forecast: NaN in every column except the last,
# which receives the predicted value.
for forecast in predict_1:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    blanks = [np.nan] * (len(dataset.columns) - 1)
    dataset.loc[next_date] = blanks + [forecast]

# ---------------------------------------------------------------------------
# RBF SVR: same procedure. next_unix is not reset, so these rows are dated
# after the linear forecasts.
# ---------------------------------------------------------------------------
model2 = svm.SVR(kernel = 'rbf', C= 100, gamma= 0.06)
model2.fit(x_train, y_train)
confidence2 = model2.score(x_test, y_test)
predict_2 = model2.predict(x_small)
dataset['Predict_RBF'] = np.nan
print('Score for RBF Reg: :',confidence2)
print('\n')
for forecast in predict_2:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    blanks = [np.nan] * (len(dataset.columns) - 1)
    dataset.loc[next_date] = blanks + [forecast]

import pickle

from sklearn import svm


def fun(line):
    """Return the fields of a whitespace-separated numeric line, minus the first three.

    Every field is converted to float before slicing, so a non-numeric value
    anywhere on the line raises ValueError.
    """
    values = [float(token) for token in line.strip().split()]
    return values[3:]


X = []
Y = []
# Read the feature file and the target file in lockstep. The context manager
# closes both handles even if parsing fails (the original never closed the
# target file). zip stops at the shorter file.
with open('Train', 'r') as f, open('TrainTrue', 'r') as g:
    for line, line2 in zip(f, g):
        # Skip the record if either file has '--' on this line.
        if '--' not in line and '--' not in line2:
            X.append(fun(line))
            # NOTE(review): 273.16 looks like a Celsius-to-Kelvin offset (the
            # usual constant is 273.15) - confirm before changing.
            Y.append(float(line2.strip().split()[-1]) + 273.16)
            #print fun(line)
print('Reading Done')

clf = svm.SVR(cache_size=7000)
clf.fit(X, Y)
# Write through a context manager so the pickle is flushed and closed.
with open('Model', 'wb') as model_file:
    pickle.dump(clf, model_file)

def test_SVR_poly(*data):
    '''
    Plot how a polynomial-kernel SVR's scores change with degree, gamma and coef0.

    :param data: variadic arguments; must unpack to exactly four items, in order:
        training samples, test samples, training targets, test targets
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    fig = plt.figure()

    def draw_sweep(column, values, make_regressor, title, xlabel):
        # Fit one regressor per value, then plot train/test scores in subplot `column`.
        train_scores = []
        test_scores = []
        for value in values:
            regr = make_regressor(value)
            regr.fit(X_train, y_train)
            train_scores.append(regr.score(X_train, y_train))
            test_scores.append(regr.score(X_test, y_test))
        ax = fig.add_subplot(1, 3, column)
        ax.plot(values, train_scores, label="Training score ", marker='+')
        ax.plot(values, test_scores, label=" Testing score ", marker='o')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel("score")
        ax.set_ylim(-1, 1.)
        ax.legend(loc="best", framealpha=0.5)

    # Sweep degree (coef0 fixed at 1, gamma left at its default).
    draw_sweep(
        1,
        range(1, 20),
        lambda degree: svm.SVR(kernel='poly', degree=degree, coef0=1),
        "SVR_poly_degree r=1",
        "p",
    )

    # Sweep gamma, with degree fixed at 3 and coef0 at 1.
    draw_sweep(
        2,
        range(1, 40),
        lambda gamma: svm.SVR(kernel='poly', gamma=gamma, degree=3, coef0=1),
        "SVR_poly_gamma r=1",
        r"$\gamma$",
    )

    # Sweep r (coef0), with gamma fixed at 20 and degree at 3.
    draw_sweep(
        3,
        range(0, 20),
        lambda r: svm.SVR(kernel='poly', gamma=20, degree=3, coef0=r),
        "SVR_poly_r gamma=20 degree=3",
        r"r",
    )

    plt.show()

def getbest(self):
    """Fit a fixed set of scikit-learn models and print them ranked by test score.

    Each model is fitted on (self.X_tr, self.y_tr) and scored on
    (self.X_te, self.y_te) with its own ``score`` method. A
    ``(label, score)`` tuple is appended to ``self.acc`` per model; the list
    is then sorted by score, highest first, and printed with scores
    multiplied by 100.

    Notes:
        * ``self.acc`` is appended to, never cleared, so repeated calls
          accumulate entries.
        * For the classifiers ``score`` is mean accuracy, which equals the
          ``accuracy_score(y, predict(X))`` value used previously. For
          "SVR" and "LinearReg" it is R^2, so those two rows are not on the
          same scale as the rest.
        * LogisticRegression is fitted and scored on integer-cast copies of
          the data, as before.

    Returns:
        None
    """
    X_tr, y_tr = self.X_tr, self.y_tr
    X_te, y_te = self.X_te, self.y_te

    # (label, estimator, cast inputs to int?) - order matters: the sort below
    # is stable, so ties keep this order.
    candidates = [
        ("knn", KNeighborsClassifier(n_neighbors=3), False),
        ("SVR", svm.SVR(), False),
        ("LDA", LinearDiscriminantAnalysis(), False),
        ("GaussianNB", GaussianNB(), False),
        ("BernoulliNB", BernoulliNB(), False),
        ("MultinomialNB", MultinomialNB(), False),
        ("LinearReg", linear_model.LinearRegression(), False),
        ("LogisticReg", linear_model.LogisticRegression(), True),
        ("SGDC", linear_model.SGDClassifier(), False),
        ("DecisionTree", DecisionTreeClassifier(), False),
        ("SVC-rbf", SVC(kernel='rbf'), False),
        ("SVC-linear", SVC(kernel='linear'), False),
    ]

    for label, model, cast_to_int in candidates:
        if cast_to_int:
            model.fit(X_tr.astype('int'), y_tr.astype('int'))
            result = model.score(X_te.astype('int'), y_te.astype('int'))
        else:
            model.fit(X_tr, y_tr)
            result = model.score(X_te, y_te)
        self.acc.append((label, result))

    self.acc.sort(key=lambda tup: tup[1], reverse=True)
    for label, result in self.acc:
        print(label, " ", result * 100)

# Project the scaled Boston features onto three principal components, then
# plot the SVR test score over a 10 x 10 grid of (C, epsilon) values.
boston_x = scale.transform(boston_X)
pca = PCA(n_components=3)
boston_x = pca.fit_transform(boston_x)

fig = plt.figure()
# Create the 3-D axes on the figure directly: plt.gca() no longer accepts
# keyword arguments such as projection= in current Matplotlib.
ax = fig.add_subplot(111, projection='3d')
# ax.scatter(boston_x[:, 0], boston_x[:, 1], boston_x[:, 2], marker='o', c=boston_y)

x_tran, x_test, y_tran, y_test = train_test_split(
    boston_x, boston_y, test_size=0.3, random_state=42)

result = []
z = np.zeros(shape=(10, 10))
test_number = len(y_test)
for i in range(1, 11):
    for j in range(1, 11):
        clf = svm.SVR(C=i / 10, epsilon=j / 10, gamma='auto').fit(x_tran, y_tran)
        # Score once and reuse it (the original scored each model twice).
        score = clf.score(x_test, y_test)
        result.append([i, j, score])
        z[i - 1, j - 1] = score   # first axis: C, second axis: epsilon
print(result)

x = np.linspace(1, 10, 10)
y = np.linspace(1, 10, 10)
# 'ij' indexing makes X vary along the first axis (C) and Y along the second
# (epsilon), matching how z is filled. The default 'xy' indexing would draw
# the surface transposed relative to the axis labels.
X, Y = np.meshgrid(x / 10, y / 10, indexing='ij')
ax.plot_surface(X, Y, z, cmap=cm.coolwarm)
ax.set_zlim(0, 1)
ax.zaxis.set_major_locator(LinearLocator(5))
ax.set_xlabel('C')
ax.set_ylabel('eplison')
ax.set_zlabel('score')
plt.title('Boston_SVR')
plt.show()

# Fit an RBF SVR for every (C, epsilon) pair and draw its prediction surface
# as filled contours over the first two columns of X.
from sklearn import svm
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X)
# NOTE(review): X_scaled is not used in the visible code; the model below is
# fitted on X_train and the plotting grid is built from X - confirm intent.
X_scaled = scaler.transform(X)
count = 1      # figure number; NOTE(review): never incremented in the visible code
mse_val = []   # not filled in the visible code
for C_var in [.001, 50, 500]:
    for e_var in [.001, 1, 10]:
        figure(num=count, figsize=(10, 8), dpi=150)
        svr = svm.SVR(kernel='rbf', C=C_var, epsilon=e_var, cache_size=2000)
        svr.fit(X_train, y_train)
        h = .02  # grid step
        # Grid bounds: range of the first two columns of X, padded by 0.5.
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        # Predict at every grid point, then reshape back to the grid.
        Z = svr.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        # NOTE(review): `cm` is defined outside this view - presumably a colormap; verify.
        plt.contourf(xx, yy, Z, 8, cmap=cm, alpha=.75)

# Candidate regressors, grouped by family.
MLA = [
    # Ensemble methods
    ensemble.AdaBoostRegressor(n_estimators=100, loss='exponential'),
    ensemble.ExtraTreesRegressor(),
    ensemble.GradientBoostingRegressor(n_estimators=100),
    ensemble.RandomForestRegressor(),

    # Generalized linear models
    linear_model.LinearRegression(),
    linear_model.SGDRegressor(),
    linear_model.Ridge(),
    linear_model.Lasso(),

    # Support vector machines
    svm.SVR(),
    svm.LinearSVR(),
    #
    ]

# In[31]:

# Notebook cell: display the first 15 entries of top_importance as an array.
np.array(top_importance[0:15])

# In[32]: