def run(self):
    """Collect the SVR settings currently shown in the form widgets.

    The kernel combo box holds human-readable names; they are translated to
    the short kernel identifiers before being stored under ``'kernel'``.

    Returns:
        A ``(params, changed)`` tuple: the full parameter dict, and whatever
        ``self.getChangedValues(params, SVR())`` returns for it.
    """
    display_to_kernel = {
        'Radial Basis Function': 'rbf',
        'Linear': 'linear',
        'Polynomial': 'poly',
        'Sigmoid': 'sigmoid',
        'Precomputed': 'precomputed',
    }
    selected_kernel = display_to_kernel[self.kernelComboBox.currentText()]

    # Keyword form keeps the same keys and the same read order as a literal.
    params = dict(
        C=self.cDoubleSpinBox.value(),
        epsilon=self.epsilonDoubleSpinBox.value(),
        kernel=selected_kernel,
        degree=self.degreeSpinBox.value(),
        gamma=self.gammaComboBox.currentText(),
        coef0=self.coeff0DoubleSpinBox.value(),
        shrinking=self.shrinkingCheckBox.isChecked(),
        tol=self.toleranceDoubleSpinBox.value(),
        cache_size=self.cacheSizeSpinBox.value(),
        verbose=self.verboseCheckBox.isChecked(),
        max_iter=int(self.maxIterationsSpinBox.value()),
    )
    changed = self.getChangedValues(params, SVR())
    return params, changed
def init_model(self):
    """Build an RBF-kernel SVR from the hyper-parameters stored on ``self``."""
    settings = {
        "kernel": "rbf",
        "C": self.c,
        "epsilon": self.eps,
        "tol": self.tol,
        "max_iter": self.max_iter,
        "gamma": self.gamma,
    }
    return SVR(**settings)
def SVR_ST(trainFileName, testFileName):
    """Fit one linear SVR per store key and predict that store's test rows.

    Both files are loaded with ``ld.LoadData_DATA_ST``; the results are
    indexed by the store keys '1'..'5'. For every record the first two
    fields are kept as identifiers and the remaining fields are converted
    to float.

    Returns:
        A list of rows ``[id0, id1, prediction, col_a, col_b]`` where the
        prediction is clipped at 0, ``col_a`` is test column -4 and
        ``col_b`` is twice test column -5, all formatted with 4 decimals.
    """
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFileName)

    res = []
    for store_id in ('1', '2', '3', '4', '5'):
        # Training rows: last numeric field is the label.
        train_rows = [[float(x) for x in rec[2:]] for rec in trainData[store_id]]
        train_y = [row[-1] for row in train_rows]
        train_X = np.matrix([row[2:-1] for row in train_rows])

        # Test rows: remember the two identifier fields of every record.
        items = []
        test_features = []
        for rec in testData[store_id]:
            items.append((rec[0], rec[1]))
            numeric = [float(x) for x in rec[2:]]
            test_features.append(numeric[2:])
        test_X = np.matrix(test_features)

        model = SVR(kernel='linear', epsilon=0.5, C=1)
        model.fit(train_X[:, -8:-3], train_y)
        pred_y = model.predict(test_X[:, -7:-2])

        res.extend([
            [
                items[k][0],
                items[k][1],
                '%.4f' % max(pred_y[k], 0),
                '%.4f' % test_X[k, -4],
                '%.4f' % (float(test_X[k, -5]) * 2),
            ]
            for k in range(len(test_X))
        ])
    return res
def connectWidgets(self):
    """Fill the SVR form (line edits and list widgets) with default values.

    A throw-away SVR instance is given a fixed set of defaults, and the
    widgets are then populated from that instance.
    """
    svr = SVR()
    defaults = (
        ('kernel', 'rbf'),
        ('degree', 3),
        ('gamma', 'auto'),
        ('coef0', 0.0),
        ('tol', 1e-3),
        ('C', 1.0),
        ('epsilon', 0.1),
        ('shrinking', True),
        ('cache_size', 200),
        ('verbose', False),
        ('max_iter', -1),
    )
    for attribute, value in defaults:
        setattr(svr, attribute, value)

    def select_exact(list_widget, label):
        # Make the first item whose text equals `label` the current one.
        matches = list_widget.findItems(label, QtCore.Qt.MatchExactly)
        list_widget.setCurrentItem(matches[0])

    self.cLineEdit.setText(str(svr.C))
    self.epsilonLineEdit.setText(str(svr.epsilon))
    select_exact(self.kernel_list, 'Radial Basis Function')
    self.degreeLineEdit.setText(str(svr.degree))
    self.coeff0LineEdit.setText(str(svr.coef0))
    select_exact(self.shrinking_list, str(svr.shrinking))
    self.toleranceLineEdit.setText(str(svr.tol))
    self.maxIterationsLineEdit.setText(str(svr.max_iter))
def connectWidgets(self):
    """Fill the SVR form (spin boxes, combo boxes, check boxes) with defaults.

    A throw-away SVR instance is given a fixed set of defaults, and every
    widget is then set from the matching attribute of that instance.
    """
    svr = SVR()
    defaults = (
        ('kernel', 'rbf'),
        ('degree', 3),
        ('gamma', 'auto'),
        ('coef0', 0.0),
        ('tol', 1e-3),
        ('C', 1.0),
        ('epsilon', 0.1),
        ('shrinking', True),
        ('cache_size', 200),
        ('verbose', False),
        ('max_iter', -1),
    )
    for attribute, value in defaults:
        setattr(svr, attribute, value)

    # Regularisation and tube width.
    self.cDoubleSpinBox.setValue(svr.C)
    self.epsilonDoubleSpinBox.setValue(svr.epsilon)

    # Kernel and its shape parameters.
    self.defaultComboItem(self.kernelComboBox, svr.kernel)
    self.degreeSpinBox.setValue(svr.degree)
    self.defaultComboItem(self.gammaComboBox, svr.gamma)
    self.coeff0DoubleSpinBox.setValue(svr.coef0)

    # Solver options.
    self.shrinkingCheckBox.setChecked(svr.shrinking)
    self.toleranceDoubleSpinBox.setValue(svr.tol)
    self.cacheSizeSpinBox.setValue(svr.cache_size)
    self.verboseCheckBox.setChecked(svr.verbose)
    self.maxIterationsSpinBox.setValue(svr.max_iter)
def init_model(self):
    """Build a sigmoid-kernel SVR from the hyper-parameters stored on ``self``."""
    settings = {
        "kernel": "sigmoid",
        "C": self.c,
        "epsilon": self.eps,
        "tol": self.tol,
        "max_iter": self.max_iter,
        "coef0": self.coef0,
        "gamma": self.gamma,
    }
    return SVR(**settings)
def test_ml_pipeline():
    """Load a test data set, run SVM on it, and plot predictions vs. actual values.

    Side effects: changes the working directory to the user's Desktop
    folder before the scatter plot is produced.
    """
    features, labels = ReactivityDataLoader().load_mopac_learning()
    model = SVR(C=1000)

    split = train_test_split(features, labels)
    train_features, test_features, train_labels, test_labels = split
    model.fit(train_features, train_labels)

    desktop = Path.home() / 'Desktop'
    os.chdir(str(desktop))

    predicted = model.predict(test_features)
    main.plotScatterPlot(test_labels, predicted, 'predictedVsActual')
def init_model(self):
    """Build a polynomial-kernel SVR from the hyper-parameters stored on ``self``."""
    settings = {
        "kernel": "poly",
        "degree": self.degree,
        "C": self.c,
        "epsilon": self.eps,
        "tol": self.tol,
        "max_iter": self.max_iter,
        "coef0": self.coef0,
        "gamma": self.gamma,
    }
    return SVR(**settings)
def run(self):
    """Collect the SVR settings currently shown in the form widgets.

    The kernel and gamma values are taken verbatim from the combo boxes'
    current text.

    Returns:
        A ``(params, changed)`` tuple: the full parameter dict, and whatever
        ``self.getChangedValues(params, SVR())`` returns for it.
    """
    # Keyword form keeps the same keys and the same read order as a literal.
    params = dict(
        C=self.cDoubleSpinBox.value(),
        epsilon=self.epsilonDoubleSpinBox.value(),
        kernel=self.kernelComboBox.currentText(),
        degree=self.degreeSpinBox.value(),
        gamma=self.gammaComboBox.currentText(),
        coef0=self.coeff0DoubleSpinBox.value(),
        shrinking=self.shrinkingCheckBox.isChecked(),
        tol=self.toleranceDoubleSpinBox.value(),
        cache_size=self.cacheSizeSpinBox.value(),
        verbose=self.verboseCheckBox.isChecked(),
        max_iter=int(self.maxIterationsSpinBox.value()),
    )
    changed = self.getChangedValues(params, SVR())
    return params, changed
def SVR_ALL(trainFileName, testFileName):
    """Fit a single linear SVR on the training file and predict the test file.

    The model is fitted on training columns ``-8:-3`` and applied to test
    columns ``-7:-2``.

    Returns:
        A list of rows ``[item, 'all', prediction, col_a, col_b]`` where the
        prediction is clipped at 0, ``col_a`` is test column -4 and
        ``col_b`` is twice test column -5, all formatted with 4 decimals.
    """
    raw_train, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    raw_test, items = ld.LoadData_DATA_ITEM(testFileName)
    train_X = np.matrix(raw_train)
    test_X = np.matrix(raw_test)

    model = SVR(kernel='linear', epsilon=0.5, C=1)
    model.fit(train_X[:, -8:-3], train_y)
    pred_y = model.predict(test_X[:, -7:-2])

    return [
        [
            items[k],
            'all',
            '%.4f' % max(pred_y[k], 0),
            '%.4f' % test_X[k, -4],
            '%.4f' % (float(test_X[k, -5]) * 2),
        ]
        for k in range(len(test_X))
    ]
def SVR_ALL_train():
    """Fit a linear SVR on the EVAL data set and predict the VALIDATION set.

    Both files are read with ``ld.loadData_all``; the model uses columns
    ``-8:-1`` of each matrix.

    Returns:
        A list of rows ``[item, 'all', prediction, col_a, col_b]`` where the
        prediction is clipped at 0, ``col_a`` is validation column -4 and
        ``col_b`` is twice validation column -5, formatted with 2 decimals.
    """
    raw_train, train_y, _ = ld.loadData_all('./data/EVAL_DataSet1.csv')
    # The validation labels are loaded but not used below.
    raw_test, _unused_labels, items = ld.loadData_all('./data/VALIDATION_DataSet1.csv')
    train_X = np.matrix(raw_train)
    test_X = np.matrix(raw_test)

    model = SVR(kernel='linear', epsilon=0.5, C=1)
    model.fit(train_X[:, -8:-1], train_y)
    pred_y = model.predict(test_X[:, -8:-1])

    return [
        [
            items[k],
            'all',
            '%.2f' % max(pred_y[k], 0),
            '%.2f' % test_X[k, -4],
            '%.2f' % (float(test_X[k, -5]) * 2),
        ]
        for k in range(len(test_X))
    ]
def evalOne(parameters):
    """Evaluate one bagged-SVR configuration across every location.

    For each entry of the module-level ``locations`` the data is split with
    ``splitDataForXValidation``, features and targets are standardised, a
    ``BaggingRegressor`` of RBF SVRs is fitted, and the predictions are
    mapped back to the original target scale.

    Args:
        parameters: mapping providing ``"C"``, ``"max_samples"`` and
            ``"n_estimators"``.

    Returns:
        Element ``[1]`` of ``rmseEval(observations, predictions)`` over the
        pooled results of all locations.
    """
    observed = []
    predicted = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")

        x_scaler = StandardScaler()
        trainX = x_scaler.fit_transform(trainX)
        testX = x_scaler.transform(testX)

        y_scaler = StandardScaler()
        trainY = y_scaler.fit_transform(trainY)
        testY = y_scaler.transform(testY)

        base_svr = SVR(kernel='rbf', C=parameters["C"], cache_size=5000)
        bagger = BaggingRegressor(
            base_estimator=base_svr,
            max_samples=parameters["max_samples"],
            n_estimators=parameters["n_estimators"],
            verbose=0,
            n_jobs=-1)
        bagger.fit(trainX, trainY)

        # Undo the target scaling for both predictions and observations.
        fold_prediction = y_scaler.inverse_transform(bagger.predict(testX))
        fold_observed = y_scaler.inverse_transform(testY)

        observed.extend(fold_observed)
        predicted.extend(fold_prediction)

    scores = rmseEval(observed, predicted)
    return scores[1]
def individual_training_executor(self, dim):
    """Train a scaler + autoencoder pipeline and score an SVR on its output.

    Args:
        dim: size of the last hidden layer of the autoencoder; also passed
            to ``self.get_path`` to obtain the autoencoder's log path.

    Side effects: draws a predicted-vs-actual scatter plot through
    ``main.plotScatterPlot`` and writes ``actualThenPredicted.txt``.
    """
    # Preprocessing followed by the autoencoder.
    encoder_pipeline = make_pipeline(
        MinMaxScaler(feature_range=(-0.5, 0.5)),
        Autoencoder(logPath=self.get_path(dim), hiddenDims=[50, dim], beta=0.1),
    )

    # Fit the pipeline on the reactivity data.
    data, targets = self.read_mopac_reactivity_data()
    encoder_pipeline.fit(data, targets)

    # Cross-validated SVR predictions on the transformed data.
    latent = encoder_pipeline.transform(data)
    svr = SVR(C=10000)
    folds = KFold(n_splits=5, shuffle=True, random_state=40)
    predictions = cross_val_predict(svr, latent, targets, cv=folds)

    # Plot cross-validated predictions against the actual targets.
    main.plotScatterPlot(targets, predictions, 'predictedVsActual')

    # Persist the actual targets (first row) and predictions (second row).
    np.savetxt('actualThenPredicted.txt', np.array([targets, predictions]))
def SVR_ST_train():
    """Fit one linear SVR per store on the EVAL set and predict VALIDATION.

    Both files are read with ``ld.loadData_ST``; the results are indexed by
    the store keys '1'..'5'. For every record the first two fields are kept
    as identifiers and the remaining fields are converted to float, with
    the last one treated as the label.

    Returns:
        A list of rows ``[id0, id1, prediction, col_a, col_b]`` where every
        number is clipped at 0 and formatted with 2 decimals; ``col_a`` is
        validation column -4 and ``col_b`` is twice validation column -5.
    """
    trainData = ld.loadData_ST('./data/EVAL_DataSetST1.csv')
    testData = ld.loadData_ST('./data/VALIDATION_DataSetST1.csv')

    res = []
    for store_id in ('1', '2', '3', '4', '5'):
        train_rows = [[float(x) for x in rec[2:]] for rec in trainData[store_id]]
        train_y = [row[-1] for row in train_rows]
        train_X = np.matrix([row[2:-1] for row in train_rows])

        items = []
        test_features = []
        test_labels = []  # collected like the original; not used below
        for rec in testData[store_id]:
            items.append((rec[0], rec[1]))
            numeric = [float(x) for x in rec[2:]]
            test_features.append(numeric[2:-1])
            test_labels.append(numeric[-1])
        test_X = np.matrix(test_features)

        model = SVR(kernel='linear', epsilon=0.5, C=1)
        model.fit(train_X[:, -8:-1], train_y)
        pred_y = model.predict(test_X[:, -8:-1])

        res.extend([
            [
                items[k][0],
                items[k][1],
                '%.2f' % max(pred_y[k], 0),
                '%.2f' % max(test_X[k, -4], 0),
                '%.2f' % max(2 * test_X[k, -5], 0),
            ]
            for k in range(len(test_X))
        ])
    return res
ARDRegression(), # HuberRegressor(), # epsilon: greater than 1.0, default 1.35 LinearRegression(n_jobs=5), PassiveAggressiveRegressor( random_state=randomstate), # C: 0.25, 0.5, 1, 5, 10 SGDRegressor(random_state=randomstate), TheilSenRegressor(n_jobs=5, random_state=randomstate), RANSACRegressor(random_state=randomstate), KNeighborsRegressor( weights='distance'), # n_neighbors: 3, 6, 9, 12, 15, 20 RadiusNeighborsRegressor(weights='distance'), # radius: 1, 2, 5, 10, 15 MLPRegressor(max_iter=10000000, random_state=randomstate), DecisionTreeRegressor( random_state=randomstate), # max_depth = 2, 3, 4, 6, 8 ExtraTreeRegressor(random_state=randomstate), # max_depth = 2, 3, 4, 6, 8 SVR() # C: 0.25, 0.5, 1, 5, 10 ] selectors = [ reliefF.reliefF, fisher_score.fisher_score, # chi_square.chi_square, JMI.jmi, CIFE.cife, DISR.disr, MIM.mim, CMIM.cmim, ICAP.icap, MRMR.mrmr, MIFS.mifs ]
l1_ratio=0.25, fit_intercept=True), 'complexity_label': 'non-zero coefficients', 'complexity_computer': lambda clf: np.count_nonzero(clf.coef_) }, { 'name': 'RandomForest', 'instance': RandomForestRegressor(), 'complexity_label': 'estimators', 'complexity_computer': lambda clf: clf.n_estimators }, { 'name': 'SVR', 'instance': SVR(kernel='rbf'), 'complexity_label': 'support vectors', 'complexity_computer': lambda clf: len(clf.support_vectors_) }, ] } benchmark(configuration) # benchmark n_features influence on prediction speed percentile = 90 percentiles = n_feature_influence({'ridge': Ridge()}, configuration['n_train'], configuration['n_test'], [100, 250, 500], percentile) plot_n_features_influence(percentiles, percentile) # benchmark throughput
def __sv_regressor__(self, data, target):
    """Fit an RBF-kernel SVR on ``data``/``target`` and store it on ``self.ensemble``.

    Args:
        data: training samples passed to ``SVR.fit``.
        target: training targets passed to ``SVR.fit``.
    """
    # Import from the public package: ``sklearn.svm.classes`` was a private
    # module path that newer scikit-learn releases no longer provide, while
    # ``sklearn.svm.SVR`` is available in old and new releases alike.
    from sklearn.svm import SVR

    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    svr_rbf.fit(data, target)
    self.ensemble = svr_rbf
classifier = DecisionTreeClassifier(max_depth=tree_depth) if alg == 1: classifier = RandomForestClassifier(n_estimators=random_forest_size, random_state=seed, n_jobs=10) if alg == 2: classifier = create_ensemble(seed) if alg == 3: classifier = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=boosting_size, random_state=seed) if alg == 4: scaler = StandardScaler() svr = SVR(kernel='rbf', cache_size=4000, C=1e3, gamma=0.0001, max_iter=200000, epsilon=0.0001) classifier = Pipeline([('standardize', scaler), ('svr', svr)]) if alg == 5: classifier = GaussianNB() if classifier == "not_init": print("Classifier not init, exit") exit(-1) if debug: print("TRAINING MODEL...") classifier.fit(training_x_no_missing, training_y)
'sample_slicer__band': [[c] for c in np.unique(ds.sa.band)], 'target_trans__target':["age"], 'estimator__clf__C': [1], 'cv__n_splits': [50], 'analysis__radius':[9.], } _default_config = { 'prepro':['sample_slicer', 'feature_norm', 'target_trans'], 'sample_slicer__band': ['alpha'], 'sample_slicer__condition' : ['vipassana'], 'target_trans__target':"expertise_hours", 'estimator': [('clf', SVR(C=1, kernel='linear'))], 'estimator__clf__C':1, 'estimator__clf__kernel':'linear', 'cv': ShuffleSplit, 'cv__n_splits': 50, 'cv__test_size': 0.25, 'scores' : ['neg_mean_squared_error','r2'], 'analysis': SearchLight, 'analysis__n_jobs': 15, 'analysis__permutation':100, 'kwargs__cv_attr': 'subject', 'analysis__verbose':0,
def set_learning_method(config, X_train, y_train):
    """
    Instantiates the sklearn's class corresponding to the value set in the
    configuration file for running the learning method.

    The "learning" section of the configuration is read for:
      - "method": one of "SVR", "SVC", "LassoCV", "LassoLars", "LassoLarsCV"
      - "parameters": keyword values for the estimator's constructor
      - "optimize": settings forwarded to set_optimization_params() and
        optimize_model() (used by SVR, SVC and LassoLars only)
      - "scorer": scorer names, defaulting to ['mae', 'rmse']

    For the methods that support it, "optimize" takes precedence over
    "parameters"; when neither is given the estimator is built with its
    default arguments.

    TODO: use reflection to instantiate the classes

    @param config: configuration object
    @param X_train: training samples, only used when optimizing
    @param y_train: training targets, only used when optimizing
    @return: a tuple (estimator, scorers). The estimator is None when the
             method name is not recognised; both are None when the
             configuration has no "learning" section.
    """
    estimator = None
    # Defined up front so the final return cannot hit an unbound local when
    # the "learning" section is missing.
    scorers = None

    learning_cfg = config.get("learning", None)
    if learning_cfg:
        p = learning_cfg.get("parameters", None)
        o = learning_cfg.get("optimize", None)
        scorers = \
            set_scorer_functions(learning_cfg.get("scorer", ['mae', 'rmse']))

        method_name = learning_cfg.get("method", None)
        if method_name == "SVR":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(SVR(), X_train, y_train,
                                           tune_params,
                                           scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            elif p:
                estimator = SVR(C=p.get("C", 10),
                                epsilon=p.get('epsilon', 0.01),
                                kernel=p.get('kernel', 'rbf'),
                                degree=p.get('degree', 3),
                                gamma=p.get('gamma', 0.0034),
                                tol=p.get('tol', 1e-3),
                                verbose=False)
            else:
                estimator = SVR()

        elif method_name == "SVC":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(SVC(), X_train, y_train,
                                           tune_params,
                                           scorers,
                                           o.get('cv', 5),
                                           o.get('verbose', True),
                                           o.get('n_jobs', 1))
            elif p:
                estimator = SVC(C=p.get('C', 1.0),
                                kernel=p.get('kernel', 'rbf'),
                                degree=p.get('degree', 3),
                                gamma=p.get('gamma', 0.0),
                                coef0=p.get('coef0', 0.0),
                                tol=p.get('tol', 1e-3),
                                verbose=p.get('verbose', False))
            else:
                estimator = SVC()

        elif method_name == "LassoCV":
            if p:
                estimator = LassoCV(eps=p.get('eps', 1e-3),
                                    n_alphas=p.get('n_alphas', 100),
                                    normalize=p.get('normalize', False),
                                    precompute=p.get('precompute', 'auto'),
                                    max_iter=p.get('max_iter', 1000),
                                    tol=p.get('tol', 1e-4),
                                    cv=p.get('cv', 10),
                                    verbose=False)
            else:
                estimator = LassoCV()

        elif method_name == "LassoLars":
            if o:
                tune_params = set_optimization_params(o)
                estimator = optimize_model(LassoLars(), X_train, y_train,
                                           tune_params,
                                           scorers,
                                           o.get("cv", 5),
                                           o.get("verbose", True),
                                           o.get("n_jobs", 1))
            # ``elif`` (not a second ``if``) so that an optimised estimator
            # is not overwritten by the branches below, matching the SVR
            # and SVC cases.
            elif p:
                estimator = LassoLars(alpha=p.get('alpha', 1.0),
                                      fit_intercept=p.get(
                                          'fit_intercept', True),
                                      verbose=p.get('verbose', False),
                                      normalize=p.get('normalize', True),
                                      max_iter=p.get('max_iter', 500),
                                      fit_path=p.get('fit_path', True))
            else:
                estimator = LassoLars()

        elif method_name == "LassoLarsCV":
            if p:
                estimator = LassoLarsCV(max_iter=p.get('max_iter', 500),
                                        normalize=p.get('normalize', True),
                                        max_n_alphas=p.get(
                                            'max_n_alphas', 1000),
                                        n_jobs=p.get('n_jobs', 1),
                                        cv=p.get('cv', 10),
                                        verbose=False)
            else:
                estimator = LassoLarsCV()

    return estimator, scorers
# Fit an RBF SVR to 24 hourly pollution readings, predict on a 0.1-step
# grid over the same range, print the predictions and plot them.
from ex30.ex30_lib_graph import plot2
# Public import path: ``sklearn.svm.classes`` was a private module that
# newer scikit-learn releases no longer provide.
from sklearn.svm import SVR

OUTPUT_PNG_FILE = '/experiments/ex30/ex30_svr.png'

# One sample per hour (0..23), each with a single feature.
X = [[float(x)] for x in range(0, 24)]
Y = [
    12.0, 13.0, 13.0, 13.0, 28.0, 31.0, 38.0, 60.0, 85.0, 80.0, 64.0, 60.0,
    59.0, 58.0, 65.0, 70.0, 80.0, 90.0, 110.0, 100.0, 85.0, 65.0, 45.0, 20.0
]

# Prediction grid: 0.0, 0.1, ..., 23.0 (231 points).
X2 = [[float(x) / 10.0] for x in range(0, 231)]

model = SVR(kernel='rbf', C=10)
model.fit(X, Y)
Y_pred = model.predict(X2)
print(str(Y_pred))

plot2(Y, Y_pred, OUTPUT_PNG_FILE, "Observed pollution concentration levels",
      "Predicted pollution concentration levels by SVR")
x_train = x_train.drop(['segment_id'], axis=1) y_test = test_set['time_to_failure'] x_test_seg = test_set['segment_id'] x_test = test_set.drop(['time_to_failure'], axis=1) x_test = x_test.drop(['segment_id'], axis=1) # prepare models models = [] # models.append(('LR', LogisticRegression())) # models.append(('LDA', LinearDiscriminantAnalysis())) # models.append(('KNN', KNeighborsClassifier())) # models.append(('CART', DecisionTreeClassifier())) # models.append(('NB', GaussianNB())) svReg = SVR(C=20.299419990722537, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.06841395086207253, kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=True); randForReg = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=100, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=5, min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False) models.append(('LassoReg', Lasso(alpha=0.1))) models.append(('SVM', svReg)) models.append(('LinearReg', LinearRegression())) models.append(('randForest', randForReg)) mas = make_scorer(mean_absolute_error, greater_is_better=False);
task='meg') ds = loader.fetch() # Preprocessing pipeline = PreprocessingPipeline(nodes=[ SampleSlicer({ 'band': ['alpha'], 'condition': ['vipassana'] }), FeatureWiseNormalizer(), TargetTransformer("expertise_hours") ]) ds_ = pipeline.transform(ds) # Estimator estimator_pp = Pipeline(steps=[('svr', SVR(C=1, kernel='linear'))]) cross_validation = GroupShuffleSplit(n_splits=10, test_size=0.25) scores = ['r2', 'explained_variance'] cv_attr = 'subject' sl = SearchLight(estimator=estimator_pp, scoring=scores, cv=cross_validation) sl.fit(ds_, cv_attr=cv_attr) #### Cross Validation ### cross_validation = GroupShuffleSplit(n_splits=150, test_size=0.25) groups = LabelEncoder().fit_transform(ds_.sa.subject) X = ds_.samples y = LabelEncoder().fit_transform(ds_.targets) train_list = [] for train, test in cross_validation.split(X, y, groups=groups):
K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)

# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100,
                                              learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()


def getClassifierMap():
    """Return the mapping from classifier name to its module-level instance.

    The values are the shared module-level estimator objects, not copies.
    """
    CLASSIFIER_MAP = {
        "DECISION_TREE": DECISION_TREE,
        "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
        "NAIVE_BAYS": NAIVE_BAYS,
        "K_N_N": K_N_N,
        "SUPPORT_VECTOR": SUPPORT_VECTOR,
        "RANDOM_FOREST": RANDOM_FOREST,
        "GRADIENT_BOOST": GRADIENT_BOOST_CL,
        # Previously mapped to GRADIENT_BOOST_CL, so requesting "ADA_BOOST"
        # silently returned the gradient-boosting classifier.
        "ADA_BOOST": ADA_BOOST,
        "EXTRA_TREE": EXTRA_TREE
    }
    return CLASSIFIER_MAP
print X_train.shape print y_train.shape print X_test.shape print y_test.shape print X_train[123, :] ''' norm1 = np.linalg.norm(y_train) if norm1 != 0: y_train, y_test = y_train/norm1, y_test/norm1 print norm1 ''' print y_train.shape model = SVR(C=1.0, gamma=1.0) model = LinearRegression() lasso = Lasso(alpha=0.1).fit(X_train, y_train) enet = ElasticNet(alpha=0.1, l1_ratio=0.7).fit(X_train, y_train) y_pred = lasso.predict(X_test) print "MSE", mean_squared_error(y_test, y_pred) m = np.mean(y_test) print "MSE (Mean)", mean_squared_error(y_test, m * np.ones(len(y_test))) print "r^2 on test data", r2_score(y_test, y_pred) plt.plot(enet.coef_, label='Elastic net coefficients') plt.plot(lasso.coef_, label='Lasso coefficients')
output = open(OUTPUT_DATA_FILE, 'w') output.write("location,observation,prediction\n") for location in locations: print(str(location)) trainX, testX, trainY, testY = splitDataForXValidation( location, "location", data, all_features, "target") normalizer_X = StandardScaler() trainX = normalizer_X.fit_transform(trainX) testX = normalizer_X.transform(testX) normalizer_Y = StandardScaler() trainY = normalizer_Y.fit_transform(trainY) testY = normalizer_Y.transform(testY) model = BaggingRegressor(base_estimator=SVR(kernel='rbf', C=40, cache_size=5000), max_samples=4200, n_estimators=10, verbose=0, n_jobs=-1) model.fit(trainX, trainY) prediction = model.predict(testX) prediction = normalizer_Y.inverse_transform(prediction) testY = normalizer_Y.inverse_transform(testY) for i in range(0, len(testY)): output.write(str(location)) output.write(",") output.write(str(testY[i])) output.write(",")
'RadiusNeighborsClassifier':RadiusNeighborsClassifier(), 'RadiusNeighborsRegressor':RadiusNeighborsRegressor(), 'RandomForestClassifier':RandomForestClassifier(), 'RandomForestRegressor':RandomForestRegressor(), 'RandomizedLasso':RandomizedLasso(), 'RandomizedLogisticRegression':RandomizedLogisticRegression(), 'RandomizedPCA':RandomizedPCA(), 'Ridge':Ridge(), 'RidgeCV':RidgeCV(), 'RidgeClassifier':RidgeClassifier(), 'RidgeClassifierCV':RidgeClassifierCV(), 'RobustScaler':RobustScaler(), 'SGDClassifier':SGDClassifier(), 'SGDRegressor':SGDRegressor(), 'SVC':SVC(), 'SVR':SVR(), 'SelectFdr':SelectFdr(), 'SelectFpr':SelectFpr(), 'SelectFwe':SelectFwe(), 'SelectKBest':SelectKBest(), 'SelectPercentile':SelectPercentile(), 'ShrunkCovariance':ShrunkCovariance(), 'SkewedChi2Sampler':SkewedChi2Sampler(), 'SparsePCA':SparsePCA(), 'SparseRandomProjection':SparseRandomProjection(), 'SpectralBiclustering':SpectralBiclustering(), 'SpectralClustering':SpectralClustering(), 'SpectralCoclustering':SpectralCoclustering(), 'SpectralEmbedding':SpectralEmbedding(), 'StandardScaler':StandardScaler(), 'TSNE':TSNE(),
def train(driverSpeed, sectionSpeed, newData, firstTime, n, minLon, lonLen,
          minLat, latLen, defaultVel):
    '''Return an SVR trained to map
    [section average speed, driver average speed, occupancy flag]
    to instantaneous speed.

    Each file in ``newData`` is a header-less CSV with the columns
    taxiId, lat, lon, busy, time, vel, sec. A training sample is produced
    for every row that has the same taxi id and the same hour as the row
    before it and whose ``vel`` is not NaN: the target is that row's
    ``vel`` and the features are looked up with the previous row's
    section, taxi id, hour and busy flag.

    Args:
        driverSpeed: indexed as ``driverSpeed[taxiId - 1][hour offset]``.
        sectionSpeed: indexed as ``sectionSpeed[sectionId][hour offset]``.
        newData: iterable of CSV files readable by ``pandas.read_csv``.
        firstTime: datetime whose hour is the origin of the hour offset.
        n, minLon, lonLen, minLat, latLen: not used by this function.
        defaultVel: value substituted when a looked-up average is NaN.

    Returns:
        The fitted SVR.
    '''
    X = []
    Y = []
    for file in newData:
        df = pandas.read_csv(
            file,
            header=None,
            names=["taxiId", "lat", "lon", "busy", "time", "vel", "sec"],
            dtype={
                "taxiId": numpy.int16,
                "lat": numpy.float32,
                "lon": numpy.float32,
                "busy": numpy.int8,
                # ``numpy.str`` was only an alias of the builtin ``str``
                # and has been removed from NumPy; the builtin is the
                # equivalent that works on every version.
                "time": str,
                "vel": numpy.float32,
                "sec": numpy.int16
            })

        # State of the previous row; reset for every file.
        taxiId1 = -1
        sectionId1 = 0
        busy1 = 0
        time1 = firstTime
        for row in df.itertuples(index=False):
            taxiId2 = row[0]
            busy2 = row[3]
            time2 = datetime.datetime.strptime(row[4], "%Y/%m/%d %H:%M:%S")
            v = row[5]
            sectionId2 = row[6]
            if taxiId1 == taxiId2 and time1.hour == time2.hour and not numpy.isnan(
                    v):
                # Target: instantaneous speed (the original note calls it
                # the speed "of the previous point"; the value comes from
                # the current row's vel column).
                Y.append(v)

                hour_offset = time1.hour - firstTime.hour
                x = []

                # Average speed of the road section
                section_v = sectionSpeed[sectionId1][hour_offset]
                x.append(defaultVel if numpy.isnan(section_v) else section_v)

                # Average speed of this driver
                driver_v = driverSpeed[taxiId1 - 1][hour_offset]
                x.append(defaultVel if numpy.isnan(driver_v) else driver_v)

                # Whether the taxi is carrying a passenger
                x.append(busy1)
                X.append(x)

            taxiId1 = taxiId2
            busy1 = busy2
            time1 = time2
            sectionId1 = sectionId2

    clf = SVR(C=1.0,
              cache_size=200,
              coef0=0.0,
              degree=3,
              epsilon=0.2,
              gamma='auto',
              kernel='rbf',
              max_iter=-1,
              shrinking=True,
              tol=0.001,
              verbose=False)
    clf.fit(X, Y)
    return clf
def moudle_select(X, test_A, y, moudelselect, threshold=False, Rate=False):
    '''
    Train the selected regressor on 5 random splits and blend its predictions.

    For each of 5 random train/test splits of (X, y) the chosen model is
    fitted, its test MSE is printed, and its prediction for ``test_A`` is
    stored as one column of a 100 x 10 frame (so ``test_A`` must yield 100
    predictions). The kept predictions are then combined into a weighted
    sum in which the fold with the lowest MSE receives the largest weight.
    The blended prediction and the per-fold predictions are written to two
    Excel files under ``H:/java/python/src/machinelearning/test/``.

    X : train data
    test_A : data to predict
    y : training labels
    moudelselect : number of the model to use (see table below)
    threshold : when False every fold is kept; otherwise only folds whose
                test MSE is <= 0.03 are kept
    Rate : not used

    moudelselect :
        1,XGBRegressor
        2,ensemble.RandomForestRegressor
        3,linear_model.Lasso
        4,LinearRegression
        5,linear_model.BayesianRidge
        6,DecisionTreeRegressor
        7,ensemble.RandomForestRegressor
        8,ensemble.GradientBoostingRegressor
        9,ensemble.AdaBoostRegressor
        10,BaggingRegressor
        11,ExtraTreeRegressor
        12,SVR
        13,MLPRegressor
        other:MLPRegressor

    Returns nothing; results are printed and written to disk.
    '''
    mse = []
    # Column of predict_A that belongs to each entry of ``mse``. Needed
    # because folds rejected by the threshold leave gaps, so a position in
    # ``mse`` is not necessarily the fold's column number.
    kept_columns = []
    sum_mse = 0.0
    predict_A = pd.DataFrame(np.zeros((100, 10)))
    for index in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        if (moudelselect == 1):
            # NOTE(review): the configured regressor is passed as a
            # ``model=`` keyword to an outer XGBRegressor, and
            # ``min_child_weigh`` looks like a typo of ``min_child_weight``
            # - confirm whether these settings take effect as intended.
            model = xgb.XGBRegressor(
                model=xgb.XGBRegressor(max_depth=17,
                                       min_child_weigh=5,
                                       eta=0.025,
                                       gamma=0.06,
                                       subsample=1,
                                       learning_rate=0.1,
                                       n_estimators=100,
                                       silent=0,
                                       n_jobs=-1,
                                       objective='reg:linear'))
        elif (moudelselect == 2):
            model = ensemble.RandomForestRegressor(
                n_estimators=25,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features=0.95,
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 3):
            model = linear_model.Lasso(alpha=0.1,
                                       max_iter=1000,
                                       normalize=False)
        elif (moudelselect == 4):
            model = LinearRegression(fit_intercept=False,
                                     n_jobs=1,
                                     normalize=False)
        elif (moudelselect == 5):
            model = linear_model.BayesianRidge(alpha_1=1e-06,
                                               alpha_2=1e-06,
                                               compute_score=False,
                                               copy_X=True,
                                               fit_intercept=True,
                                               lambda_1=1e-06,
                                               lambda_2=1e-06,
                                               n_iter=500,
                                               normalize=False,
                                               tol=10,
                                               verbose=False)
        elif (moudelselect == 6):
            model = DecisionTreeRegressor(criterion='mse',
                                          splitter='best',
                                          max_depth=3,
                                          min_samples_split=0.1,
                                          min_samples_leaf=0.1,
                                          min_weight_fraction_leaf=0.1,
                                          max_features=None,
                                          random_state=None,
                                          max_leaf_nodes=None,
                                          presort=False)
        elif (moudelselect == 7):
            model = ensemble.RandomForestRegressor(
                n_estimators=1000,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features='auto',
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 8):
            model = ensemble.GradientBoostingRegressor(n_estimators=800,
                                                       learning_rate=0.1,
                                                       max_depth=4,
                                                       random_state=0,
                                                       loss='ls')
        elif (moudelselect == 9):
            model = ensemble.AdaBoostRegressor(base_estimator=None,
                                               n_estimators=120,
                                               learning_rate=1,
                                               loss='linear',
                                               random_state=None)
        elif (moudelselect == 10):
            model = BaggingRegressor(base_estimator=None,
                                     n_estimators=500,
                                     max_samples=1.0,
                                     max_features=1.0,
                                     bootstrap=True)
        elif (moudelselect == 11):
            model = ExtraTreeRegressor(criterion='mse',
                                       splitter='random',
                                       max_depth=3,
                                       min_samples_split=0.1,
                                       min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.01,
                                       max_features='auto',
                                       random_state=None,
                                       max_leaf_nodes=None,
                                       min_impurity_split=1e-07)
        elif (moudelselect == 12):
            model = SVR(kernel='rbf',
                        degree=3,
                        gamma='auto',
                        coef0=0.1,
                        tol=0.001,
                        C=1,
                        epsilon=0.1,
                        shrinking=True,
                        cache_size=200,
                        verbose=False,
                        max_iter=-1)
        elif (moudelselect == 13):
            model = MLPRegressor(hidden_layer_sizes=(100, ),
                                 activation='relu',
                                 solver='adam',
                                 alpha=0.0001,
                                 batch_size='auto',
                                 learning_rate='constant',
                                 learning_rate_init=0.001,
                                 power_t=0.5,
                                 max_iter=200,
                                 shuffle=True,
                                 random_state=None,
                                 tol=0.0001,
                                 verbose=False,
                                 warm_start=False,
                                 momentum=0.9,
                                 nesterovs_momentum=True,
                                 early_stopping=False,
                                 validation_fraction=0.1,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 epsilon=1e-08)
        else:
            model = MLPRegressor(activation='relu',
                                 alpha=0.001,
                                 solver='lbfgs',
                                 max_iter=90,
                                 hidden_layer_sizes=(11, 11, 11),
                                 random_state=1)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        fold_mse = mean_squared_error(y_test, y_pred)
        print("index: ", index, fold_mse)
        sum_mse += fold_mse

        # Keep the fold when no threshold is requested, or when its test
        # MSE is good enough.
        if (threshold == False) or (fold_mse <= 0.03000):
            y_predict = model.predict(test_A)
            # ``.iloc`` replaces ``.ix``, which pandas removed in 1.0; the
            # columns are the default 0..9 so position and label coincide.
            predict_A.iloc[:, index] = y_predict
            mse.append(fold_mse)
            kept_columns.append(index)

    y = 0.0
    mse = mse / np.sum(mse)
    mse = pd.Series(mse)
    # Weights in descending order, re-indexed 0..k-1 ...
    mse_rate_asc = mse.sort_values(ascending=False)
    mse_rate_asc = mse_rate_asc.reset_index(drop=True)
    # ... paired with the kept folds ordered from lowest to highest MSE, so
    # the best fold gets the largest weight.
    mse_rate_desc = mse.sort_values(ascending=True)
    indexs = list(mse_rate_desc.index)
    for index in range(len(mse)):
        y += mse_rate_asc.iloc[index] * predict_A.iloc[:, kept_columns[
            indexs[index]]]

    print("y_predict_mean: ", y.mean())
    print("y_predict_var: ", y.var())
    y = pd.DataFrame(y)
    y.to_excel("H:/java/python/src/machinelearning/test/predict.xlsx",
               index=False)
    predict_A.to_excel(
        "H:/java/python/src/machinelearning/test/predict_testA.xlsx",
        index=False)
    print("Averge mse:", sum_mse / len(mse))