def _random_search(self, random_iter, x, y, kernel_cache_size):
    """Pick NuSVR hyperparameters by random search on a hold-out split.

    The default configuration (C=1.0, gamma=0.0, nu=0.5) is always the first
    candidate; ``random_iter - 1`` further candidates are drawn with
    ``ParameterSampler``. Every candidate is fitted on one half of a fresh
    ``train_test_split`` and scored on the other half with ``NuSVR.score``.
    One progress character per candidate is written to stdout: ``<`` for a
    new best score, ``#`` for every tenth candidate, ``.`` otherwise.

    Args:
        random_iter: Number of candidates to evaluate. When it is not
            positive, no model is trained and the defaults are returned.
        x: Feature matrix handed to ``train_test_split``.
        y: Targets handed to ``train_test_split``.
        kernel_cache_size: Forwarded to ``NuSVR`` as ``cache_size``.

    Returns:
        Tuple ``(nu, c, gamma)`` of the best-scoring candidate. Note the
        order: ``nu`` comes first.
    """
    # Default values; also the first candidate of the search.
    c = 1.0
    gamma = 0.0
    nu = 0.5
    # sys.maxint does not exist on Python 3; -inf works on Python 2 and 3
    # and is beaten by every real score.
    best_score = float("-inf")

    if random_iter > 0:
        sys.stdout.write("Do a random search %d times" % random_iter)
        param_dist = {
            "C": numpy.power(2.0, range(-5, 16)),
            "gamma": numpy.power(2.0, range(-15, 4)),
            "nu": uniform(loc=0.0001, scale=1 - 0.0001),
        }
        param_list = [{"C": c, "gamma": gamma, "nu": nu}]
        param_list.extend(
            ParameterSampler(param_dist, n_iter=random_iter - 1,
                             random_state=self._rng))

        for idx, d in enumerate(param_list):
            # NOTE(review): random_state is passed to NuSVR as in the
            # original; confirm the installed scikit-learn still accepts it.
            nusvr = NuSVR(kernel='rbf', gamma=d['gamma'], C=d['C'],
                          nu=d['nu'], random_state=self._rng,
                          cache_size=kernel_cache_size)
            # A new 50/50 split is drawn for every candidate.
            train_x, test_x, train_y, test_y = \
                train_test_split(x, y, test_size=0.5,
                                 random_state=self._rng)
            self._check_scaling(scaled_x=train_x)
            nusvr.fit(train_x, train_y)
            sc = nusvr.score(test_x, test_y)

            # Tiny output
            m = "."
            if idx % 10 == 0:
                m = "#"
            if sc > best_score:
                m = "<"
                best_score = sc
                c = d['C']
                gamma = d['gamma']
                nu = d['nu']
            sys.stdout.write(m)
            sys.stdout.flush()

    sys.stdout.write("Using C: %f, nu: %f and Gamma: %f\n" % (c, nu, gamma))
    return nu, c, gamma
def _test_diabetes_compare_with_sklearn(kernel):
    """Compare the NuSVR under test with the reference ``SklearnNuSVR``.

    Both estimators are fitted on the full diabetes dataset with the same
    hyperparameters (``nu=.25``, ``C=10.``) and then compared on score,
    intercept, support-vector shape and dual coefficients.

    Args:
        kernel: Kernel name forwarded to both estimators.
    """
    diabetes = datasets.load_diabetes()

    clf_onedal = NuSVR(kernel=kernel, nu=.25, C=10.)
    clf_onedal.fit(diabetes.data, diabetes.target)
    result = clf_onedal.score(diabetes.data, diabetes.target)

    clf_sklearn = SklearnNuSVR(kernel=kernel, nu=.25, C=10.)
    clf_sklearn.fit(diabetes.data, diabetes.target)
    expected = clf_sklearn.score(diabetes.data, diabetes.target)

    # The estimator under test may score higher, but not noticeably lower.
    assert result > expected - 1e-5
    assert_allclose(clf_sklearn.intercept_, clf_onedal.intercept_, atol=1e-4)
    # Compare the reference against the estimator under test; the original
    # compared clf_sklearn with itself, which can never fail.
    assert_allclose(clf_sklearn.support_vectors_.shape,
                    clf_onedal.support_vectors_.shape)
    assert_allclose(clf_sklearn.dual_coef_, clf_onedal.dual_coef_, atol=1e-2)
def cv_nu_SVR(X, y, K, C_test, nu_test):
    """Grid-search ``C`` and ``nu`` for NuSVR with K-fold cross-validation.

    For every ``(C, nu)`` pair the mean of ``NuSVR.score`` over the ``K``
    folds produced by ``create_cv_set`` / ``create_train_set`` is computed,
    and the pair with the highest mean is reported and returned. Ties keep
    the first pair met (outer loop over ``C_test``, inner over ``nu_test``).

    Args:
        X: Samples, forwarded to ``create_cv_set``.
        y: Targets, forwarded to ``create_cv_set``.
        K: Number of folds.
        C_test: Candidate values for ``C``.
        nu_test: Candidate values for ``nu``.

    Returns:
        Tuple ``(C_opt, nu_opt)``. ``(0, 0)`` is returned only when one of
        the candidate sequences is empty.
    """
    Xcv, Ycv = create_cv_set(X, y, K)

    # Seed with -inf rather than 0: a best mean score that is zero or
    # negative must still select a real (C, nu) pair instead of the invalid
    # C=0, nu=0.
    acc_test = -np.inf
    C_opt = 0
    nu_opt = 0

    for c in C_test:
        for nu in nu_test:
            current_acc = 0.0
            for n in range(K):
                svc = NuSVR(C=c, nu=nu)
                X_train, y_train, X_test, y_test = create_train_set(
                    Xcv, Ycv, n)
                # Train the SVM on this fold and accumulate the mean score.
                svc.fit(X_train, y_train)
                res_tmp = svc.score(X_test, y_test)
                current_acc = current_acc + res_tmp / (1.0 * K)

            # Strict comparison keeps the first best pair on ties.
            if current_acc > acc_test:
                acc_test = current_acc
                C_opt = c
                nu_opt = nu

    print("NuSVR, Parametres optimaux: C=", C_opt, " nu=", nu_opt)
    return C_opt, nu_opt
def _random_search(self, random_iter, x, y, kernel_cache_size):
    """Pick NuSVR hyperparameters by random search on a hold-out split.

    The default configuration (C=1.0, gamma=0.0, nu=0.5) is always the first
    candidate; ``random_iter - 1`` further candidates are drawn with
    ``ParameterSampler``. Every candidate is fitted on one half of a fresh
    ``train_test_split`` and scored on the other half with ``NuSVR.score``.
    One progress character per candidate is written to stdout: ``<`` for a
    new best score, ``#`` for every tenth candidate, ``.`` otherwise.

    Args:
        random_iter: Number of candidates to evaluate. When it is not
            positive, no model is trained and the defaults are returned.
        x: Feature matrix handed to ``train_test_split``.
        y: Targets handed to ``train_test_split``.
        kernel_cache_size: Forwarded to ``NuSVR`` as ``cache_size``.

    Returns:
        Tuple ``(nu, c, gamma)`` of the best-scoring candidate. Note the
        order: ``nu`` comes first.
    """
    # Default values; also the first candidate of the search.
    c = 1.0
    gamma = 0.0
    nu = 0.5
    # sys.maxint does not exist on Python 3; -inf works on Python 2 and 3
    # and is beaten by every real score.
    best_score = float("-inf")

    if random_iter > 0:
        sys.stdout.write("Do a random search %d times" % random_iter)
        param_dist = {"C": numpy.power(2.0, range(-5, 16)),
                      "gamma": numpy.power(2.0, range(-15, 4)),
                      "nu": uniform(loc=0.0001, scale=1-0.0001)}
        param_list = [{"C": c, "gamma": gamma, "nu": nu}]
        param_list.extend(ParameterSampler(param_dist,
                                           n_iter=random_iter-1,
                                           random_state=self._rng))

        for idx, d in enumerate(param_list):
            # NOTE(review): random_state is passed to NuSVR as in the
            # original; confirm the installed scikit-learn still accepts it.
            nusvr = NuSVR(kernel='rbf', gamma=d['gamma'], C=d['C'],
                          nu=d['nu'], random_state=self._rng,
                          cache_size=kernel_cache_size)
            # A new 50/50 split is drawn for every candidate.
            train_x, test_x, train_y, test_y = \
                train_test_split(x, y, test_size=0.5,
                                 random_state=self._rng)
            self._check_scaling(scaled_x=train_x)
            nusvr.fit(train_x, train_y)
            sc = nusvr.score(test_x, test_y)

            # Tiny output
            m = "."
            if idx % 10 == 0:
                m = "#"
            if sc > best_score:
                m = "<"
                best_score = sc
                c = d['C']
                gamma = d['gamma']
                nu = d['nu']
            sys.stdout.write(m)
            sys.stdout.flush()

    sys.stdout.write("Using C: %f, nu: %f and Gamma: %f\n" % (c, nu, gamma))
    return nu, c, gamma
# Keep only the first 2949 rows of the frame.
df = df.iloc[:2949, :]

import pickle
import pandas as pd

# Persist the trimmed frame and load it back.
df.to_pickle("Final_Data")
# read_pickle is a module-level pandas function, not a DataFrame method;
# the original `df.read_pickle(...)` raised AttributeError.
df = pd.read_pickle("Final_Data")

# Copy the average annual count from output_df into df, using FIPS as the
# row label.
for idx, row in output_df.iterrows():
    df.loc[row['FIPS'], 'annual_count_avg'] = row['Average Annual Count']

# Features are every column up to and including 'WATR'; the target is the
# column filled in above.
X = df.loc[:, :'WATR']
y = df['annual_count_avg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Fit several regressors on the same split and evaluate each on the held-out
# part. The bare .score(...) expressions are kept as in the original; their
# values are only visible in an interactive session.
from sklearn.svm import LinearSVR
svr = LinearSVR(random_state=0, tol=1e-5).fit(X_train, y_train)
svr.score(X_test, y_test)

# Import the estimator class directly so the fitted model bound to `svm`
# no longer overwrites an imported module of the same name.
from sklearn.svm import SVR
svm = SVR().fit(X_train, y_train)
svm.score(X_test, y_test)

from sklearn.svm import NuSVR
nuSVR = NuSVR().fit(X_train, y_train)
nuSVR.score(X_test, y_test)

from sklearn import linear_model
ridge = linear_model.Ridge(alpha=0.5).fit(X_train, y_train)
ridge.score(X_test, y_test)

# Position of the largest ridge coefficient.
np.argmax(ridge.coef_)
class AllRegressionModels:
    """
    Wrapper class around all supported regression models: LinearRegression, RandomForest, SVR, NuSVR, LinearSVR, and
    XGBRegressor.

    AllRegressionModels runs every available regression algorithm on the given dataset and outputs the coefficient
    of determination and execution time of each successful model when all_regression_models() is run.
    """

    def __init__(self, attributes=None, labels=None, test_size=0.25, verbose=False):
        """
        Initializes an AllRegressionModels object.

        The following parameters are needed to use an AllRegressionModels object:

            - attributes: a numpy array of the desired independent variables (Default is None)
            - labels: a numpy array of the desired dependent variables (Default is None)
            - test_size: the proportion of the dataset to be used for testing the model; the proportion of the
              dataset to be used for training will be the complement of test_size (Default is 0.25)
            - verbose: specifies whether or not to output any and all logging during model training
              (Default is False)

            Note: These are the only parameters allowed. All other parameters for each model will use their default
            values. For more granular control, please instantiate each model individually.

        The following instance data is created here, unfitted, and is fitted by all_regression_models():

            - linear_regression: a reference to the LinearRegression model
            - random_forest: a reference to the RandomForest model
            - SVR: a reference to the SVR model
            - nu_SVR: a reference to the NuSVR model
            - linear_SVR: a reference to the LinearSVR model
            - XGB_regressor: a reference to the XGBRegressor model

        After running all_regression_models(), the coefficient of determination and execution time for each model
        that ran successfully will be displayed in tabular form. Any models that failed to run will be listed.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        # The models receive the verbosity given at construction time.
        self.linear_regression = LinearRegression()
        self.random_forest = RandomForestRegressor(verbose=self.verbose)
        self.SVR = SVR(verbose=self.verbose)
        self.nu_SVR = NuSVR(verbose=self.verbose)
        self.linear_SVR = LinearSVR(verbose=self.verbose)
        self.XGB_regressor = XGBRegressor(verbosity=int(self.verbose))

        # Result table (the first entry is the header row) and names of the models that raised.
        self._regression_models = {"Model": ["R2 Score", "Time"]}
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        If an AllRegressionModels object is initialized without specifying attributes, attributes will be None.
        all_regression_models() cannot be called until attributes is a populated numpy array of independent
        variables; call set_attributes(new_attributes) to fix this.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        If an AllRegressionModels object is initialized without specifying labels, labels will be None.
        all_regression_models() cannot be called until labels is a populated numpy array of dependent variables;
        call set_labels(new_labels) to fix this.
        """
        return self.labels

    def get_test_size(self):
        """
        Accessor method for test_size.

        Should return a number or None.
        """
        return self.test_size

    def get_verbose(self):
        """
        Accessor method for verbose.

        Will default to False if not set by the user.
        """
        return self.verbose

    def get_all_regression_models(self):
        """
        Accessor method that returns a list of all models.

        The models are created in __init__, so the list never contains None; the models are only fitted once
        all_regression_models() has been called.
        """
        return [self.linear_regression, self.random_forest, self.SVR, self.nu_SVR, self.linear_SVR,
                self.XGB_regressor]

    def get_linear_regression(self):
        """
        Accessor method for linear_regression.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.linear_regression

    def get_random_forest(self):
        """
        Accessor method for random_forest.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.random_forest

    def get_SVR(self):
        """
        Accessor method for SVR.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.SVR

    def get_nu_SVR(self):
        """
        Accessor method for nu_SVR.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.nu_SVR

    def get_linear_SVR(self):
        """
        Accessor method for linear_SVR.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.linear_SVR

    def get_XGB_regressor(self):
        """
        Accessor method for XGB_regressor.

        The model is unfitted until all_regression_models() has been called.
        """
        return self.XGB_regressor

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of dependent variables. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False. Only the output suppression done by
        all_regression_models() is affected; the models keep the verbosity they were constructed with.
        """
        self.verbose = new_verbose

    # Regression functionality

    def all_regression_models(self):
        """
        Driver method for running all regression models with given attributes and labels.

        all_regression_models() first trains the models and determines their coefficients of determination and
        execution time via _all_regression_models_runner(). Then, all_regression_models() calls _print_results() to
        format and print each successful model's measurements, while also listing any failed models.

        If verbose is False, everything written to stdout and stderr while the models run is suppressed.
        """
        # Call helper method for running all regression models; suppress output, if needed
        if not self.verbose:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(suppress_output):
                self._all_regression_models_runner()
        else:
            self._all_regression_models_runner()

        # Print results
        self._print_results()

    # Helper methods

    def _all_regression_models_runner(self):
        """
        Helper method that runs all models using the given dataset and all default parameters.

        After running all models, each model is determined to be either a success or failure, and relevant data
        (R2 score, execution time) is recorded. Results of a previous run are discarded first.

        _all_regression_models_runner() may only be called by all_regression_models().
        """
        # Start from a clean slate so a repeated run reports neither stale scores nor duplicate failures
        self._regression_models = {"Model": ["R2 Score", "Time"]}
        self._failures = []

        # Split dataset
        dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

        # Run and time all models; identify each as success or failure
        models = (("LinearRegression", self.linear_regression),
                  ("RandomForest", self.random_forest),
                  ("SVR", self.SVR),
                  ("NuSVR", self.nu_SVR),
                  ("LinearSVR", self.linear_SVR),
                  ("XGBRegressor", self.XGB_regressor))

        for name, model in models:
            self._run_model(name, model, dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test)

    def _run_model(self, name, model, dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test):
        """
        Helper method that fits, times and scores one model, recording it as either a success or a failure.

        Only the call to fit() is timed. If fit() or score() raises, the model's name is added to the failures.
        """
        try:
            start_time = time.time()
            model.fit(dataset_X_train, dataset_y_train)
            end_time = time.time()
            self._regression_models[name] = [model.score(dataset_X_test, dataset_y_test), end_time - start_time]
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt and SystemExit still propagate
            self._failures.append(name)

    def _print_results(self):
        """
        Helper method that prints results of _all_regression_models_runner() in tabular form.

        _print_results() may only be called by all_regression_models() after all models have attempted to run.
        """
        # Print models that didn't fail
        print("\nResults:\n")

        for model, data in self._regression_models.items():
            print("{:<20} {:<20} {:<20}".format(model, data[0], data[1]))

        print()

        # Print failures, if any
        if self._failures:
            print("The following models failed to run:\n")

            for entry in self._failures:
                print(entry)

        print()
# Each model below is reported the same way: print its parameters, fit it on
# the training set, then print score() on the training and the test set.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
# NOTE(review): the messages say "precision", but the value printed is
# whatever the estimator's score() returns; confirm the wording is intended.
print('NuSVC config:')
print(nusvc.get_params())
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVC precision train: {}'.format(nusvc_score_train))
nusvc_score_test = nusvc.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVC precision test: {}'.format(nusvc_score_test))
print('')

nusvr = NuSVR()
print('NuSVR config:')
print(nusvr.get_params())
nusvr.fit(smr_train.feature_matrix, smr_train.labels)
# Score the NuSVR model itself; the original called svc.score here, which
# reported another model's training score under the NuSVR label.
nusvr_score_train = nusvr.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVR precision train: {}'.format(nusvr_score_train))
nusvr_score_test = nusvr.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVR precision test: {}'.format(nusvr_score_test))
print('')

dtc = DecisionTreeClassifier()
print('DecisionTreeClassifier config:')
print(dtc.get_params())
dtc.fit(smr_train.feature_matrix, smr_train.labels)
dtc_score_train = dtc.score(smr_train.feature_matrix, smr_train.labels)
print('DecisionTreeClassifier precision train: {}'.format(dtc_score_train))
dtc_score_test = dtc.score(smr_test.feature_matrix, smr_test.labels)
print('DecisionTreeClassifier precision test: {}'.format(dtc_score_test))
print(classification_report(smr_test.labels, dtc.predict(smr_test.feature_matrix)))
print('')
def runTcheby():
    """Run the model-assisted optimisation loop configured by the global ``param``.

    All settings are read from the module-level ``param`` sequence (24
    entries, unpacked in pairs below). For each iteration a NuSVR model is
    fitted on a training set built from every solution evaluated so far; for
    each direction, candidates are sampled from the neighbourhood, filtered
    with the model, the selected candidate is evaluated, and the neighbours'
    best solutions are replaced when the candidate has a lower ``g_tcheby``
    value. When an R2 output file is configured, a 10-fold cross-validation
    of a fresh NuSVR is also run and written out on every iteration.

    Side effects: rebinds the global ``approx_pareto_front`` to the list of
    best decisions (updated in place during the run), writes result files
    through ``iot`` and updates the archive through ``arch_to`` when
    ``archiveOK`` is set.

    Returns:
        None.
    """
    global param, approx_pareto_front, archiveOK, NO_FILE_TO_WRITE

    ############################################################################
    # PARAMETER

    #clf = SVR(C=1.0, epsilon=0.1, kernel="rbf")
    clf = NuSVR(cache_size=2000, shrinking=True,verbose=True)
    clf2 = -1
    two_models_bool = False

    isReals = True
    start_fct, nb_functions = param[0:2]
    nb_iterations, neighboring_size = param[2:4]
    init_decisions, problem_size = param[4:6]
    max_decisions_maj, delta_neighbourhood = param[6:8]
    CR, search_space = param[8:10]
    F, distrib_index_n = param[10:12]
    pm, operator_fct = param[12:14]
    nb_samples, training_neighborhood_size = param[14:16]
    strategy, file_to_write = param[16:18]
    filter_strat, free_eval = param[18:20]
    param_print_every, file_to_writeR2 = param[20:22]
    filenameDIR, filenameSCORE = param[22:24]

    nb_objectives = len(start_fct)

    # get the offspring operator functions separately
    crossover_fct, mutation_fct, repair_fct = operator_fct

    best_decisions = copy.deepcopy(init_decisions)

    sampling_param = [crossover_fct, mutation_fct, repair_fct, best_decisions, F, problem_size, CR, search_space, distrib_index_n, pm]

    ############################################################################
    # INITIALISATION

    qual_tools.resetGlobalVariables(filenameDIR, filenameSCORE, nb_iterations, nb_functions)

    eval_to.resetEval()

    # get the directions weight for both starting functions
    directions = dec.getDirections(nb_functions, nb_objectives)

    # init the neighboring constant
    nt.initNeighboringTab(nb_functions, neighboring_size, directions, nb_objectives)

    # giving global visibility to the best_decisions to get the result at the end
    approx_pareto_front = best_decisions

    # initial best decisions scores
    best_decisions_scores = [eval_to.free_eval(start_fct, best_decisions[i], problem_size) for i in range(nb_functions)]

    pop_size = nb_functions

    # current optimal scores for both axes
    z_opt_scores = gt.getMinTabOf(best_decisions_scores)

    eval_to.initZstar(z_opt_scores)

    # get the first training part of the item we will learn on
    model_directions = train_to.getDirectionsTrainingMatrix(directions)

    # whether the data shall be written to a file
    writeOK = False
    if(file_to_write != NO_FILE_TO_WRITE):
        writeOK = True

    writeR2OK = False
    if(file_to_writeR2 != NO_FILE_TO_WRITE):
        writeR2OK = True

    ############################################################################
    # MAIN ALGORITHM

    if(writeOK):
        iot.printObjectives(file_to_write, eval_to.getNbEvals(), 0,best_decisions_scores, problem_size, nb_objectives)

    # set of all the solutions evaluated
    all_decisions = copy.deepcopy(best_decisions)
    all_decisions_scores = copy.deepcopy(best_decisions_scores)
    all_len = nb_functions

    # IDs tab to allow a random course through the directions in the main loop
    id_directions = [i for i in range(nb_functions)]

    # iterations loop
    for itera in range(nb_iterations):
        # Update model
        training_inputs, training_outputs, training_set_size, training_scores = train_to.getTrainingSet(model_directions, all_decisions, all_decisions_scores ,eval_to.getZstar_with_decal(), strategy, nb_functions, training_neighborhood_size)
        print(len(training_outputs))
        clf.fit(training_inputs, training_outputs)

        if(writeR2OK):
            training_inputs_tcheby = eval_to.getManyTcheby(training_inputs, training_scores, eval_to.getZstar_with_decal(), training_set_size)

            random_index = numpy.arange(0,training_set_size)
            numpy.random.shuffle(random_index)
            n_folds = 10
            # Builtin int instead of numpy.int: the numpy.int alias was
            # removed from NumPy and meant the same dtype.
            folds_sizes = (training_set_size // n_folds) * numpy.ones(n_folds, dtype=int)
            # Spread the remainder over the first folds so the sizes sum to
            # training_set_size.
            folds_sizes[:training_set_size % n_folds] += 1

            training_inputs_array = numpy.array(training_inputs)
            training_tcheby_array = numpy.array(training_inputs_tcheby)

            R2_cv = []
            MSE_cv = []
            MAE_cv = []
            MDAE_cv = []

            clfCV = NuSVR()

            current = 0
            for fold_size in folds_sizes:
                start, stop = current, current + fold_size
                # mask is True for the training samples of this fold and
                # False for its held-out slice
                mask = numpy.ones(training_set_size, dtype=bool)
                mask[start:stop] = 0
                current = stop

                clfCV.fit(training_inputs_array[random_index[mask]], training_tcheby_array[random_index[mask]])

                test_fold_tcheby = training_tcheby_array[random_index[start:stop]]
                test_fold_predict = clfCV.predict(training_inputs_array[random_index[start:stop]])

                R2_cv .append(r2_score (test_fold_tcheby, test_fold_predict))
                MSE_cv .append(mean_squared_error (test_fold_tcheby, test_fold_predict))
                MAE_cv .append(mean_absolute_error (test_fold_tcheby, test_fold_predict))
                MDAE_cv.append(median_absolute_error(test_fold_tcheby, test_fold_predict))

            # R2 of the main model on its own training data
            R2 = clf.score(training_inputs, training_outputs)
            MSE_cv_mean = numpy.mean(MSE_cv)
            RMSE_cv_mean = math.sqrt(MSE_cv_mean)
            MAE_cv_mean = numpy.mean(MAE_cv)
            MDAE_cv_mean = numpy.mean(MDAE_cv)
            R2_cv_mean = numpy.mean(R2_cv)

            iot.printR2(file_to_writeR2, eval_to.getNbEvals(), itera, R2, R2_cv_mean, MSE_cv_mean , MAE_cv_mean, MDAE_cv_mean, RMSE_cv_mean, problem_size, print_every=1)

        # random course through the directions
        random.shuffle(id_directions)

        # functions loop
        for f in id_directions:

            # get all the indices of neighbors of a function within a certain distance of f, f included
            f_neighbors, current_neighbourhing_size = nt.getNeighborsOf(f, delta_neighbourhood)

            # get a list of offspring from the neighbors
            list_offspring = samp_to.extended_sampling(f, f_neighbors, sampling_param, nb_samples)

            # apply a filter on the offspring list and select the best one
            filter_param = [itera, f, clf, clf2, two_models_bool, f_neighbors, list_offspring, model_directions, start_fct, problem_size, eval_to.getZstar_with_decal(), best_decisions_scores, best_decisions, nb_objectives]
            best_candidate = filt_to.model_based_filtring(filter_strat, free_eval, filter_param)

            # evaluation of the newly made solution
            mix_scores = eval_to.eval(start_fct, best_candidate, problem_size)

            # update of the z_star point
            has_changed = eval_to.min_update_Z_star(mix_scores, nb_objectives)

            # retraining of the model with the new z_star
            if(has_changed):
                train_to.updateTrainingZstar(eval_to.getZstar_with_decal())
                training_outputs = train_to.retrainSet(training_inputs, training_scores, eval_to.getZstar_with_decal(), training_set_size, nb_objectives)
                clf.fit(training_inputs, training_outputs)

            # add to training input
            new_input = []
            new_input.extend(best_candidate)
            all_decisions.append(new_input)
            all_decisions_scores.append(mix_scores)
            all_len += 1

            # boolean that is True if the offspring has been added to the archive
            added_to_S = False

            # count how many best decisions have been replaced by the new offspring
            cmpt_best_maj = 0

            # random course through the neighbors list
            random.shuffle(f_neighbors)

            # course through the neighbors list
            for j in f_neighbors:

                # stop if the max number of replacements is already reached
                if(cmpt_best_maj >= max_decisions_maj):
                    break

                # compute g_tcheby
                #wj = (directions[0][j],directions[1][j])
                wj = [directions[obj][j] for obj in range(0,nb_objectives)]
                g_mix = eval_to.g_tcheby(wj, mix_scores, eval_to.getZstar_with_decal())
                g_best = eval_to.g_tcheby(wj, best_decisions_scores[j], eval_to.getZstar_with_decal())

                # replace j's best solution when the new solution has the lower g_tcheby value
                if(g_mix < g_best):
                    cmpt_best_maj += 1
                    best_decisions[j] = best_candidate
                    best_decisions_scores[j] = mix_scores

                    # if we manage the archive and the solution has not been added already
                    if(archiveOK and not(added_to_S)):
                        arch_to.archivePut(best_candidate, mix_scores)
                        added_to_S = True

        #print("Update", itera, "done.")

        # if manage archive
        if(archiveOK):
            arch_to.maintain_archive()

        # if write the result in a file
        if(writeOK):
            iot.printObjectives(file_to_write, eval_to.getNbEvals(), itera+1, best_decisions_scores, problem_size, nb_objectives, print_every=param_print_every)

        #graphic update
        #yield arch_to.getArchiveScore(), best_decisions_scores, itera+1, eval_to.getNbEvals(), eval_to.getZstar_with_decal(), pop_size, isReals

    if(not free_eval and writeR2OK):
        qual_tools.computeQualityEvaluation()
        qual_tools.generateDiffPredFreeFile()
    return
# Evaluation grid shared by both fitted curves
xx = np.linspace(-4, 4, 100)

# Epsilon-SVR with an RBF kernel; x gets a trailing axis so it is passed as a
# single-feature column
svr2 = SVR(kernel='rbf', gamma=1, epsilon=0.1)
svr2.fit(x[:, np.newaxis], y)
yy = svr2.predict(xx[:, np.newaxis])

# Nu-SVR with the same kernel settings
svr3 = NuSVR(kernel='rbf', gamma=1, nu=0.9)
svr3.fit(x[:, np.newaxis], y)
yy2 = svr3.predict(xx[:, np.newaxis])

# Score both models on the training data (values are only visible when run
# interactively)
svr2.score(x[:, np.newaxis], y)
svr3.score(x[:, np.newaxis], y)

#%%
# Plot the raw samples together with both fitted curves
plt.scatter(x, y)
plt.plot(xx, yy, 'k', label='SVR')
plt.plot(xx, yy2, 'r:', label="NuSVR")
plt.legend(fontsize=14)

#%%
class SVM: """ Wrapper class around scikit-learn's support vector machine functionality. This class supports binary and multi-class classification on a dataset, along with regression via Support Vector Regression (SVR). Per scikit-learn's documentation: Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection. The advantages of support vector machines are: – Effective in high dimensional spaces. – Still effective in cases where number of dimensions is greater than the number of samples. – Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient. – Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels. The disadvantages of support vector machines include: – If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial. – SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation. """ def __init__(self, attributes=None, labels=None, test_size=0.25): """ Initializes a SVM object. 
The following parameters are needed to use a SVM: – attributes: a numpy array of the independent variables – labels: a numpy array of the classes (for classification) or dependent variables (for regression) – test_size: the proportion of the dataset to be used for testing the model (defaults to 0.25); the proportion of the dataset to be used for training will be the complement of test_size After successfully running one of the classifier methods (SVC(), nu_SVC(), or linear_SVC()), the corresponding classifier below will be trained: – classifier_SVC: a classifier trained using scikit-learn's SVC implementation – accuracy_SVC: the accuracy of the SVC model, based on its predictions for dataset_X_test – roc_auc_SVC: the area under the ROC curve for the SVC model – classifier_nu_SVC: a classifier trained using scikit-learn's NuSVC implementation – accuracy_nu_SVC: the accuracy of the NuSVC model, based on its predictions for dataset_X_test – roc_auc_nu_SVC: the area under the ROC curve for the NuSVC model – classifier_linear_SVC: a classifier trained using scikit-learn's LinearSVC implementation – accuracy_linear_SVC: the accuracy of the LinearSVC model, based on its predictions for dataset_X_test After successfully running one of the regression methods (SVR(), nu_SVR(), or linear_SVR()), the corresponding regression model below will be trained: – regression_SVR: a regression model trained using scikit-learn's SVR implementation – r2_score_SVR: the coefficient of determination for the SVR model – r_score_SVR: the correlation coefficient for the SVR model – regression_nu_SVR: a regression model trained using scikit-learn's NuSVR implementation – r2_score_nu_SVR: the coefficient of determination for the NuSVR model – r_score_nu_SVR: the correlation coefficient for the NuSVR model – regression_linear_SVR: a regression model trained using scikit-learn's LinearSVR implementation – r2_score_linear_SVR: the coefficient of determination for the LinearSVR model – 
r_score_linear_SVR: the correlation coefficient for the LinearSVR model """ self.attributes = attributes self.labels = labels self.test_size = 0.25 self.classifier_SVC = None self.accuracy_SVC = None self.roc_auc_SVC = None self.classifier_nu_SVC = None self.accuracy_nu_SVC = None self.roc_auc_nu_SVC = None self.classifier_linear_SVC = None self.accuracy_linear_SVC = None self.regression_SVR = None self.r2_score_SVR = None self.r_score_SVR = None self.regression_nu_SVR = None self.r2_score_nu_SVR = None self.r_score_nu_SVR = None self.regression_linear_SVR = None self.r2_score_linear_SVR = None self.r_score_linear_SVR = None # References to training and testing subsets of dataset; instance data for re-use purposes self.dataset_X_train = None self.dataset_y_train = None self.dataset_X_test = None self.dataset_y_test = None # Accessor Methods def get_attributes(self): """ Accessor method for attributes. If a SVM object is initialized without specifying attributes, attributes will be None. No SVM functionality can be used until attributes is a populated numpy array. Call set_attributes(new_attributes) to fix this. """ return self.attributes def get_labels(self): """ Accessor method for labels. If a SVM object is initialized without specifying labels, labels will be None. No SVM functionality can be used until labels is a populated numpy array. Call set_labels(new_labels) to fix this. """ return self.labels def get_test_size(self): """ Accessor method for test_size. Should return a number or None. """ return self.test_size def get_classifier_SVC(self): """ Accessor method for classifier_SVC. Will return None if SVC() hasn't successfully run, yet. """ return self.classifier_SVC def get_accuracy_SVC(self): """ Accessor method for accuracy_SVC. Will return None if SVC() hasn't successfully run, yet. """ return self.accuracy_SVC def get_roc_auc_SVC(self): """ Accessor method for roc_auc_SVC. Will return None if SVC() hasn't successfully run, yet. 
""" return self.roc_auc_SVC def get_classifier_nu_SVC(self): """ Accessor method for classifier_nu_SVC. Will return None if nu_SVC() hasn't successfully run, yet. """ return self.classifier_nu_SVC def get_accuracy_nu_SVC(self): """ Accessor method for accuracy_nu_SVC. Will return None if nu_SVC() hasn't successfully run, yet. """ return self.accuracy_nu_SVC def get_roc_auc_nu_SVC(self): """ Accessor method for roc_auc_nu_SVC. Will return None if nu_SVC() hasn't successfully run, yet. """ return self.roc_auc_nu_SVC def get_classifier_linear_SVC(self): """ Accessor method for classifier_linear_SVC. Will return None if linear_SVC() hasn't successfully run, yet. """ return self.classifier_linear_SVC def get_accuracy_linear_SVC(self): """ Accessor method for accuracy_linear_SVC. Will return None if linear_SVC() hasn't successfully run, yet. """ return self.accuracy_linear_SVC def get_regression_SVR(self): """ Accessor method for regression_SVR. Will return None if SVR() hasn't successfully run, yet. """ return self.regression_SVR def get_r2_score_SVR(self): """ Accessor method for r2_score_SVR. Will return None if SVR() hasn't successfully run, yet. """ return self.r2_score_SVR def get_r_score_SVR(self): """ Accessor method for r_score_SVR. Will return None if SVR() hasn't successfully run, yet. """ return self.r_score_SVR def get_regression_nu_SVR(self): """ Accessor method for regression_nu_SVR. Will return None if nu_SVR() hasn't successfully run, yet. """ return self.regression_nu_SVR def get_r2_score_nu_SVR(self): """ Accessor method for r2_score_nu_SVR. Will return None if nu_SVR() hasn't successfully run, yet. """ return self.r2_score_nu_SVR def get_r_score_nu_SVR(self): """ Accessor method for r_score_nu_SVR. Will return None if nu_SVR() hasn't successfully run, yet. """ return self.r_score_nu_SVR def get_regression_linear_SVR(self): """ Accessor method for regression_linear_SVR. Will return None if linear_SVR() hasn't successfully run, yet. 
""" return self.regression_linear_SVR def get_r2_score_linear_SVR(self): """ Accessor method for r2_score_linear_SVR. Will return None if linear_SVR() hasn't successfully run, yet. """ return self.r2_score_linear_SVR def get_r_score_linear_SVR(self): """ Accessor method for r_score_linear_SVR. Will return None if linear_SVR() hasn't successfully run, yet. """ return self.r_score_linear_SVR # Modifier Methods def set_attributes(self, new_attributes=None): """ Modifier method for attributes. Input should be a populated numpy array. Defaults to None. """ self.attributes = new_attributes def set_labels(self, new_labels=None): """ Modifier method for labels. Input should be a populated numpy array. Defaults to None. """ self.labels = new_labels def set_test_size(self, new_test_size=0.25): """ Modifier method for test_size. Input should be a float between 0.0 and 1.0 or None. Defaults to 0.25. The training size will be set to the complement of test_size. """ self.test_size = new_test_size # Wrappers for SVM classification classes def SVC(self, C=1.0, kernel="rbf", degree=3, gamma="scale", coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape="ovr", break_ties=False, random_state=None): """ Wrapper for scikit-learn's C-Support Vector Classification implementation. Parameters per scikit-learn's documentation: – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0) – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples). 
(Default is "rbf") – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3) – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses 1 / (n_features * training_samples.var()) as value of gamma. IF gamma="auto", it uses 1 / n_features. (Default is "scale") – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0) – shrinking: Whether to use the shrinking heuristic. (Default is True) – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with predict. (Default is False) – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001) – cache_size: Specify the size of the kernel cache in MB. (Default is 200) – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). (Default is None) – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False) – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1) – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always used as multi-class strategy. The parameter is ignored for binary classification. 
(Default is "ovr") – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties according to the confidence values of decision_function; otherwise the first class among the tied classes is returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple predict. (Default is False) – random_state: Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function calls. (Default is None) The implementation is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. """ if self._check_inputs(): # Initialize classifier self.classifier_SVC =\ SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state) # Split data, if needed; if testing/training sets are still None, call _split_data() if self.dataset_X_test is None: self._split_data() # Train classifier; handle exception if arguments are incorrect try: self.classifier_SVC.fit(self.dataset_X_train, self.dataset_y_train) except Exception as e: print( "An exception occurred while training the SVC model. Check your arguments and try again." 
) print("Here is the exception message:") print(e) self.classifier_SVC = None return # Evaluate accuracy and ROC-AUC of model using testing set and actual classification self.accuracy_SVC = self.classifier_SVC.score( self.dataset_X_test, self.dataset_y_test) if probability: self.roc_auc_SVC = roc_auc_score( self.classifier_SVC.predict(self.dataset_X_test), self.classifier_SVC.predict_proba(self.dataset_X_test)[::, 1]) def nu_SVC(self, nu=0.5, kernel="rbf", degree=3, gamma="scale", coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape="ovr", break_ties=False, random_state=None): """ Wrapper for scikit-learn's Nu-Support Vector Classification implementation. Per scikit-learn's documentation, NuSVC is similar to SVC, but uses a parameter, nu, to set the number of support vectors. Parameters per scikit-learn's documentation: – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. (Default is 0.5) – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0) – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples). (Default is "rbf") – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3) – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses 1 / (n_features * training_samples.var()) as value of gamma. IF gamma="auto", it uses 1 / n_features. (Default is "scale") – coef0: Independent term in kernel function. 
It is only significant in "poly" and "sigmoid". (Default is 0.0) – shrinking: Whether to use the shrinking heuristic. (Default is True) – probability: Whether to enable probability estimates. This must be enabled prior to calling fit, will slow down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with predict. (Default is False) – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001) – cache_size: Specify the size of the kernel cache in MB. (Default is 200) – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). (Default is None) – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False) – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1) – decision_function_shape: Whether to return a one-vs-rest (‘ovr’) decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one (‘ovo’) decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one (‘ovo’) is always used as multi-class strategy. The parameter is ignored for binary classification. (Default is "ovr") – break_ties: If true, decision_function_shape='ovr', and number of classes > 2, predict will break ties according to the confidence values of decision_function; otherwise the first class among the tied classes is returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple predict. 
(Default is False) – random_state: Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function calls. (Default is None) The implementation is based on libsvm. """ if self._check_inputs(): # Initialize classifier self.classifier_nu_SVC =\ NuSVC(nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state) # Split data, if needed; if testing/training sets are still None, call _split_data() if self.dataset_X_test is None: self._split_data() # Train classifier; handle exception if arguments are incorrect try: self.classifier_nu_SVC.fit(self.dataset_X_train, self.dataset_y_train) except Exception as e: print( "An exception occurred while training the NuSVC model. Check your arguments and try again." ) print("Here is the exception message:") print(e) self.classifier_nu_SVC = None return # Evaluate accuracy and ROC-AUC of model using testing set and actual classification self.accuracy_nu_SVC = self.classifier_nu_SVC.score( self.dataset_X_test, self.dataset_y_test) if probability: self.roc_auc_nu_SVC = roc_auc_score( self.classifier_nu_SVC.predict(self.dataset_X_test), self.classifier_nu_SVC.predict_proba( self.dataset_X_test)[::, 1]) def linear_SVC(self, penalty="l2", loss="squared_hinge", dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000): """ Wrapper for scikit-learn's Linear Support Vector Classification implementation. 
Per scikit-learn's documentation, LinearSVC is similar to SVC with a linear kernel, but implemented with liblinear instead of libsvm, providing more flexibility in choice of penalties and loss functions. LinearSVC should also scale better to large sample sizes. LinearSVC supports both dense and sparse input, and the multiclass support is handled according to a one-vs-the-rest scheme. Parameters per scikit-learn's documentation: – penalty: Specifies the norm used in the penalization. The ‘l2’ penalty is the standard used in SVC. The ‘l1’ leads to coef_ vectors that are sparse. (Default is "l2") – loss: Specifies the loss function. ‘hinge’ is the standard SVM loss (used e.g. by the SVC class) while ‘squared_hinge’ is the square of the hinge loss. (Default is "squared_hinge") – dual: Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. (Default is True) – tol: Tolerance for stopping criteria. (Default is 1e-4, or 0.0001) – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. (Default is 1.0) – multi_class: Determines the multi-class strategy if y contains more than two classes. "ovr" trains n_classes one-vs-rest classifiers, while "crammer_singer" optimizes a joint objective over all classes. While crammer_singer is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If "crammer_singer" is chosen, the options loss, penalty and dual will be ignored. (Default is "ovr") – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be already centered). (Default is True) – intercept_scaling: When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. 
a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. (Default is 1) – class_weight: Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). (Default is None) – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0) – random_state: Controls the pseudo random number generation for shuffling the data for the dual coordinate descent (if dual=True). When dual=False the underlying implementation of LinearSVC is not random and random_state has no effect on the results. Pass an int for reproducible output across multiple function calls. (Default is None) – max_iter: The maximum number of iterations to be run. 
(Default is 1000) """ if self._check_inputs(): # Initialize classifier self.classifier_linear_SVC =\ LinearSVC(penalty=penalty, loss=loss, dual=dual, tol=tol, C=C, multi_class=multi_class, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, verbose=verbose, random_state=random_state, max_iter=max_iter) # Split data, if needed; if testing/training sets are still None, call _split_data() if self.dataset_X_test is None: self._split_data() # Train classifier; handle exception if arguments are incorrect try: self.classifier_linear_SVC.fit(self.dataset_X_train, self.dataset_y_train) except Exception as e: print( "An exception occurred while training the LinearSVC model. Check your arguments and try again." ) print("Here is the exception message:") print(e) self.classifier_linear_SVC = None return # Evaluate accuracy of model using testing set and actual classification self.accuracy_linear_SVC = self.classifier_linear_SVC.score( self.dataset_X_test, self.dataset_y_test) # Wrappers for SVM regression classes def SVR(self, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1): """ Wrapper for scikit-learn's Epsilon-Support Vector Regression implementation. Per scikit-learn's documentation, this implementation is based on libsvm. Scaling to tens of thousands of samples is difficult, as the fit time complexity is more than quadratic with the number of samples. For large datasets, consider using LinearSVR by calling linear_SVR(). Parameters per scikit-learn's documentation: – kernel: Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples). 
(Default is "rbf") – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3) – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses 1 / (n_features * training_samples.var()) as value of gamma. IF gamma="auto", it uses 1 / n_features. (Default is "scale") – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0) – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001) – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0) – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value. (Default is 0.1) – shrinking: Whether to use the shrinking heuristic. (Default is True) – cache_size: Specify the size of the kernel cache in MB. (Default is 200) – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False) – max_iter: Hard limit on iterations within solver, or -1 for no limit. 
(Default is -1) """ if self._check_inputs(): # Initialize regression model self.regression_SVR =\ SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, epsilon=epsilon, shrinking=shrinking, cache_size=cache_size, verbose=verbose, max_iter=max_iter) # Split data, if needed; if testing/training sets are still None, call _split_data() if self.dataset_X_test is None: self._split_data() # Train regression model; handle exception if arguments are incorrect and/or if labels isn't # quantitative data try: self.regression_SVR.fit(self.dataset_X_train, self.dataset_y_train) except Exception as e: print( "An exception occurred while training the SVR model. Check you arguments and try again." ) print("Does labels only contain quantitative data?") print("Here is the exception message:") print(e) self.regression_SVR = None return # Get coefficient of determination for model self.r2_score_SVR = self.regression_SVR.score( self.dataset_X_test, self.dataset_y_test) self.r_score_SVR = sqrt(self.r2_score_SVR) def nu_SVR(self, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1): """ Wrapper for scikit-learn's Nu Support Vector Regression implementation. Per scikit-learn's documentation, NuSVR uses the parameter nu to control the number of support vectors, similar to NuSVC. Yet unlike NuSVC, nu replaces the parameter epsilon of epsilon-SVR, not C. This implementation is based on libsvm. Parameters per scikit-learn's documentation: – nu: An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. (Default is 0.5) – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0) – kernel: Specifies the kernel type to be used in the algorithm. 
It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples). (Default is "rbf") – degree: Degree of the polynomial kernel function ("poly"). Ignored by all other kernels. (Default is 3) – gamma: Kernel coefficient for "rbf", "poly", and "sigmoid". If gamma="scale", then it uses 1 / (n_features * training_samples.var()) as value of gamma. IF gamma="auto", it uses 1 / n_features. (Default is "scale") – coef0: Independent term in kernel function. It is only significant in "poly" and "sigmoid". (Default is 0.0) – shrinking: Whether to use the shrinking heuristic. (Default is True) – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001) – cache_size: Specify the size of the kernel cache in MB. (Default is 200) – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. (Default is False) – max_iter: Hard limit on iterations within solver, or -1 for no limit. (Default is -1) """ if self._check_inputs(): # Initialize regression model self.regression_nu_SVR =\ NuSVR(nu=nu, C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, tol=tol, cache_size=cache_size, verbose=verbose, max_iter=max_iter) # Split data, if needed; if testing/training sets are still None, call _split_data() if self.dataset_X_test is None: self._split_data() # Train regression model; handle exception if arguments are incorrect and/or if labels isn't # quantitative data try: self.regression_nu_SVR.fit(self.dataset_X_train, self.dataset_y_train) except Exception as e: print( "An exception occurred while training the NuSVR model. Check you arguments and try again." 
) print("Does labels only contain quantitative data?") print("Here is the exception message:") print(e) self.regression_nu_SVR = None return # Get coefficient of determination for model self.r2_score_nu_SVR = self.regression_nu_SVR.score( self.dataset_X_test, self.dataset_y_test) self.r_score_nu_SVR = sqrt(self.r2_score_nu_SVR) def linear_SVR(self, epsilon=0.0, tol=0.0001, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=1000): """ Wrapper for scikit-learn's Linear Support Vector Regression implementation. Per scikit-learn's documentation, LinearSVR is similar to SVR with a linear kernel, but is implemented with liblinear instead of libsvm. This provides greater flexibility in choice of penalties and loss functions, and should scale better to large sample sizes. LinearSVM supports both dense and sparse input. Parameters per scikit-learn's documentation: – epsilon: Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value. (Default is 0.1) – tol: Tolerance for stopping criterion. (Default is 1e-3, or 0.001) – C: Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. (Default is 1.0) – loss: Specifies the loss function. The epsilon-insensitive loss (standard SVR) is the L1 loss, while the squared epsilon-insensitive loss (‘squared_epsilon_insensitive’) is the L2 loss. (Default is "epsilon_insensitive") – fit_intercept: Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be already centered). (Default is True) – intercept_scaling: When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. 
a “synthetic” feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. (Default is 1) – dual: Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. (Default is True) – verbose: Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context. (Default is 0) – random_state: Controls the pseudo random number generation for shuffling the data for the dual coordinate descent (if dual=True). When dual=False the underlying implementation of LinearSVC is not random and random_state has no effect on the results. Pass an int for reproducible output across multiple function calls. (Default is None) – max_iter: The maximum number of iterations to be run. (Default is 1000) """ if self._check_inputs(): # Initialize regression model self.regression_linear_SVR =\ LinearSVR(epsilon=epsilon, tol=tol, C=C, loss=loss, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, dual=dual, verbose=verbose, random_state=random_state, max_iter=max_iter) # Split data, if needed; if testing/training sets are still None, call _split_data() if self.dataset_X_test is None: self._split_data() # Train regression model; handle exception if arguments are incorrect and/or labels isn't # quantitative data try: self.regression_linear_SVR.fit(self.dataset_X_train, self.dataset_y_train) except Exception as e: print( "An exception occurred while training the LinearSVR model. Check you arguments and try again." 
) print("Does labels only contain quantitative data?") print("Here is the exception message:") print(e) self.regression_linear_SVR = None return # Get coefficient of determination and correlation coefficient for model self.r2_score_linear_SVR = self.regression_linear_SVR.score( self.dataset_X_test, self.dataset_y_test) self.r_score_linear_SVR = sqrt(self.r2_score_linear_SVR) # Helper methods def _split_data(self): """ Helper method for splitting attributes and labels into training and testing sets. This method runs under the assumption that all relevant instance data has been checked for correctness. """ self.dataset_X_train, self.dataset_X_test, self.dataset_y_train, self.dataset_y_test =\ train_test_split(self.attributes, self.labels, test_size=self.test_size) def _check_inputs(self): """ Verifies if instance data is ready for use in SVM model. """ # Check if attributes exists if self.attributes is None: print( "attributes is missing; call set_attributes(new_attributes) to fix this! new_attributes should be a", "populated dataset of independent variables.") return False # Check if labels exists if self.labels is None: print( "labels is missing; call set_labels(new_labels) to fix this! new_labels should be a populated dataset", "of classes.") return False # Check if attributes and labels have same number of rows (samples) if self.attributes.shape[0] != self.labels.shape[0]: print( "attributes and labels don't have the same number of rows. Make sure the number of samples in each", "dataset matches!") return False # Check if test_size is a number if self.test_size is not None and not isinstance( self.test_size, (int, float)): print( "test_size must be None or a number; call set_test_size(new_test_size) to fix this!" ) return False return True
# Scale features and targets with the same scaler object (it is refit on y by the second call).
x = min_max_scaler.fit_transform(x)
y = min_max_scaler.fit_transform(y)
y = y.ravel()  # flatten the scaled targets (original note: "convert to a column vector")

# Ten rounds of random hold-out validation: each round draws a fresh 80/20 split and records
# the score() of a default SVR and a default NuSVR on the held-out part.
scores1, scores2 = [], []
for i in range(10):
    x_t, x_v, y_t, y_v = train_test_split(x, y, test_size=0.2)

    svr1 = SVR()
    svr2 = NuSVR()
    for model in (svr1, svr2):
        model.fit(x_t, y_t)

    score1 = svr1.score(x_v, y_v)
    score2 = svr2.score(x_v, y_v)

    # NOTE(review): scores are rounded to 2 decimals before being averaged below.
    scores1.append(round(score1, 2))
    scores2.append(round(score2, 2))

print('svr十次r方为:\n', scores1, '\nnusvr十次r方为:\n', scores2)

# Mean score of each model over the ten rounds.
score1_m, score2_m = np.mean(scores1), np.mean(scores2)
print('{:.2f},{:.2f}'.format(score1_m, score2_m))

# Alternative experiments kept for reference (grid search and 5-fold cross-validation):
#x_t,x_v,y_t,y_v=train_test_split(x,y,test_size=0.2)
#svr = GridSearchCV(SVR(), param_grid={"kernel": ("poly", 'rbf'),\
#                "C": np.logspace(1,20, 5), "gamma": np.logspace(0, 1, 5)},scoring='r2')
#svr.fit(x_t,y_t)
#svr=SVR()
#scores = cross_val_score(svr, x, y, cv=5, scoring='r2')
#print(scores)