def find_optimal_n_components_plsr(x_train, y_train, max_n_components, num_folds_cv):
    # Store the average MSE of each n_components
    list_ave_mse = []
    for n_components in range(1, max_n_components + 1):
        # Store the MSE of the current n_components across the CV folds
        list_mse = []

        # Compute the MSE on a single train/test split
        def compute_mse(x_train, y_train, x_test, y_test):
            model = PLSRegression(n_components=n_components).fit(x_train, y_train)
            predictions = model.predict(x_test)
            mse = optunity.metrics.mse(y_test, predictions)
            list_mse.append(mse)
            return mse

        # The CV decorator
        cv = optunity.cross_validated(x=x_train, y=y_train, num_folds=num_folds_cv)
        try:
            compute_mse_cv = cv(compute_mse)
            compute_mse_cv()
        except ValueError:
            print('Value error: n_components in PLSR exceeds the dimension of the input data.')
            print('Found the optimal n_components within the valid range.')
            break

        # Record the average MSE for this parameter
        ave_mse = np.mean(list_mse)
        list_ave_mse.append(ave_mse)

    # Find the minimum of list_ave_mse and its index
    optimal_n_components = np.argmin(list_ave_mse) + 1
    print("The optimal number of components of PLS: ", optimal_n_components)
    return optimal_n_components
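# A minimal usage sketch for find_optimal_n_components_plsr, assuming numpy,
# optunity and sklearn's PLSRegression are importable as used above. The
# synthetic data (x_demo, y_demo) is illustrative only, not from the original
# source.
import numpy as np
import optunity
import optunity.metrics
from sklearn.cross_decomposition import PLSRegression

rng = np.random.RandomState(0)
x_demo = rng.randn(60, 8)                          # 60 samples, 8 features
y_demo = 2.0 * x_demo[:, 0] + 0.1 * rng.randn(60)  # mostly one latent direction
n_opt = find_optimal_n_components_plsr(x_demo, y_demo,
                                       max_n_components=8,
                                       num_folds_cv=5)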
def __init__(self, images, labels, n_folds_cv):
    self.__space_ = {'kernel': {'linear': {'C': [0, 2]},
                                'rbf': {'logGamma': [-5, 0], 'C': [0, 10]},
                                'poly': {'degree': [2, 5], 'C': [0, 5], 'coef0': [0, 2]}
                                }
                     }
    self.__sgd_space_ = {'alpha1': [0.0001, 5], 'power_t': [0.1, 0.9]}
    self.__log = log.Logger()
    self.__cv_decorator_ = optunity.cross_validated(x=images, y=labels,
                                                    num_folds=n_folds_cv)
def run_optunity(self):
    cv_decorator = optunity.cross_validated(x=self.X, y=self.Y)
    svm_tuned_auroc = cv_decorator(self.svm_tuned_auroc)
    optimal_svm_pars, info, _ = optunity.maximize_structured(svm_tuned_auroc,
                                                             self.space,
                                                             num_evals=150,
                                                             pmap=optunity.pmap)
    print("Optimal parameters: " + str(optimal_svm_pars))
    print("AUROC of tuned SVM: %1.3f" % info.optimum)
    df = optunity.call_log2dataframe(info.call_log)
    print(df.sort_values('value', ascending=False))
def train(self):
    self._pca.fit(self._features_data)
    features_pca = self._pca.transform(self._features_data)
    cv_decorator = optunity.cross_validated(x=features_pca, y=self._labels,
                                            num_folds=5)
    svm_tuned = cv_decorator(svm_tuned_precision)
    optimal_svm_pars, _, _ = optunity.maximize_structured(
        svm_tuned, _SVM_SEARCH_SPACE,
        num_evals=self._config.get('num_evals', 100))
    self._model = _train_model(features_pca, self._labels, **optimal_svm_pars)
def prepare_svm(X, Y, prob_setting):
    '''
    Code inspired by
    http://optunity.readthedocs.org/en/latest/notebooks/notebooks/sklearn-svc.html#tune-svc-without-deciding-the-kernel-in-advance
    '''
    cv_decorator = optunity.cross_validated(x=X, y=Y, num_folds=10)
    space = {'kernel': {'linear': {'C': [0, 1000],
                                   'class_weight_param': [1, 22]},
                        'rbf': {'logGamma': [-5, 1],
                                'C': [0, 1000],
                                'class_weight_param': [1, 22]},
                        'poly': {'degree': [2, 5],
                                 'C': [0, 1000],
                                 'coef0': [0, 100],
                                 'class_weight_param': [1, 22]}}}

    def train_model(x_train, y_train, kernel, C, logGamma, degree, coef0,
                    class_weight_param):
        if kernel == 'linear':
            model = SVC(kernel=kernel, C=C,
                        class_weight={1: class_weight_param})
        elif kernel == 'poly':
            model = SVC(kernel=kernel, C=C, degree=degree, coef0=coef0,
                        class_weight={1: class_weight_param})
        elif kernel == 'rbf':
            model = SVC(kernel=kernel, C=C, gamma=10 ** logGamma,
                        class_weight={1: class_weight_param})
        else:
            raise ValueError("Unknown kernel function: %s" % kernel)
        model.fit(x_train, y_train)
        return model

    def svm_tuned_auroc(x_train, y_train, x_test, y_test, kernel='linear',
                        C=0, logGamma=0, degree=0, coef0=0,
                        class_weight_param=1):
        model = train_model(x_train, y_train, kernel, C, logGamma, degree,
                            coef0, class_weight_param)
        decision_values = model.decision_function(x_test)
        return optunity.metrics.roc_auc(y_test, decision_values)

    svm_tuned_auroc = cv_decorator(svm_tuned_auroc)
    optimal_svm_pars, info, _ = optunity.maximize_structured(svm_tuned_auroc,
                                                             space,
                                                             num_evals=200)
    print("Optimal parameters: " + str(optimal_svm_pars))
    print("AUROC of tuned SVM: %1.3f" % info.optimum)

    classifier = build_svc(optimal_svm_pars, prob_setting)
    classifier.fit(X, Y)
    return classifier
import math
import itertools

import optunity
import optunity.metrics
import sklearn.svm
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
n = diabetes.data.shape[0]

data = diabetes.data
targets = diabetes.target

# we explicitly generate the outer_cv decorator so we can use it twice
outer_cv = optunity.cross_validated(x=data, y=targets, num_folds=3)

def compute_mse_standard(x_train, y_train, x_test, y_test):
    """Computes MSE of an SVR with RBF kernel and default hyperparameters."""
    model = sklearn.svm.SVR().fit(x_train, y_train)
    predictions = model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

# wrap with outer cross-validation
compute_mse_standard = outer_cv(compute_mse_standard)
compute_mse_standard()

######################################
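# A sketch of the tuned counterpart, reusing the same outer_cv decorator so
# both evaluations see identical folds. The inner search box for C and gamma
# follows the values used in the other SVR snippets in this file; treat them
# as assumptions rather than recommendations.
def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
    """Computes MSE of an SVR with RBF kernel and tuned hyperparameters."""

    # inner cross-validation defines the tuning objective
    @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)
    def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
        model = sklearn.svm.SVR(C=C, gamma=gamma).fit(x_train, y_train)
        predictions = model.predict(x_test)
        return optunity.metrics.mse(y_test, predictions)

    # assumed search box: C in [1, 100], gamma in [0, 50]
    optimal_pars, _, _ = optunity.minimize(tune_cv, num_evals=100,
                                           C=[1, 100], gamma=[0, 50])
    tuned_model = sklearn.svm.SVR(**optimal_pars).fit(x_train, y_train)
    predictions = tuned_model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

compute_mse_rbf_tuned = outer_cv(compute_mse_rbf_tuned)
compute_mse_rbf_tuned()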
    pars, _, _ = optunity.minimize(inner_cv, num_evals=50,
                                   regularization=[0.001, 0.05],
                                   step=[0.01, 0.2])
    predict, w, b = train_lr(x_train, y_train, **pars)
    yhat = predict(x_test)
    loss = optunity.metrics.logloss(y_test, yhat)
    brier = optunity.metrics.brier(y_test, yhat)
    print('+ model: ' + str(b.get_value())[:5] + ' + ' +
          str(w.get_value()[0])[:5] + ' * x1 + ' +
          str(w.get_value()[1])[:5] + ' * x2')
    print('++ log loss in test fold: ' + str(loss))
    print('++ Brier loss in test fold: ' + str(brier))
    print('')
    return loss, brier

# wrap both evaluation functions in cross-validation
# we will compute two metrics using nested cross-validation
# for this purpose we use list_mean() as aggregator
outer_cv = optunity.cross_validated(x=data, y=labels, num_folds=3,
                                    aggregator=optunity.cross_validation.list_mean)
lr_untuned = outer_cv(lr_untuned)
lr_tuned = outer_cv(lr_tuned)

print('true model: 1 + 2 * x1 + 3 * x2')
print('')

# perform experiment
print('evaluating untuned LR model')
untuned_loss, untuned_brier = lr_untuned()

print('evaluating tuned LR model')
tuned_loss, tuned_brier = lr_tuned()

print('Log loss (lower is better):')
print('untuned: ' + str(untuned_loss))
def modelling_PLSRegression(max_n_components, num_folds_outer_cv, num_folds_inner_cv,
                            input_data_array, wavelengths, labels,
                            flag_save=False, flag_fig=False, id_cv=0):
    """
    Modelling a PLS regression using cross-validation.
    :param max_n_components:
    :param num_folds_outer_cv:
    :param num_folds_inner_cv:
    :param input_data_array:
    :param wavelengths: for the purpose of record only
    :param labels: the values to be predicted
    :param flag_save:
    :param flag_fig:
    :param id_cv: the id of the CV fold to check
    :return: the record of CV and the model trained using all of the data.

    Author: Huajian Liu
    Email: [email protected]
    Version: v0 (10, Feb, 2019)
    """
    start = datetime.datetime.now()

    print('')
    print('PLS regression')
    print('The range of n_components is: [1, ' + str(max_n_components) + ']')
    print('')

    # For records
    date_time = datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S')
    save_record_name = 'record_plsr_' + date_time + '.sav'
    save_model_name = 'model_plsr' + date_time + '.sav'

    ####################################################################
    # Outer CV function for computing mean square error: compute_mse_pls()
    ####################################################################
    print('Conducting outer cross-validation')

    # For record.
    params_each_fold = []
    errors_each_fold = []
    predictions_labels_each_fold = []
    tuned_models_each_fold = []

    # Define the function for outer CV
    def compute_mse_pls(x_train, y_train, x_test, y_test):
        """Find the optimal n_components, train a model using the optimal
        parameter and compute the MSE."""
        ################################################################
        # Find the optimal parameter (n_components) of PLS
        ################################################################
        optimal_n_components = find_optimal_n_components_plsr(
            x_train, y_train,
            max_n_components=max_n_components,
            num_folds_cv=num_folds_inner_cv)

        ################################################################
        # Train a model using the optimal parameters and x_train, y_train
        ################################################################
        # Train
        tuned_model = PLSRegression(n_components=optimal_n_components).fit(x_train, y_train)

        # Predict the testing data and training data
        predictions_train = tuned_model.predict(x_train)
        predictions_train = predictions_train.reshape(x_train.shape[0], order='C')  # Make it 1-D
        predictions_test = tuned_model.predict(x_test)
        predictions_test = predictions_test.reshape(x_test.shape[0], order='C')

        ################################################################
        # Record errors and parameters
        ################################################################
        errors_train = errors_prediction(y_train, predictions_train)
        errors_test = errors_prediction(y_test, predictions_test)
        print('R^2_train: ', errors_train['r2_score'])
        print('R^2_validation:', errors_test['r2_score'])
        print('')

        predictions_labels_each_fold.append({'predictions_train': predictions_train,
                                             'labels_train': y_train,
                                             'predictions_test': predictions_test,
                                             'labels_test': y_test})
        params_each_fold.append({'optimal_n_component': optimal_n_components})
        errors_each_fold.append({'errors_train': errors_train,
                                 'errors_test': errors_test})
        tuned_models_each_fold.append(tuned_model)
        return errors_test['mse']

    # Activate outer CV
    outer_cv = optunity.cross_validated(x=input_data_array, y=labels,
                                        num_folds=num_folds_outer_cv)
    compute_mse_pls = outer_cv(compute_mse_pls)
    compute_mse_pls()

    print('The cross-validation has been done!',
          datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S'))
    stop = datetime.datetime.now()
    print('Total time used ', stop - start)

    ave_errors = errors_average(errors_each_fold)
    print_ave_errors_cv(ave_errors)

    ####################################################################
    # Train a model using all of the data
    ####################################################################
    print('')
    print('Training the final model using all of the data')
    optimal_n_components = find_optimal_n_components_plsr(
        input_data_array, labels,
        max_n_components=max_n_components,
        num_folds_cv=num_folds_outer_cv)

    # Train a model using the optimal parameters and all of the data
    tuned_model_finial = PLSRegression(n_components=optimal_n_components).fit(input_data_array, labels)
    print('')

    ####################################################################
    # Record the results
    ####################################################################
    record_pls = {'model_name': save_model_name,
                  'date_time': date_time,
                  'num_folds_outer_cv': num_folds_outer_cv,
                  'num_folds_inner_cv': num_folds_inner_cv,
                  'tuned_models_each_fold': tuned_models_each_fold,
                  'predictions_labels_each_fold': predictions_labels_each_fold,
                  'optimal_parameters_each_fold': params_each_fold,
                  'errors_each_fold': errors_each_fold,
                  'average_errors': ave_errors,
                  'wavelengths': wavelengths,
                  'input_data_array': input_data_array,
                  'tuned_model_finial': tuned_model_finial}

    if flag_fig:
        # Plot the record of one (user-selected) CV fold
        plot_regression_result(predictions_labels_each_fold[id_cv]['labels_train'],
                               predictions_labels_each_fold[id_cv]['predictions_train'])
        plot_regression_result(predictions_labels_each_fold[id_cv]['labels_test'],
                               predictions_labels_each_fold[id_cv]['predictions_test'])

    ####################################################################
    # Save record
    ####################################################################
    if flag_save:
        joblib.dump(record_pls, save_record_name)
        print('The record has been saved in the current working folder.')

    return record_pls
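# A hypothetical invocation sketch; spectra, wl and y are placeholder arrays
# standing in for the caller's data, not names from the original source, and
# the helpers used inside (errors_prediction, errors_average, ...) must be
# importable in the calling module.
record_pls = modelling_PLSRegression(max_n_components=10,
                                     num_folds_outer_cv=5,
                                     num_folds_inner_cv=5,
                                     input_data_array=spectra,  # (n_samples, n_bands)
                                     wavelengths=wl,            # (n_bands,)
                                     labels=y,                  # (n_samples,)
                                     flag_save=False,
                                     flag_fig=False)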
    label[index] = file[index][0]
    return label

# Load the data
f1 = np.loadtxt('D:/Study/Bioinformatics/AFP/feature_matrix/Antifp_Main/ASDC/train_ASDC.csv',
                delimiter=',', skiprows=1)
y_train = np.loadtxt('D:/Study/Bioinformatics/AFP/feature_matrix/Antifp_Main/train_label.csv',
                     delimiter=',')
sample = get_matrix(f1)
label = y_train
print(sample)
print(label)

# we will make the cross-validation decorator once, so we can reuse it later
# for the other tuning task; by reusing the decorator, we get the same folds
cv_decorator = optunity.cross_validated(x=sample, y=label, num_folds=5)

def svr_rforest_tuned_acc(x_train, y_train, x_test, y_test, n_estimators,
                          max_depth, min_samples_leaf, min_samples_split):
    rf = RandomForestClassifier(n_estimators=int(n_estimators),
                                max_features='log2',
                                max_depth=int(max_depth),
                                min_samples_leaf=int(min_samples_leaf),
                                min_samples_split=int(min_samples_split),
                                n_jobs=-1).fit(x_train, y_train)
    y_pre = rf.predict(x_test)
    # pcc = round(np.corrcoef(y_pre, y_test)[0][1], 5)
    acc = optunity.metrics.accuracy(y_test, y_pre)
    # auc = optunity.metrics.roc_auc(y_test, decision_values)
    return acc
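# A sketch of how this objective would be tuned: wrap it with the shared
# cv_decorator and maximize accuracy. The search ranges and num_evals below
# are assumptions, not values from the original source.
svr_rforest_tuned_acc = cv_decorator(svr_rforest_tuned_acc)
optimal_pars, info, _ = optunity.maximize(svr_rforest_tuned_acc,
                                          num_evals=100,
                                          n_estimators=[10, 500],
                                          max_depth=[2, 30],
                                          min_samples_leaf=[1, 10],
                                          min_samples_split=[2, 10])
print('optimal RF hyperparameters: ' + str(optimal_pars))
print('tuned accuracy: %1.3f' % info.optimum)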
negative_digit = 9

positive_idx = [i for i in range(n) if digits.target[i] == positive_digit]
negative_idx = [i for i in range(n) if digits.target[i] == negative_digit]

# add some noise to the data to make it a little challenging
original_data = digits.data[positive_idx + negative_idx, ...]
data = original_data + 5 * numpy.random.randn(original_data.shape[0],
                                              original_data.shape[1])
labels = [True] * len(positive_idx) + [False] * len(negative_idx)

# we will use nested 3-fold cross-validation
# in the outer cross-validation procedure
# we make the decorator explicitly so we can reuse the same folds
# in both tuned and untuned approaches
folds = optunity.cross_validation.generate_folds(data.shape[0], num_folds=3)
outer_cv = optunity.cross_validated(x=data, y=labels, num_folds=3,
                                    folds=[folds])

# compute area under ROC curve of default parameters
def compute_roc_standard(x_train, y_train, x_test, y_test):
    model = sklearn.svm.SVC().fit(x_train, y_train)
    decision_values = model.decision_function(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc

# decorate with cross-validation
compute_roc_standard = outer_cv(compute_roc_standard)
roc_standard = compute_roc_standard()
#print('Nested cv area under ROC curve of non-tuned model: ' + str(roc_standard))

# compute area under ROC curve with tuned parameters
scaler = StandardScaler()
data = X
data = scaler.fit_transform(data)
labels = y

space = {
    'kernel': {
        'rbf': {
            'logGamma': [-5, 0],
            'C': [0, 10]
        },
    }
}

cv_decorator = optunity.cross_validated(x=data, y=labels, num_folds=5)

def train_model(x_train, y_train, kernel, C, logGamma, degree, coef0):
    """A generic SVM training function, with arguments based on the chosen kernel."""
    if kernel == 'linear':
        model = svm.SVC(kernel=kernel, C=C, cache_size=10000, verbose=3)
    elif kernel == 'poly':
        model = svm.SVC(kernel=kernel, C=C, degree=degree, coef0=coef0,
                        cache_size=10000, verbose=3)
    elif kernel == 'rbf':
        model = svm.SVC(kernel=kernel, C=C, gamma=10 ** logGamma,
                        cache_size=10000, verbose=3)
    else:
        raise ValueError("Unknown kernel function: %s" % kernel)
    model.fit(x_train, y_train)
    return model
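# A sketch of how the pieces above would plug together, following the
# structured-search pattern used elsewhere in this file; num_evals and the
# tuned-AUROC objective below are assumptions, not part of the original block.
def svm_tuned_auroc(x_train, y_train, x_test, y_test, kernel='rbf',
                    C=0, logGamma=0, degree=0, coef0=0):
    model = train_model(x_train, y_train, kernel, C, logGamma, degree, coef0)
    decision_values = model.decision_function(x_test)
    return optunity.metrics.roc_auc(y_test, decision_values)

svm_tuned_auroc = cv_decorator(svm_tuned_auroc)
optimal_svm_pars, info, _ = optunity.maximize_structured(svm_tuned_auroc,
                                                         space, num_evals=100)
print("Optimal parameters: " + str(optimal_svm_pars))
print("AUROC of tuned SVM: %1.3f" % info.optimum)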
        return optunity.metrics.logloss(y_test, yhat)

    pars, _, _ = optunity.minimize(inner_cv, num_evals=50,
                                   regularization=[0.001, 0.05],
                                   step=[0.01, 0.2])
    predict, w, b = train_lr(x_train, y_train, **pars)
    yhat = predict(x_test)
    loss = optunity.metrics.logloss(y_test, yhat)
    brier = optunity.metrics.brier(y_test, yhat)
    return loss, brier

# wrap both evaluation functions in cross-validation
# we will compute two metrics using nested cross-validation
# for this purpose we use list_mean() as aggregator
outer_cv = optunity.cross_validated(x=train, y=labels, num_folds=3,
                                    aggregator=optunity.cross_validation.list_mean)
lr_untuned = outer_cv(lr_untuned)
lr_tuned = outer_cv(lr_tuned)

print('true model: 1 + 2 * x1 + 3 * x2')
print('')

# perform experiment
print('evaluating untuned LR model')
untuned_loss, untuned_brier = lr_untuned()
negative_digit = 9

positive_idx = [i for i in range(n) if digits.target[i] == positive_digit]
negative_idx = [i for i in range(n) if digits.target[i] == negative_digit]

# add some noise to the data to make it a little challenging
original_data = digits.data[positive_idx + negative_idx, ...]
data = original_data + 5 * numpy.random.randn(original_data.shape[0],
                                              original_data.shape[1])
labels = [True] * len(positive_idx) + [False] * len(negative_idx)

# we will use nested 3-fold cross-validation
# in the outer cross-validation procedure
# we make the decorator explicitly so we can reuse the same folds
# in both tuned and untuned approaches
folds = optunity.cross_validation.generate_folds(data.shape[0], num_folds=3)
outer_cv = optunity.cross_validated(x=data, y=labels, num_folds=3,
                                    folds=[folds])

# compute area under ROC curve of default parameters
def compute_roc_standard(x_train, y_train, x_test, y_test):
    model = sklearn.svm.SVC().fit(x_train, y_train)
    decision_values = model.decision_function(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc

# decorate with cross-validation
compute_roc_standard = outer_cv(compute_roc_standard)
roc_standard = compute_roc_standard()
print('Nested cv area under ROC curve of non-tuned model: ' + str(roc_standard))
# Data
direc = '../data/'
file = direc + 'housing.csv'
df = pd.read_csv(file, delim_whitespace=True, header=None)

# split into X and y
# .as_matrix() was removed in recent pandas; .values is the drop-in replacement
X = df.iloc[:, 0:13].values
y = df.iloc[:, 13].values
num = X.shape[1]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=0)

# SVC
outer_cv = optunity.cross_validated(x=x_train, y=y_train, num_folds=3)

space = {
    'kernel': {
        'linear': {
            'C': [0, 100]
        },
        'rbf': {
            'gamma': [0, 50],
            'C': [1, 100]
        },
        'poly': {
            'degree': [2, 5],
            'C': [1000, 20000],
            'coef0': [0, 1]
        }
    }
}
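# A sketch of a structured search over this space; since the housing target is
# continuous, an SVR objective minimizing MSE is assumed here. The dispatch
# helper, the gamma clamp and num_evals are illustrative assumptions.
from sklearn.svm import SVR

def train_svr(x_train, y_train, kernel, C, gamma, degree, coef0):
    if kernel == 'linear':
        model = SVR(kernel=kernel, C=C)
    elif kernel == 'rbf':
        # clamp gamma away from 0, which sklearn rejects
        model = SVR(kernel=kernel, C=C, gamma=max(gamma, 1e-6))
    elif kernel == 'poly':
        model = SVR(kernel=kernel, C=C, degree=int(degree), coef0=coef0)
    else:
        raise ValueError("Unknown kernel function: %s" % kernel)
    return model.fit(x_train, y_train)

def svr_tuned_mse(x_train, y_train, x_test, y_test, kernel='linear',
                  C=1.0, gamma=1.0, degree=3, coef0=0.0):
    model = train_svr(x_train, y_train, kernel, C, gamma, degree, coef0)
    predictions = model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

svr_tuned_mse = outer_cv(svr_tuned_mse)
optimal_pars, info, _ = optunity.minimize_structured(svr_tuned_mse, space,
                                                     num_evals=100)
print('optimal parameters: ' + str(optimal_pars))
print('cross-validated MSE: %1.3f' % info.optimum)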
def modelling_svr_rbf(C_svr_rbf, gamma_svr_rbf, wavelengths_range, input_type,
                      num_folds_outer_cv, num_iter_inner_cv, num_folds_inner_cv,
                      num_evals_inner_cv, samples, wavelengths, labels,
                      flag_save, flag_fig):
    """Model an SVR with RBF kernel."""
    start = datetime.datetime.now()

    print('')
    print('svr (kernel = rbf)')
    print('The range of C is: ', C_svr_rbf)
    print('The range of gamma is: ', gamma_svr_rbf)
    print('')

    # For records
    date_time = datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S')
    model_name = 'svr_rbf'
    save_record_name = 'record' + '_' + wavelengths_range + '_' + input_type + '_' + model_name + '.sav'
    save_model_name = 'model' + '_' + wavelengths_range + '_' + input_type + '_' + model_name + '.sav'

    # ///////////////////////////// CV /////////////////////////////
    print('Conducting cross-validation')

    # For record
    params_each_fold = []
    errors_each_fold = []
    predictions_labels_each_fold = []
    tuned_models_each_fold = []

    # The function for outer CV
    def compute_mse_svr_rbf(x_train, y_train, x_test, y_test):
        """Find the optimal hyperparameters of the SVR, train a model using
        the optimal parameters and compute the MSE."""
        # Find the optimal parameters
        @optunity.cross_validated(x=x_train, y=y_train, num_iter=num_iter_inner_cv,
                                  num_folds=num_folds_inner_cv)
        def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
            model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
            predictions = model.predict(x_test)
            return optunity.metrics.mse(y_test, predictions)

        # Optimise parameters
        optimal_pars, _, _ = optunity.minimize(tune_cv, num_evals=num_evals_inner_cv,
                                               C=C_svr_rbf, gamma=gamma_svr_rbf)
        print("The optimal hyperparameters of SVR (kernel = rbf): " + str(optimal_pars))

        # Train a model using the optimal parameters and x_train, y_train
        tuned_model = SVR(**optimal_pars).fit(x_train, y_train)

        # Predict the testing data and training data
        predictions_train = tuned_model.predict(x_train)
        predictions_train = predictions_train.reshape(x_train.shape[0], order='C')  # Make it 1-D
        predictions_test = tuned_model.predict(x_test)
        predictions_test = predictions_test.reshape(x_test.shape[0], order='C')

        # Errors
        errors_train = errors_prediction(y_train, predictions_train)
        errors_test = errors_prediction(y_test, predictions_test)
        print('R^2_train: ', errors_train['r2_score'])
        print('R^2_test:', errors_test['r2_score'])

        # Save the parameters and errors
        predictions_labels_each_fold.append({'predictions_train': predictions_train,
                                             'labels_train': y_train,
                                             'predictions_test': predictions_test,
                                             'labels_test': y_test})
        params_each_fold.append(optimal_pars)
        errors_each_fold.append({'errors_train': errors_train,
                                 'errors_test': errors_test})
        tuned_models_each_fold.append(tuned_model)
        return errors_test['mse']

    # The following is equivalent to decorating compute_mse_svr_rbf with
    # @optunity.cross_validated(x=samples, y=labels, num_folds=num_folds_outer_cv)
    outer_cv = optunity.cross_validated(x=samples, y=labels,
                                        num_folds=num_folds_outer_cv)  # function decorator
    compute_mse_svr_rbf = outer_cv(compute_mse_svr_rbf)  # decorate compute_mse_svr_rbf
    compute_mse_svr_rbf()

    print('The cross-validation has been done!',
          datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S'))
    stop = datetime.datetime.now()
    print('Total time used ', stop - start)

    # Record the results
    ave_errors = errors_average(errors_each_fold)
    record_svr_rbf = {'model_name': save_model_name,
                      'date_time': date_time,
                      'C_range': C_svr_rbf,
                      'gamma_range': gamma_svr_rbf,
                      'num_folds_outer_cv': num_folds_outer_cv,
                      'num_iter_inner_cv': num_iter_inner_cv,
                      'num_folds_inner_cv': num_folds_inner_cv,
                      'num_evals_inner_cv': num_evals_inner_cv,
                      'tuned_models_each_fold': tuned_models_each_fold,
                      'predictions_labels_each_fold': predictions_labels_each_fold,
                      'optimal_parameters_each_fold': params_each_fold,
                      'errors_each_fold': errors_each_fold,
                      'average_errors': ave_errors,
                      'wavelengths': wavelengths}

    # Print the average errors of the CV
    print_ave_errors_cv(ave_errors)

    if flag_fig:
        # Plot the record of the first CV fold
        plot_regression_result(predictions_labels_each_fold[0]['labels_train'],
                               predictions_labels_each_fold[0]['predictions_train'])
        plot_regression_result(predictions_labels_each_fold[0]['labels_test'],
                               predictions_labels_each_fold[0]['predictions_test'])

    # //////////////// Train a model using all of the data ////////////////
    # Find the optimal parameters
    print('Training a SVR (kernel = rbf) instance.')

    @optunity.cross_validated(x=samples, y=labels, num_iter=num_iter_inner_cv,
                              num_folds=num_folds_inner_cv)
    def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
        model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
        predictions = model.predict(x_test)
        return optunity.metrics.mse(y_test, predictions)

    # Optimise parameters
    optimal_pars, _, _ = optunity.minimize(tune_cv, num_evals=num_evals_inner_cv,
                                           C=C_svr_rbf, gamma=gamma_svr_rbf)

    # Train a model using all of the data
    tuned_model_finial = SVR(**optimal_pars).fit(samples, labels)

    # Save the model
    if flag_save:
        joblib.dump(record_svr_rbf, save_record_name)
        joblib.dump(tuned_model_finial, save_model_name)
        print('The tuned_model_finial and the record have been saved!')

    return record_svr_rbf, tuned_model_finial

########################################################################
# SVM regression with rbf kernel (end)
########################################################################
def train(self, file, tuning, cache, save, svr, ignore):
    if svr:
        self.svr = True
        print("== SVR mode ==")
    else:
        self.svr = False

    if tuning:
        self.tuning = int(tuning)
    else:
        self.tuning = 0

    self.file = file

    if cache:
        self.cache = cache
    else:
        self.cache = 0

    self.datas = json.load(self.file)

    if ignore and ignore > 0:
        class_0 = 0
        class_1 = 0
        class_2 = 0
        self.ignore = ignore
        counter = list()
        for i in range(0, len(self.datas)):
            if self.datas[i]['score2'] > 300:
                counter.append(i)
                continue
            if self.ignore > 1 and self.datas[i]['followers_count'] > 10000:
                counter.append(i)
                continue
            if self.ignore > 2 and self.datas[i]['score'] > 20000:
                counter.append(i)
                continue
            if self.ignore > 3:
                if self.datas[i]['score2'] >= 200:
                    if class_2 > 30000:
                        counter.append(i)
                        continue
                    else:
                        class_2 += 1
                elif self.datas[i]['score2'] >= 50:
                    if class_1 > 30000:
                        counter.append(i)
                        continue
                    else:
                        class_1 += 1
                elif self.datas[i]['score2'] >= 0:
                    if class_0 > 30000:
                        counter.append(i)
                        continue
                    else:
                        class_0 += 1
        self.datas = [self.datas[i] for i in range(0, len(self.datas))
                      if i not in counter]
        print(str(len(counter)) + " aberrant values removed")
    else:
        self.ignore = 0

    # Split data (note: self.train shadows this train() method on the instance)
    self.train, self.test = train_test_split(self.datas, test_size=0.33,
                                             shuffle=True, random_state=42)

    # Format data
    if self.svr:
        self.test_y = [row['score'] for row in self.test]
        self.train_y = [row['score'] for row in self.train]
    else:
        self.test_y = [[row['score'], row['score2']] for row in self.test]
        self.train_y = [[row['score'], row['score2']] for row in self.train]

    def feature_row(row):
        return [row['hashtag'], row['weekday'], row['hour'],
                row['followers_count'], row['friends_count'],
                row['listed_count'], row['statuses_count'], row['text'],
                0, 0, 0, 0, 0, 0, 0, 0, 0]

    self.test_X = [feature_row(row) for row in self.test]
    self.train_X = [feature_row(row) for row in self.train]
    self.names = ['hashtag', 'weekday', 'hour', 'followers_count',
                  'friends_count', 'listed_count', 'statuses_count', 'text',
                  'quote', 'link', '...', '!', '?', '@', 'upper',
                  'polarity', 'subjectivity']

    # Prepare features
    self.prepare_columns()

    # baselines
    self.cache_baseline()

    # Normalize dataset
    print("Prepare dataset...")
    self.cache_dataset()

    if self.tuning == 1:
        print("Tuning model")
        if self.svr:
            outer_cv = optunity.cross_validated(x=self.train_X, y=self.train_y,
                                                num_folds=3)

            def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
                """Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""
                # define objective function for tuning
                @optunity.cross_validated(x=x_train, y=y_train, num_iter=2,
                                          num_folds=5)
                def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
                    print("tune_cv model C=" + str(C) + ", gamma=" + str(gamma))
                    model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
                    print("tune_cv model fit")
                    predictions = model.predict(x_test)
                    return optunity.metrics.mse(y_test, predictions)

                # optimize parameters
                optimal_pars, _, _ = optunity.minimize(tune_cv, 150,
                                                       C=[1, 100], gamma=[0, 50])
                print("optimal hyperparameters: " + str(optimal_pars))
                tuned_model = SVR(**optimal_pars).fit(x_train, y_train)
                predictions = tuned_model.predict(x_test)
                return optunity.metrics.mse(y_test, predictions)

            # wrap with outer cross-validation
            compute_mse_rbf_tuned = outer_cv(compute_mse_rbf_tuned)
            compute_mse_rbf_tuned()
        else:
            sample_leaf_options = [1, 5, 10, 50, 100, 200, 500]
            for leaf_size in sample_leaf_options:
                print(":: leaf_size = " + str(leaf_size))
                self.min_samples_leaf = leaf_size
                self.cache_model()
                print("Predict model")
                self.predictions = self.regr_rf.predict(self.test_X)
                print("Feature importance : ")
                print(sorted(zip(map(lambda x: round(x, 4),
                                     self.regr_rf.feature_importances_),
                                 self.names), reverse=True))
                self.test_score_rf = mean_squared_error(self.test_y, self.predictions)
                print('=Model Test MSE: %.3f' % self.test_score_rf)
                self.test_score = self.test_score_rf
                self.evaluation()
    elif self.tuning == 2:
        print("Tuning model 2")
        param_grid = {
            'bootstrap': [True, False],
            'max_depth': [80, 90, 100, 110],
            'max_features': [2, 3],
            'min_samples_leaf': [1, 3, 4, 5, 500],
            'min_samples_split': [8, 10, 12],
            'n_estimators': [100, 200, 300, 1000]
        }
        rf = RandomForestRegressor()
        # Instantiate the grid search model
        grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                                   cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(self.train_X, self.train_y)
        print("grid_search.best_params_=")
        print(grid_search.best_params_)
    else:
        print("Train model")
        self.cache_model()
        if save:
            print("Save model")
            self.save_model()
        print("Predict model")
        self.predictions = self.regr_rf.predict(self.test_X)
        if not self.svr:
            print("Feature importance : ")
            print(sorted(zip(map(lambda x: round(x, 4),
                                 self.regr_rf.feature_importances_),
                             self.names), reverse=True))
        self.test_score_rf = mean_squared_error(self.test_y, self.predictions)
        print('=Model Test MSE: %.3f' % self.test_score_rf)
        self.test_score = self.test_score_rf
        print('r2 = ')
        print(r2_score(self.test_y, self.predictions, multioutput='raw_values'))

        classif = list()
        classif_pred = list()
        for i in range(0, len(self.test_y)):
            if self.test_y[i][1] >= 200:
                classif.append(2)
            elif self.test_y[i][1] >= 50:
                classif.append(1)
            elif self.test_y[i][1] >= 0:
                classif.append(0)
        for i in range(0, len(self.predictions)):
            if self.predictions[i][1] >= 200:
                classif_pred.append(2)
            elif self.predictions[i][1] >= 50:
                classif_pred.append(1)
            elif self.predictions[i][1] >= 0:
                classif_pred.append(0)
        target_names = ['class 0', 'class 1', 'class 2']
        print(classification_report(classif, classif_pred,
                                    target_names=target_names))

        x = np.asarray(self.test_y)[:, 0]
        y = np.asarray(self.predictions)[:, 0]
        axis_max = [np.amax(x), np.amax(y)]  # avoid shadowing the max builtin
        x1 = [0, np.amax(axis_max)]
        plt.figure()
        plt.plot(x, y, 'r+')
        plt.plot(x1, x1)

        plt.figure()
        x = np.asarray(self.test_y)[:, 1]
        y = np.asarray(self.predictions)[:, 1]
        axis_max = [np.amax(x), np.amax(y)]
        x1 = [0, np.amax(axis_max)]
        plt.plot(x, y, 'g+')
        plt.plot(x1, x1)
        plt.show()
        self.evaluation()
def dead_single_opt(pmts, pmts_check, events):
    N = int(len(events) / 5)
    Events = [[event[j] for event in events if event[pmts[-1]] > 50][0:N]
              for j in pmts]
    logging.info('Number of Events Trained: ' + str(len(Events[0])))
    logging.info('PMT Used to Train: ' + str(pmts[-1]))
    data_train = list(zip(*Events[0:-1]))
    target_train = Events[-1]

    print('Normalizing Data')
    scaler = preprocessing.StandardScaler().fit(data_train)
    data_train = scaler.transform(data_train)

    # we explicitly generate the outer_cv decorator so we can use it twice
    outer_cv = optunity.cross_validated(x=data_train, y=target_train, num_folds=2)

    mse_old = 10e7

    def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
        """Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""
        global optimal_parameters, clf
        nonlocal mse_old  # track the best MSE seen so far across outer folds

        # define objective function for tuning
        @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=2)
        def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
            # sample_weights = my_scaling_odr(y_train)
            # sample_weights = [i / max(Events[-1]) for i in Events[-1]]
            model = svm.SVR(C=C, gamma=gamma).fit(x_train, y_train)  # , sample_weight=sample_weights
            predictions = model.predict(x_test)
            return optunity.metrics.mse(y_test, predictions)

        # optimize parameters
        optimal_pars, _, _ = optunity.minimize(tune_cv, 200, C=[1, 4000],
                                               gamma=[0, 10], pmap=optunity.pmap)
        logging.info("Optimal hyperparameters: " + str(optimal_pars))

        # sample_weights = my_scaling_odr(y_train)
        tuned_model = svm.SVR(**optimal_pars).fit(x_train, y_train)
        predictions = tuned_model.predict(x_test)
        mse = optunity.metrics.mse(y_test, predictions)
        logging.info('mse: ' + str(mse))
        if mse < mse_old:
            # keep the model from the best-performing fold
            mse_old = mse
            optimal_parameters = optimal_pars
            clf = tuned_model
        return mse

    # wrap with outer cross-validation
    compute_mse_rbf_tuned = outer_cv(compute_mse_rbf_tuned)
    print('Beginning Cross-Validated Optimization of HyperParameters')
    compute_mse_rbf_tuned()

    Events_check = [[event[j] for event in events if event[pmts_check[-1]] > 50]
                    for j in pmts_check]
    logging.info('Number of Events Trained: ' + str(len(Events_check[0])))
    logging.info('PMT Used to Train Final Function: ' + str(pmts_check[-1]))
    X_Span = list(zip(*Events_check[:-1]))
    X_Span = scaler.transform(X_Span)

    print('Predicting Data Now')
    pmt_estimate = clf.predict(X_Span)

    # print('Plotting Guessed Data Now')
    diff = [(pmt_estimate[i] - Events_check[-1][i]) / (Events_check[-1][i] + 1)
            for i in range(0, len(Events_check[-1]))]
    # print(np.mean(diff), np.std(diff))
    # print(np.mean(np.abs(diff)), np.std(np.abs(diff)))
    logging.critical('Final Average Absolute Relative Error: ' +
                     str(round(np.mean(np.abs(diff)), 3)) + '+-' +
                     str(round(np.std(np.abs(diff)), 3)))

    # plt.figure()
    # plt.plot(Events_check[-1], pmt_estimate, '*')
    # plt.plot([0, max(Events_check[-1])], [0, max(Events_check[-1])], 'r', label='Error = 0%')
    # plt.xlabel('Actual PMT Value')
    # plt.ylabel('Estimated PMT Value')
    # plt.show()
    return clf, scaler
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=5)
x_train = x_train.values
x_test = x_test.values
y_train = y_train.values
y_test = y_test.values

import optunity
import optunity.metrics
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

outer_cv = optunity.cross_validated(x=X, y=y, num_folds=3)

def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
    """Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""

    # define objective function for tuning
    @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)
    def tune_cv(x_train, y_train, x_test, y_test, C, gamma, epsilon):
        pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
                         ('svr', SVR(C=C, gamma=gamma, epsilon=epsilon))])
        model = pipe.fit(x_train, y_train)
        predictions = model.predict(x_test)
        return optunity.metrics.mse(y_test, predictions)

    # optimize parameters
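    # A hedged completion of the truncated block above: the minimize call and
    # the final tuned model follow the pattern of the other SVR snippets in
    # this file; the C, gamma and epsilon search boxes are assumed values.
    optimal_pars, _, _ = optunity.minimize(tune_cv, num_evals=150,
                                           C=[1, 100], gamma=[0, 50],
                                           epsilon=[0.01, 1])
    print("optimal hyperparameters: " + str(optimal_pars))
    tuned_pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
                           ('svr', SVR(**optimal_pars))])
    predictions = tuned_pipe.fit(x_train, y_train).predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

# wrap with outer cross-validation (assumed, mirroring the other snippets)
compute_mse_rbf_tuned = outer_cv(compute_mse_rbf_tuned)
compute_mse_rbf_tuned()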
box_constraints = {
    "learning_rate": [-7, -3],
    "num_nodes": [2, 20],
    "num_layers": [1, 4],
    "lr_decay": [0.0, 0.001],
    "momentum": [0.8, 0.95],
    "L1_reg": [0.05, 5.0],
    "L2_reg": [0.05, 5.0],
    "dropout": [0.0, 0.5]
}

opt_fxn = get_objective_function(100, update_fn=update_fn)
train = filter_train_by_visit(visit_type, data['train'])
opt_fxn = optunity.cross_validated(
    x=train[cols].values,
    y=np.column_stack((train.is_diab.values, train[time_col_train].values)),
    num_folds=num_folds)(opt_fxn)

opt_params, call_log, _ = optunity.maximize(opt_fxn, num_evals=50,
                                            solver_name='sobol',
                                            **box_constraints)

hyperparams = opt_params
hyperparams['hidden_layers_sizes'] = [int(hyperparams['num_nodes'])] * int(hyperparams['num_layers'])
del hyperparams['num_layers']
del hyperparams['num_nodes']
hyperparams['batch_norm'] = True
hyperparams['standardize'] = True
hyperparams['learning_rate'] = 10 ** hyperparams['learning_rate']
                    degree=degree,
                    coef0=coef0,
                    class_weight='balanced')
    elif kernel == 'rbf':
        model = SVC(kernel=kernel, C=C, gamma=10 ** logGamma,
                    class_weight='balanced')
    else:
        raise ValueError("Unknown kernel function: %s" % kernel)
    model.fit(x_train, y_train)
    return model

cv_decorator = optunity.cross_validated(
    x=scaler.transform(vec.transform(x_train).toarray()),
    y=classes, num_folds=3)

def svm_rbf_tuned_auroc(x_train, y_train, x_test, y_test, C, logGamma):
    model = SVC(C=C, gamma=10 ** logGamma,
                class_weight='balanced').fit(x_train, y_train)
    decision_values = model.decision_function(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc

def svm_tuned_auroc(x_train, y_train, x_test, y_test,
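# The svm_tuned_auroc definition above is cut off in the original source and
# continues elsewhere. Independent of it, here is a sketch of tuning the
# rbf-only objective with the decorator above; the C and logGamma ranges and
# num_evals are assumptions, following the optunity SVC tutorial pattern.
svm_rbf_tuned_auroc_cv = cv_decorator(svm_rbf_tuned_auroc)
optimal_rbf_pars, info, _ = optunity.maximize(svm_rbf_tuned_auroc_cv,
                                              num_evals=150,
                                              C=[0, 10], logGamma=[-5, 0])
print("Optimal rbf parameters: " + str(optimal_rbf_pars))
print("AUROC of tuned rbf SVM: %1.3f" % info.optimum)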
y = np.stack((ya, yb), axis=1)

NUM_EPOCHS = num_epochs
NUM_FOLDS = num_folds

global main_logger
main_logger = load_logger(logdir)
# main_logger.debug('Parameters: ' + str(args))

box_constraints = load_box_constraints(box)
main_logger.debug('Box Constraints: ' + str(box_constraints))

opt_fxn = get_objective_function(NUM_EPOCHS, logdir,
                                 utils.get_optimizer_from_str(update_fn))
opt_fxn = optunity.cross_validated(x=x, y=y, num_folds=NUM_FOLDS,
                                   strata=strata)(opt_fxn)

main_logger.debug('Maximizing C-Index. Num_iterations: %d' % num_evals)
opt_params, call_log, _ = optunity.maximize(opt_fxn, num_evals=num_evals,
                                            solver_name='sobol',
                                            **box_constraints)

main_logger.debug('Optimal Parameters: ' + str(opt_params))
main_logger.debug('Saving Call log...')
print(call_log._asdict())
save_call_log(os.path.join(logdir, 'optunity_log_%s.pkl' % (str(uuid.uuid4()))),
              call_log._asdict())
NUM_EPOCHS = args.num_epochs
NUM_FOLDS = args.num_folds

global main_logger
main_logger = load_logger(args.logdir)
main_logger.debug('Parameters: ' + str(args))

main_logger.debug('Loading dataset: ' + args.dataset)
x, y, strata = load_dataset(args.dataset)

box_constraints = load_box_constraints(args.box)
main_logger.debug('Box Constraints: ' + str(box_constraints))

opt_fxn = get_objective_function(NUM_EPOCHS, args.logdir,
                                 utils.get_optimizer_from_str(args.update_fn))
opt_fxn = optunity.cross_validated(x=x, y=y, num_folds=NUM_FOLDS,
                                   strata=strata)(opt_fxn)

main_logger.debug('Maximizing C-Index. Num_iterations: %d' % args.num_evals)
opt_params, call_log, _ = optunity.maximize(opt_fxn, num_evals=args.num_evals,
                                            solver_name='sobol',
                                            **box_constraints)

main_logger.debug('Optimal Parameters: ' + str(opt_params))
main_logger.debug('Saving Call log...')
print(call_log._asdict())
save_call_log(os.path.join(args.logdir,
                           'optunity_log_%s.pkl' % (str(uuid.uuid4()))),
              call_log._asdict())
exit(0)