Example #1
import numpy as np
import optunity
import optunity.metrics
from sklearn.cross_decomposition import PLSRegression


def find_optimal_n_components_plsr(x_train, y_train, max_n_components, num_folds_cv):
    # Store the average mse of different n_components
    list_ave_mse = []
    for n_components in range(1, max_n_components + 1):
        # Store the mse of the current n_components in cv
        list_mse = []

        # Define the function that computes the MSE on the training and testing data of each fold
        def compute_mse(x_train, y_train, x_test, y_test):
            model = PLSRegression(n_components=n_components).fit(x_train, y_train)
            predictions = model.predict(x_test)
            mse = optunity.metrics.mse(y_test, predictions)
            list_mse.append(mse)
            return mse

        # The cv object
        cv = optunity.cross_validated(x=x_train, y=y_train, num_folds=num_folds_cv)
        try:
            compute_mse_cv = cv(compute_mse)
            compute_mse_cv()
        except ValueError:
            print('ValueError: n_components in PLSR exceeds the dimension of the input data!')
            print('Searching for the optimal n_components within the valid range only.')
            break

        # Record the average MSE for this n_components
        ave_mse = np.mean(list_mse)
        list_ave_mse.append(ave_mse)

    # Find the n_components with the smallest average MSE (index + 1)
    optimal_n_components = np.argmin(list_ave_mse) + 1
    print("The optimal number of components of PLS: ", optimal_n_components)

    return optimal_n_components
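
# A minimal usage sketch with hypothetical demo data (not from the original
# source): tune n_components on random arrays, then fit a final model with it.
x_demo = np.random.rand(100, 20)   # 100 samples x 20 spectral bands
y_demo = np.random.rand(100)       # one response value per sample
n_opt = find_optimal_n_components_plsr(x_demo, y_demo, max_n_components=10, num_folds_cv=5)
final_model = PLSRegression(n_components=n_opt).fit(x_demo, y_demo)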
Example #2
    def __init__(self, images, labels, n_folds_cv):
        self.__space_ = {'kernel': {'linear': {'C': [0, 2]},
                                    'rbf': {'logGamma': [-5, 0], 'C': [0, 10]},
                                    'poly': {'degree': [2, 5], 'C': [0, 5], 'coef0': [0, 2]}}}

        self.__sgd_space_ = {'alpha1': [0.0001, 5], 'power_t': [0.1, 0.9]}
        self.__log = log.Logger()
        self.__cv_decorator_ = optunity.cross_validated(x=images, y=labels, num_folds=n_folds_cv)
    def run_optunity(self):
        cv_decorator = optunity.cross_validated(
            x=self.X,
            y=self.Y,
        )

        svm_tuned_auroc = cv_decorator(self.svm_tuned_auroc)

        optimal_svm_pars, info, _ = optunity.maximize_structured(
            svm_tuned_auroc, self.space, num_evals=150, pmap=optunity.pmap)
        print("Optimal parameters" + str(optimal_svm_pars))
        print("AUROC of tuned SVM: %1.3f" % info.optimum)

        df = optunity.call_log2dataframe(info.call_log)
        print(df.sort_values('value', ascending=False))
    def train(self):
        self._pca.fit(self._features_data)
        features_pca = self._pca.transform(self._features_data)

        cv_decorator = optunity.cross_validated(x=features_pca,
                                                y=self._labels,
                                                num_folds=5)

        svm_tuned = cv_decorator(svm_tuned_precision)

        optimal_svm_pars, _, _ = optunity.maximize_structured(
            svm_tuned,
            _SVM_SEARCH_SPACE,
            num_evals=self._config.get('num_evals', 100))

        self._model = _train_model(features_pca, self._labels,
                                   **optimal_svm_pars)
def prepare_svm(X, Y, prob_setting):
    '''
    Code inspired by http://optunity.readthedocs.org/en/latest/notebooks/notebooks/sklearn-svc.html#tune-svc-without-deciding-the-kernel-in-advance
    '''
    cv_decorator = optunity.cross_validated(x=X, y=Y, num_folds=10)
    space = {'kernel': {'linear': {'C': [0, 1000], 'class_weight_param': [1, 22]},
                        'rbf': {'logGamma': [-5, 1], 'C': [0, 1000], 'class_weight_param': [1, 22]},
                        'poly': {'degree': [2, 5], 'C': [0, 1000], 'coef0': [0, 100],
                                 'class_weight_param': [1, 22]}}}

    def train_model(x_train, y_train, kernel, C, logGamma, degree, coef0, class_weight_param):
        if kernel == 'linear':
            model = SVC(kernel=kernel, C=C, class_weight={1: class_weight_param})
        elif kernel == 'poly':
            model = SVC(kernel=kernel, C=C, degree=degree, coef0=coef0, class_weight={1: class_weight_param})
        elif kernel == 'rbf':
            model = SVC(kernel=kernel, C=C, gamma=10 ** logGamma, class_weight={1: class_weight_param})
        else:
            raise ValueError("Unknown kernel function: %s" % kernel)
        model.fit(x_train, y_train)
        return model


    def svm_tuned_auroc(x_train, y_train, x_test, y_test, kernel='linear', C=0, logGamma=0, degree=0, coef0=0, class_weight_param=1):
        model = train_model(x_train, y_train, kernel, C, logGamma, degree, coef0, class_weight_param)
        decision_values = model.decision_function(x_test)
        return optunity.metrics.roc_auc(y_test, decision_values)

    svm_tuned_auroc = cv_decorator(svm_tuned_auroc)

    optimal_svm_pars, info, _ = optunity.maximize_structured(svm_tuned_auroc, space, num_evals=200)
    print("Optimal parameters:"+str(optimal_svm_pars))
    print("AUROC of tuned SVM: %1.3f" % info.optimum)
    classifier = build_svc(optimal_svm_pars, prob_setting)
    classifier.fit(X, Y)
    return classifier
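
# Hypothetical usage (X and Y as passed to prepare_svm; prob_setting toggles
# probability estimates in the build_svc helper defined elsewhere in the source):
classifier = prepare_svm(X, Y, prob_setting=True)
predictions = classifier.predict(X)  # illustrative; predict on held-out data in practice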
import math
import itertools
import optunity
import optunity.metrics
import sklearn.svm

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
n = diabetes.data.shape[0]

data = diabetes.data
targets = diabetes.target

# we explicitly generate the outer_cv decorator so we can use it twice
outer_cv = optunity.cross_validated(x=data, y=targets, num_folds=3)


def compute_mse_standard(x_train, y_train, x_test, y_test):
    """Computes MSE of an SVR with RBF kernel and default hyperparameters.
    """
    model = sklearn.svm.SVR().fit(x_train, y_train)
    predictions = model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)


# wrap with outer cross-validation
compute_mse_standard = outer_cv(compute_mse_standard)

compute_mse_standard()
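
# To show why outer_cv was generated explicitly: the same decorator can wrap a
# second, tuned evaluation function so both variants are scored on identical
# folds. A sketch in the spirit of the optunity docs (bounds illustrative):
def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
    """Computes MSE of an SVR with RBF kernel and tuned hyperparameters."""

    # inner cross-validation objective for tuning C and gamma
    @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)
    def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
        model = sklearn.svm.SVR(C=C, gamma=gamma).fit(x_train, y_train)
        predictions = model.predict(x_test)
        return optunity.metrics.mse(y_test, predictions)

    # optimize hyperparameters on the training fold only
    optimal_pars, _, _ = optunity.minimize(tune_cv, 150, C=[1, 100], gamma=[0, 50])

    # retrain on the full training fold and score on the test fold
    tuned_model = sklearn.svm.SVR(**optimal_pars).fit(x_train, y_train)
    predictions = tuned_model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

# reuse the same outer folds as the untuned variant
compute_mse_rbf_tuned = outer_cv(compute_mse_rbf_tuned)
compute_mse_rbf_tuned()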

######################################
    pars, _, _ = optunity.minimize(inner_cv,
                                   num_evals=50,
                                   regularization=[0.001, 0.05],
                                   step=[0.01, 0.2])
    predict, w, b = train_lr(x_train, y_train, **pars)
    yhat = predict(x_test)
    loss = optunity.metrics.logloss(y_test, yhat)
    brier = optunity.metrics.brier(y_test, yhat)
    print('+ model: ' + str(b.get_value())[:5] + ' + ' + str(w.get_value()[0])[:5] + ' * x1 + ' + str(w.get_value()[1])[:5] + ' * x2')
    print('++ log loss in test fold: ' + str(loss))
    print('++ Brier loss in test fold: ' + str(brier))
    print('')
    return loss, brier

# wrap both evaluation functions in cross-validation
# we will compute two metrics using nested cross-validation
# for this purpose we use list_mean() as aggregator
outer_cv = optunity.cross_validated(x=data, y=labels, num_folds=3,
                                    aggregator=optunity.cross_validation.list_mean)
lr_untuned = outer_cv(lr_untuned)
lr_tuned = outer_cv(lr_tuned)

print('true model: 1 + 2 * x1 + 3 * x2')
print('')

# perform experiment
print('evaluating untuned LR model')
untuned_loss, untuned_brier = lr_untuned()

print('evaluating tuned LR model')
tuned_loss, tuned_brier = lr_tuned()

print('Log loss (lower is better):')
print('untuned: ' + str(untuned_loss))
Example #8
def modelling_PLSRegression(max_n_components,
                            num_folds_outer_cv,
                            num_folds_inner_cv,
                            input_data_array,
                            wavelengths,
                            labels,
                            flag_save=False,
                            flag_fig=False,
                            id_cv=0):
    """
    Modelling a PLS regression using cross-validation.

    :param max_n_components:
    :param num_folds_outer_cv:
    :param num_folds_inner_cv:
    :param input_data_array:
    :param wavelengths: recorded for reference only
    :param labels: the values to be predicted
    :param flag_save:
    :param flag_fig:
    :param id_cv: the id of cv to check
    :return: the record of cv and the model trained using all of the data.

    Author: Huajian Liu
    Email: [email protected]

    Version: v0 (10, Feb, 2019)
    """
    start = datetime.datetime.now()
    print('')
    print('PLS regression')
    print('The range of n_components is: [1, ' + str(max_n_components) + ']')
    print('')

    # For records
    date_time = datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S')
    save_record_name = 'record_plsr_' + date_time + '.sav'
    save_model_name = 'model_plsr_' + date_time + '.sav'

    ####################################################################################################################
    # Outer CV function for computing mean square error compute_mse_pls()
    ####################################################################################################################
    print('Conducting outer cross-validation')

    # For record.
    params_each_fold = []
    errors_each_fold = []
    predictions_labels_each_fold = []
    tuned_models_each_fold = []

    # Define the function for outer CV
    def compute_mse_pls(x_train, y_train, x_test, y_test):
        """Find the optimized n_nomponents.
           Train a model using the opt-parameter.
           compute MSE
        """

        # ##############################################################################################################
        # # Find the optimal parameter (n_components) of PLS
        # ##############################################################################################################
        optimal_n_components = find_optimal_n_components_plsr(x_train, y_train, max_n_components=max_n_components,
                                                              num_folds_cv=num_folds_inner_cv)


        ################################################################################################################
        # Train a model using the optimal parameters and the x_train and y_train
        ################################################################################################################
        # Train
        tuned_model = PLSRegression(n_components=optimal_n_components).fit(x_train, y_train)

        # Predict the testing data and training data
        predictions_train = tuned_model.predict(x_train)
        predictions_train = predictions_train.reshape(x_train.shape[0], order='C') # Make it one-D
        predictions_test = tuned_model.predict(x_test)
        predictions_test = predictions_test.reshape(x_test.shape[0], order='C')

        ################################################################################################################
        # Record errors and parameters
        ################################################################################################################
        errors_train = errors_prediction(y_train, predictions_train)
        errors_test = errors_prediction(y_test, predictions_test)
        print('R^2_train: ', errors_train['r2_score'])
        print('R^2_validation:', errors_test['r2_score'])
        print('')

        predictions_labels_each_fold.append({'predictions_train': predictions_train,
                                             'labels_train': y_train,
                                             'predictions_test': predictions_test,
                                             'labels_test': y_test})
        params_each_fold.append({'optimal_n_component': optimal_n_components})
        errors_each_fold.append({'errors_train': errors_train, 'errors_test': errors_test})
        tuned_models_each_fold.append(tuned_model)

        return errors_test['mse']

    # Activate outer CV
    outer_cv = optunity.cross_validated(x=input_data_array, y=labels, num_folds=num_folds_outer_cv)
    compute_mse_pls = outer_cv(compute_mse_pls)
    compute_mse_pls()

    print('The cross-validation has been done!', datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S'))
    stop = datetime.datetime.now()
    print('Total time used ', stop - start)

    ave_errors = errors_average(errors_each_fold)
    print_ave_errors_cv(ave_errors)

    ################################################################################################################
    # Train a model using all of the data
    ################################################################################################################
    print('')
    print('Training the final model using all of the data')

    optimal_n_components = find_optimal_n_components_plsr(input_data_array, labels, max_n_components=max_n_components,
                                                          num_folds_cv=num_folds_outer_cv)

    # Train a model using the optimal parameters and the x_train and y_train
    tuned_model_final = PLSRegression(n_components=optimal_n_components).fit(input_data_array, labels)
    print('')

    ####################################################################################################################
    # Record the results
    ####################################################################################################################
    record_pls = {'model_name': save_model_name,
                  'date_time': date_time,
                  'num_folds_outer_cv': num_folds_outer_cv,
                  'num_folds_inner_cv': num_folds_inner_cv,
                  'tuned_models_each_fold': tuned_models_each_fold,
                  'predictions_labels_each_fold': predictions_labels_each_fold,
                  'optimal_parameters_each_fold': params_each_fold,
                  'errors_each_fold': errors_each_fold,
                  'average_errors': ave_errors,
                  'wavelengths': wavelengths,
                  'input_data_array': input_data_array,
                  'tuned_model_final': tuned_model_final
                  }

    if flag_fig:
        # Plot a record in one (random selected) of the cv
        plot_regression_result(predictions_labels_each_fold[id_cv]['labels_train'],
                               predictions_labels_each_fold[id_cv]['predictions_train'])
        plot_regression_result(predictions_labels_each_fold[id_cv]['labels_test'],
                               predictions_labels_each_fold[id_cv]['predictions_test'])



    ####################################################################################################################
    # Save record
    ####################################################################################################################
    if flag_save:
        joblib.dump(record_pls, save_record_name)
        print('The record has been saved in the current working folder.')

    return record_pls
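
# A hypothetical invocation with demo arrays (assumes numpy imported as np;
# values are illustrative, not from the original source):
record = modelling_PLSRegression(max_n_components=10,
                                 num_folds_outer_cv=5,
                                 num_folds_inner_cv=5,
                                 input_data_array=np.random.rand(120, 50),
                                 wavelengths=np.linspace(400, 900, 50),
                                 labels=np.random.rand(120))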
Example #9
        label[index] = file[index][0]
    return label


# Load the data
f1 = np.loadtxt('D:/Study/Bioinformatics/AFP/feature_matrix/Antifp_Main/ASDC/train_ASDC.csv', delimiter = ',', skiprows = 1)
y_train = np.loadtxt('D:/Study/Bioinformatics/AFP/feature_matrix/Antifp_Main/train_label.csv', delimiter = ',')

sample = get_matrix(f1)
label = y_train
print(sample)
print(label)
#we will make the cross-validation decorator once, so we can reuse it later for the other tuning task
# by reusing the decorator, we get the same folds etc.

cv_decorator = optunity.cross_validated(x=sample, y=label, num_folds=5)

def svr_rforest_tuned_acc(x_train, y_train, x_test, y_test, n_estimators, max_depth,min_samples_leaf,
                        min_samples_split):
    rf = RandomForestClassifier(n_estimators=int(n_estimators),max_features='log2',
                                max_depth=int(max_depth),min_samples_leaf=int(min_samples_leaf),
                                min_samples_split=int(min_samples_split), n_jobs=-1).fit(x_train,y_train)
    y_pre = rf.predict(x_test)
    #pcc = round(np.corrcoef(y_pre, y_test)[0][1], 5)
    acc = optunity.metrics.accuracy(y_pre, y_test)
    # auc = optunity.metrics.roc_auc(y_test, decision_values)
    return acc
    #auc = optunity.metrics.roc_auc(y_test, decision_values)
    #print(pcc_test)
    #return optunity.metrics.mse(y_test, y_pre)
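
# The tuned-accuracy function above is never wired to an optimizer in this
# snippet; a sketch of the usual next step, reusing cv_decorator so every
# candidate is scored on the same folds (search ranges illustrative):
svr_rforest_tuned_acc_cv = cv_decorator(svr_rforest_tuned_acc)
optimal_pars, info, _ = optunity.maximize(svr_rforest_tuned_acc_cv, num_evals=100,
                                          n_estimators=[10, 500], max_depth=[2, 30],
                                          min_samples_leaf=[1, 10], min_samples_split=[2, 20])
print('Optimal parameters: ' + str(optimal_pars))
print('Cross-validated accuracy: %1.3f' % info.optimum)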
Example #10
negative_digit = 9

positive_idx = [i for i in range(n) if digits.target[i] == positive_digit]
negative_idx = [i for i in range(n) if digits.target[i] == negative_digit]

# add some noise to the data to make it a little challenging
original_data = digits.data[positive_idx + negative_idx, ...]
data = original_data + 5 * numpy.random.randn(original_data.shape[0], original_data.shape[1])
labels = [True] * len(positive_idx) + [False] * len(negative_idx)

# we will use nested 3-fold cross-validation
# in the outer cross-validation procedure
# we make the decorator explicitly so we can reuse the same folds
# in both tuned and untuned approaches
folds = optunity.cross_validation.generate_folds(data.shape[0], num_folds=3)
outer_cv = optunity.cross_validated(x=data, y=labels, num_folds=3, folds=[folds],
                                    aggregator=optunity.cross_validation.identity)

# compute area under ROC curve of default parameters
def compute_roc_standard(x_train, y_train, x_test, y_test):
    model = sklearn.svm.SVC().fit(x_train, y_train)
    decision_values = model.decision_function(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc

# decorate with cross-validation
compute_roc_standard = outer_cv(compute_roc_standard)
roc_standard = compute_roc_standard()
#print('Nested cv area under ROC curve of non-tuned model: ' + str(roc_standard))

# compute area under ROC curve with tuned parameters
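# A sketch of how the tuned branch plausibly continues, mirroring the nested
# pattern used elsewhere in these examples (search ranges illustrative):
def compute_roc_tuned(x_train, y_train, x_test, y_test):
    # inner cross-validation objective over the hyperparameters
    @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)
    def inner_cv(x_train, y_train, x_test, y_test, C, logGamma):
        model = sklearn.svm.SVC(C=C, gamma=10 ** logGamma).fit(x_train, y_train)
        decision_values = model.decision_function(x_test)
        return optunity.metrics.roc_auc(y_test, decision_values)

    # optimize hyperparameters on the training fold only
    optimal_pars, _, _ = optunity.maximize(inner_cv, 150, C=[0, 10], logGamma=[-5, 0])

    # retrain on the full training fold with the tuned hyperparameters
    model = sklearn.svm.SVC(C=optimal_pars['C'],
                            gamma=10 ** optimal_pars['logGamma']).fit(x_train, y_train)
    decision_values = model.decision_function(x_test)
    return optunity.metrics.roc_auc(y_test, decision_values)

compute_roc_tuned = outer_cv(compute_roc_tuned)
roc_tuned = compute_roc_tuned()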
Example #11
scaler = StandardScaler()

data = X
data = scaler.fit_transform(data)
labels = y

space = {
    'kernel': {
        'rbf': {
            'logGamma': [-5, 0],
            'C': [0, 10]
        },
    }
}

cv_decorator = optunity.cross_validated(x=data, y=labels, num_folds=5)


def train_model(x_train, y_train, kernel, C, logGamma, degree, coef0):
    """A generic SVM training function, with arguments based on the chosen kernel."""
    if kernel == 'linear':
        model = svm.SVC(kernel=kernel, C=C, cache_size=10000, verbose=3)
    elif kernel == 'poly':
        model = svm.SVC(kernel=kernel,
                        C=C,
                        degree=degree,
                        coef0=coef0,
                        cache_size=10000,
                        verbose=3)
    elif kernel == 'rbf':
        model = svm.SVC(kernel=kernel,
                        C=C,
                        gamma=10 ** logGamma,
                        cache_size=10000,
                        verbose=3)
    else:
        raise ValueError("Unknown kernel function: %s" % kernel)
    model.fit(x_train, y_train)
    return model
Example #12
        return optunity.metrics.logloss(y_test, yhat)

    pars, _, _ = optunity.minimize(inner_cv,
                                   num_evals=50,
                                   regularization=[0.001, 0.05],
                                   step=[0.01, 0.2])
    predict, w, b = train_lr(x_train, y_train, **pars)
    yhat = predict(x_test)
    loss = optunity.metrics.logloss(y_test, yhat)
    brier = optunity.metrics.brier(y_test, yhat)
    return loss, brier


# wrap both evaluation functions in cross-validation
# we will compute two metrics using nested cross-validation
# for this purpose we use list_mean() as aggregator
outer_cv = optunity.cross_validated(
    x=train,
    y=labels,
    num_folds=3,
    aggregator=optunity.cross_validation.list_mean)
lr_untuned = outer_cv(lr_untuned)
lr_tuned = outer_cv(lr_tuned)

print('true model: 1 + 2 * x1 + 3 * x2')
print('')

# perform experiment
print('evaluating untuned LR model')
untuned_loss, untuned_brier = lr_untuned()
Example #13
negative_digit = 9

positive_idx = [i for i in range(n) if digits.target[i] == positive_digit]
negative_idx = [i for i in range(n) if digits.target[i] == negative_digit]

# add some noise to the data to make it a little challenging
original_data = digits.data[positive_idx + negative_idx, ...]
data = original_data + 5 * numpy.random.randn(original_data.shape[0], original_data.shape[1])
labels = [True] * len(positive_idx) + [False] * len(negative_idx)

# we will use nested 3-fold cross-validation
# in the outer cross-validation procedure
# we make the decorator explicitly so we can reuse the same folds
# in both tuned and untuned approaches
folds = optunity.cross_validation.generate_folds(data.shape[0], num_folds=3)
outer_cv = optunity.cross_validated(x=data, y=labels, num_folds=3, folds=[folds],
                                    aggregator=optunity.cross_validation.identity)

# compute area under ROC curve of default parameters
def compute_roc_standard(x_train, y_train, x_test, y_test):
    model = sklearn.svm.SVC().fit(x_train, y_train)
    decision_values = model.decision_function(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc

# decorate with cross-validation
compute_roc_standard = outer_cv(compute_roc_standard)
roc_standard = compute_roc_standard()
print('Nested cv area under ROC curve of non-tuned model: ' + str(roc_standard))

# Data
direc = '../data/'
file = direc + 'housing.csv'
df = pd.read_csv(file, delim_whitespace=True, header=None)
# split into X and y
X = df.iloc[:, 0:13].values
y = df.iloc[:, 13].values
num = X.shape[1]
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

# SVC
outer_cv = optunity.cross_validated(x=x_train, y=y_train, num_folds=3)
space = {
    'kernel': {
        'linear': {
            'C': [0, 100]
        },
        'rbf': {
            'gamma': [0, 50],
            'C': [1, 100]
        },
        'poly': {
            'degree': [2, 5],
            'C': [1000, 20000],
            'coef0': [0, 1]
        }
    }
}
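
# The snippet ends after defining the search space; a sketch of how it might
# continue (hypothetical: since the housing target is continuous, an SVR
# objective with MSE is assumed here rather than a classifier):
from sklearn.svm import SVR

def svm_tuned_mse(x_train, y_train, x_test, y_test, kernel='linear', C=0, gamma=0, degree=0, coef0=0):
    C = max(C, 1e-6)  # guard: sklearn requires C > 0
    if kernel == 'linear':
        model = SVR(kernel=kernel, C=C)
    elif kernel == 'rbf':
        model = SVR(kernel=kernel, C=C, gamma=max(gamma, 1e-6))  # gamma must be > 0
    elif kernel == 'poly':
        model = SVR(kernel=kernel, C=C, degree=int(degree), coef0=coef0)
    else:
        raise ValueError("Unknown kernel function: %s" % kernel)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

svm_tuned_mse = outer_cv(svm_tuned_mse)
optimal_pars, info, _ = optunity.minimize_structured(svm_tuned_mse, search_space=space, num_evals=100)
print('Optimal parameters: ' + str(optimal_pars))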
Example #15
def modelling_svr_rbf(C_svr_rbf,
                      gamma_svr_rbf,
                      wavelengths_range,
                      input_type,
                      num_folds_outer_cv,
                      num_iter_inner_cv,
                      num_folds_inner_cv,
                      num_evals_inner_cv,
                      samples,
                      wavelengths,
                      labels,
                      flag_save,
                      flag_fig):
    """ Model a svr with rbf kernel."""

    start = datetime.datetime.now()
    print('')
    print('svr (kernel = rbf)')
    print('The range of C is: ', C_svr_rbf)
    print('The range of gamma is: ', gamma_svr_rbf)
    print('')

    # For records
    date_time = datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S')
    model_name = 'svr_rbf'
    save_record_name = 'record' + '_' + wavelengths_range + '_' + input_type + '_' + model_name + '.sav'
    save_model_name = 'model' + '_' + wavelengths_range + '_' + input_type + '_' + model_name + '.sav'

    # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    # CV
    # ///
    print('Conducting cross-validation')

    # For record
    params_each_fold = []
    errors_each_fold = []
    predictions_labels_each_fold = []
    tuned_models_each_fold = []

    # ==================================================================================================================
    # The function for outer_cv
    # ==========================
    def compute_mse_svr_rbf(x_train, y_train, x_test, y_test):
        """Find the optimal hyperparameters of svm;
           Train a model using the optmal parametes
           compute MSE
        """

        # -------------------------------------------------------------------------------------------------------------
        # Find optimal parameters
        # ------------------------
        @optunity.cross_validated(x=x_train, y=y_train, num_iter=num_iter_inner_cv,
                                  num_folds=num_folds_inner_cv)
        def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
            model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
            predictions = model.predict(x_test)
            return optunity.metrics.mse(y_test, predictions)

        # Optimise parameters
        optimal_pars, _, _ = optunity.minimize(tune_cv, num_evals=num_evals_inner_cv, C=C_svr_rbf, gamma=gamma_svr_rbf)
        print("THe optimal hyperparameters of SVR (kernel = rbf): " + str(optimal_pars))
        # -----------------------
        # Find optimal parameters
        # -------------------------------------------------------------------------------------------------------------

        # Train a model using the optimal parameters and the x_train and y_train
        tuned_model = SVR(**optimal_pars).fit(x_train, y_train)

        # Predict the testing data and training data
        predictions_train = tuned_model.predict(x_train)
        predictions_train = predictions_train.reshape(x_train.shape[0], order='C') # Make it one-D
        predictions_test = tuned_model.predict(x_test)
        predictions_test = predictions_test.reshape(x_test.shape[0], order='C')

        # Errors
        errors_train = errors_prediction(y_train, predictions_train)
        errors_test = errors_prediction(y_test, predictions_test)
        print('R^2_train: ', errors_train['r2_score'])
        print('R^2_test:', errors_test['r2_score'])

        # Save the parameters and errors
        predictions_labels_each_fold.append({'predictions_train': predictions_train,
                                             'labels_train': y_train,
                                             'predictions_test': predictions_test,
                                             'labels_test': y_test})
        params_each_fold.append(optimal_pars)
        errors_each_fold.append({'errors_train': errors_train, 'errors_test': errors_test})
        tuned_models_each_fold.append(tuned_model)
        return errors_test['mse']
    # =========================
    # The function for outer cv
    # ==================================================================================================================

    # The following is the same as:
    # @optunity.cross_validated(x=samples, y=labels, num_folds=num_folds_outer_cv)
    # def compute_mse_svr_rbf:
    #     ...
    #
    # compute_mse_svr_rbf()
    outer_cv = optunity.cross_validated(x=samples, y=labels, num_folds=num_folds_outer_cv)  # function decorator
    compute_mse_svr_rbf = outer_cv(compute_mse_svr_rbf)  # decorate compute_mse_svr_rbf
    compute_mse_svr_rbf()

    print('The cross-validation has been done!', datetime.datetime.now().strftime('%y-%m-%d-%H-%M-%S'))
    stop = datetime.datetime.now()
    print('Total time used ', stop - start)
    # ///
    # CV
    # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    # Record the results
    ave_errors = errors_average(errors_each_fold)
    record_svr_rbf = {'model_name': save_model_name,
                      'date_time': date_time,
                      'C_range': C_svr_rbf,
                      'gamma_range': gamma_svr_rbf,
                      'num_folds_outer_cv': num_folds_outer_cv,
                      'num_iter_inner_cv': num_iter_inner_cv,
                      'num_folds_inner_cv': num_folds_inner_cv,
                      'num_evals_inner_cv': num_evals_inner_cv,
                      'tuned_models_each_fold': tuned_models_each_fold,
                      'predictions_labels_each_fold': predictions_labels_each_fold,
                      'optimal_parameters_each_fold': params_each_fold,
                      'errors_each_fold': errors_each_fold,
                      'average_errors': ave_errors,
                      'wavelengths': wavelengths
                      }

    # Print average of cv
    print_ave_errors_cv(ave_errors)

    if flag_fig:
        # Plot a record in one (random selected) of the cv
        plot_regression_result(predictions_labels_each_fold[0]['labels_train'],
                               predictions_labels_each_fold[0]['predictions_train'])
        plot_regression_result(predictions_labels_each_fold[0]['labels_test'],
                               predictions_labels_each_fold[0]['predictions_test'])


    # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    # Train a model using all of the data
    # ////////////////////////////////////
    # ==================================================================================================================
    # Find the optimal parameters
    # ============================
    print('Training an SVR (kernel = rbf) instance.')
    @optunity.cross_validated(x=samples, y=labels, num_iter=num_iter_inner_cv,
                              num_folds=num_folds_inner_cv)
    def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
        model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
        predictions = model.predict(x_test)
        return optunity.metrics.mse(y_test, predictions)


    # Optimise parameters
    optimal_pars, _, _ = optunity.minimize(tune_cv, num_evals=num_evals_inner_cv, C=C_svr_rbf, gamma=gamma_svr_rbf)
    # ============================
    # Find the optimal parameters
    # ==================================================================================================================

    # Train a model using all of the data
    tuned_model_final = SVR(**optimal_pars).fit(samples, labels)
    # ///////////////////////////////////
    # Train a model using all of the data
    # //////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    # Save the model
    if flag_save:
        joblib.dump(record_svr_rbf, save_record_name)
        joblib.dump(tuned_model_final, save_model_name)
        print('The tuned_model_final and the record have been saved!')

    return record_svr_rbf, tuned_model_final
    # SVM regression with rbf kernel
################################
# SVM regression with rbf kernel
########################################################################################################################
Example #16
	def train(self, file, tuning, cache, save, svr, ignore):
		
		if svr:
			self.svr = True
			print("== SVR mode ==")
		else:
			self.svr = False
			
		if tuning:
			self.tuning = int(tuning)
		else:
			self.tuning = 0
			
		self.file = file
		
		if cache:
			self.cache = cache
		else:
			self.cache = 0
			
		self.datas = json.load(self.file)
		
		if ignore and ignore > 0:
			class_0 = 0
			class_1 = 0
			class_2 = 0
			self.ignore = ignore
			counter = list()
			for i in range(0, len(self.datas)):
				if self.datas[i]['score2'] > 300:
					counter.append(i)
					continue
				if self.ignore > 1 and self.datas[i]['followers_count'] > 10000:
					counter.append(i)
					continue
				if self.ignore > 2 and self.datas[i]['score'] > 20000:
					counter.append(i)
					continue
				if self.ignore > 3:
					if self.datas[i]['score2'] >= 200:
						if class_2 > 30000:
							counter.append(i)
							continue
						else:
							class_2 += 1
					elif self.datas[i]['score2'] >= 50:
						if class_1 > 30000:
							counter.append(i)
							continue
						else:
							class_1 += 1
					elif self.datas[i]['score2'] >= 0:
						if class_0 > 30000:
							counter.append(i)
							continue
						else:
							class_0 += 1
				
			self.datas = [self.datas[i] for i in range(0, len(self.datas)) if i not in counter]
			print(str(len(counter))+" aberrant values removed")
		else:
			self.ignore = 0
		# Split data
		self.train, self.test = train_test_split(self.datas, test_size=0.33, shuffle=True, random_state=42)
		
		# Format data
		if self.svr:
			self.test_y, self.train_y = [row['score'] for row in self.test], [row['score'] for row in self.train]
		else:
			self.test_y, self.train_y = [[row['score'], row['score2']] for row in self.test], [[row['score'], row['score2']] for row in self.train]
		self.test_X, self.train_X = [[row['hashtag'], row['weekday'], row['hour'], row['followers_count'], row['friends_count'], row['listed_count'], row['statuses_count'], row['text'], 0, 0, 0, 0, 0, 0, 0, 0, 0] for row in self.test], [[row['hashtag'], row['weekday'], row['hour'], row['followers_count'], row['friends_count'], row['listed_count'], row['statuses_count'], row['text'], 0, 0, 0, 0, 0, 0, 0, 0, 0] for row in self.train]
		self.names = ['hashtag', 'weekday', 'hour', 'followers_count', 'friends_count', 'listed_count', 'statuses_count', 'text', 'quote', 'link', '...', '!', '?', '@', 'upper', 'polarity', 'subjectivity' ]
		
		# Prepare features
		self.prepare_columns()
		
		# baselines
		self.cache_baseline()
		
		# Normalize dataset
		print("Prepare dataset...")
		self.cache_dataset()
		
		if self.tuning == 1:
			print("Tuning model")
			if self.svr:
				outer_cv = optunity.cross_validated(x=self.train_X, y=self.train_y, num_folds=3)
				def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
					"""Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""

					# define objective function for tuning
					@optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)
					def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
						print("tune_cv model C="+str(C)+", gamma="+str(gamma))
						model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
						print("tune_cv model fit")
						predictions = model.predict(x_test)
						return optunity.metrics.mse(y_test, predictions)

					# optimize parameters
					optimal_pars, _, _ = optunity.minimize(tune_cv, 150, C=[1, 100], gamma=[0, 50])
					print("optimal hyperparameters: " + str(optimal_pars))

					tuned_model = SVR(**optimal_pars).fit(x_train, y_train)
					predictions = tuned_model.predict(x_test)
					return optunity.metrics.mse(y_test, predictions)

				# wrap with outer cross-validation
				compute_mse_rbf_tuned = outer_cv(compute_mse_rbf_tuned)
				compute_mse_rbf_tuned()
			else:
				sample_leaf_options = [1,5,10,50,100,200,500]
				for leaf_size in sample_leaf_options :
					print(":: leaf_size = " + str(leaf_size))
					self.min_samples_leaf = leaf_size
					self.cache_model()
					
					print("Predict model")
					self.predictions = self.regr_rf.predict(self.test_X)
					
					print("Feature importance : ")
					print(sorted(zip(map(lambda x: round(x, 4), self.regr_rf.feature_importances_), self.names), reverse=True))
					self.test_score_rf = mean_squared_error(self.test_y, self.predictions)
					print('=Model Test MSE: %.3f' % self.test_score_rf)
					
					self.test_score = self.test_score_rf
					
					self.evaluation()
				
		elif self.tuning == 2:
			print("Tuning model 2")
			param_grid = { \
				'bootstrap': [True, False],\
				'max_depth': [80, 90, 100, 110],\
				'max_features': [2, 3],\
				'min_samples_leaf': [1, 3, 4, 5, 500],\
				'min_samples_split': [8, 10, 12],\
				'n_estimators': [100, 200, 300, 1000]\
			}
			rf = RandomForestRegressor()
			# Instantiate the grid search model
			grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
			grid_search.fit(self.train_X, self.train_y)
			print("grid_search.best_params_=")
			print(grid_search.best_params_)
			
				
		else:
			print("Train model")
			
			self.cache_model()
			
			if save:
				print("Save model")
				self.save_model()
			
			print("Predict model")
			self.predictions = self.regr_rf.predict(self.test_X)
			
			if not self.svr:
				print("Feature importance : ")
				print(sorted(zip(map(lambda x: round(x, 4), self.regr_rf.feature_importances_), self.names), reverse=True))
			self.test_score_rf = mean_squared_error(self.test_y, self.predictions)
			print('=Model Test MSE: %.3f' % self.test_score_rf)
			self.test_score = self.test_score_rf
			print('r2 = ')
			print(r2_score(self.test_y, self.predictions, multioutput='raw_values'))
			
			classif = list()
			classif_pred = list()
			for i in range(0, len(self.test_y)):
				if self.test_y[i][1] >= 200:
					classif.append(2)
				elif self.test_y[i][1] >= 50:
					classif.append(1)		
				elif self.test_y[i][1] >= 0:
					classif.append(0)
					
			for i in range(0, len(self.predictions)):
				if self.predictions[i][1] >= 200:
					classif_pred.append(2)
				elif self.predictions[i][1] >= 50:
					classif_pred.append(1)		
				elif self.predictions[i][1] >= 0:
					classif_pred.append(0)
			
			target_names = ['class 0', 'class 1', 'class 2']
			print(classification_report(classif, classif_pred, target_names=target_names))
			
			x = np.asarray(self.test_y)[:,0]
			y = np.asarray(self.predictions)[:,0]
			max = [np.amax(x), np.amax(y)]
			x1 = [0, np.amax(max)]
			plt.figure()
			plt.plot(x, y, 'r+')
			plt.plot(x1, x1)
			plt.figure()
			x = np.asarray(self.test_y)[:,1]
			y = np.asarray(self.predictions)[:,1]
			max = [np.amax(x), np.amax(y)]
			x1 = [0, np.amax(max)]
			plt.plot(x, y, 'g+')
			plt.plot(x1, x1)
			plt.show()
			
			self.evaluation()
def dead_single_opt(pmts, pmts_check, events):
    N = int(len(events) / 5)
    Events = [[event[j] for event in events if event[pmts[-1]] > 50][0:N]
              for j in pmts]
    logging.info('Number of Events Trained: ' + str(len(Events[0])))
    logging.info('PMT Used to Train: ' + str(pmts[-1]))
    data_train = list(zip(*Events[0:-1]))
    target_train = Events[-1]

    print('Normalizing Data')
    scaler = preprocessing.StandardScaler().fit(data_train)
    data_train = scaler.transform(data_train)

    # we explicitly generate the outer_cv decorator so we can use it twice
    outer_cv = optunity.cross_validated(x=data_train,
                                        y=target_train,
                                        num_folds=2)
    mse_old = 10e7

    def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
        """Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""
        global optimal_parameters, clf
        nonlocal mse_old  # allow updating the running best MSE below
        # define objective function for tuning
        @optunity.cross_validated(x=x_train,
                                  y=y_train,
                                  num_iter=2,
                                  num_folds=2)
        def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
            # sample_weights = my_scaling_odr(y_train)
            # sample_weights = [i / max(Events[-1]) for i in Events[-1]]

            model = svm.SVR(C=C, gamma=gamma).fit(
                x_train, y_train)  #, sample_weight=sample_weights
            predictions = model.predict(x_test)
            return optunity.metrics.mse(y_test, predictions)

        # optimize parameters
        optimal_pars, _, _ = optunity.minimize(tune_cv,
                                               200,
                                               C=[1, 4000],
                                               gamma=[0, 10],
                                               pmap=optunity.pmap)
        logging.info("Optimal hyperparameters: " + str(optimal_pars))
        # sample_weights = my_scaling_odr(y_train)

        tuned_model = svm.SVR(**optimal_pars).fit(x_train, y_train)
        predictions = tuned_model.predict(x_test)
        mse = optunity.metrics.mse(y_test, predictions)
        logging.info('mse: ' + str(mse))
        if mse < mse_old:
            mse_old = mse
            optimal_parameters = optimal_pars
            clf = tuned_model
        return mse

    # wrap with outer cross-validation
    compute_mse_rbf_tuned = outer_cv(compute_mse_rbf_tuned)
    print('Beginning Cross-Validated Optimization of HyperParameters')
    compute_mse_rbf_tuned()
    Events_check = [[
        event[j] for event in events if event[pmts_check[-1]] > 50
    ] for j in pmts_check]
    logging.info('Number of Events Trained: ' + str(len(Events_check[0])))
    logging.info('PMT Used to Train Final Function: ' + str(pmts_check[-1]))
    X_Span = list(zip(*Events_check[:-1]))
    X_Span = scaler.transform(X_Span)
    print('Predicting Data Now')
    pmt_estimate = clf.predict(X_Span)

    # print('Plotting Guessed Data Now')

    diff = [(pmt_estimate[i] - Events_check[-1][i]) / (Events_check[-1][i] + 1)
            for i in range(0, len(Events_check[-1]))]
    # print(np.mean(diff), np.std(diff))
    # print(np.mean(np.abs(diff)), np.std(np.abs(diff)))
    logging.critical('Final Average Absolute Relative Error: ' +
                     str(round(np.mean(np.abs(diff)), 3)) + '+-' +
                     str(round(np.std(np.abs(diff)), 3)))

    # plt.figure()
    # plt.plot(Events_check[-1], pmt_estimate, '*')
    # plt.plot([0, max(Events_check[-1])], [0, max(Events_check[-1])], 'r', label='Error = 0%')
    # plt.xlabel('Actual PMT Value')
    # plt.ylabel('Estimated PMT Value')
    # plt.show()

    return clf, scaler
Example #18
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=5)
x_train = x_train.values
x_test = x_test.values
y_train = y_train.values
y_test = y_test.values

import optunity
import optunity.metrics
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

outer_cv = optunity.cross_validated(x=X, y=y, num_folds=3)


def compute_mse_rbf_tuned(x_train, y_train, x_test, y_test):
    """Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""

    # define objective function for tuning
    @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)
    def tune_cv(x_train, y_train, x_test, y_test, C, gamma, epsilon):
        pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
                         ('svr', SVR(C=C, gamma=gamma, epsilon=epsilon))])
        model = pipe.fit(x_train, y_train)
        predictions = model.predict(x_test)
        return optunity.metrics.mse(y_test, predictions)

    # optimize parameters
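    # The example is cut off here; based on the sibling snippets above it would
    # plausibly continue as follows (bounds illustrative, not from the source):
    optimal_pars, _, _ = optunity.minimize(tune_cv, 150, C=[1, 100], gamma=[0, 50], epsilon=[0, 1])

    # retrain on the full training fold with the tuned hyperparameters
    pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
                     ('svr', SVR(**optimal_pars))])
    tuned_model = pipe.fit(x_train, y_train)
    predictions = tuned_model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)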
Example #19
box_constraints = {
    "learning_rate": [-7, -3],
    "num_nodes": [2, 20],
    "num_layers": [1, 4],
    "lr_decay": [0.0, 0.001],
    "momentum": [0.8, 0.95],
    "L1_reg": [0.05, 5.0],
    "L2_reg": [0.05, 5.0],
    "dropout": [0.0, 0.5]
}
opt_fxn = get_objective_function(100, update_fn=update_fn)
train = filter_train_by_visit(visit_type, data['train'])

opt_fxn = optunity.cross_validated(x=train[cols].values,
                                   y=np.column_stack(
                                       (train.is_diab.values,
                                        train[time_col_train].values)),
                                   num_folds=num_folds)(opt_fxn)
opt_params, call_log, _ = optunity.maximize(opt_fxn,
                                            num_evals=50,
                                            solver_name='sobol',
                                            **box_constraints)
hyperparams = opt_params

hyperparams['hidden_layers_sizes'] = [int(hyperparams['num_nodes'])] * int(
    hyperparams['num_layers'])
del hyperparams['num_layers']
del hyperparams['num_nodes']
hyperparams['batch_norm'] = True
hyperparams['standardize'] = True
hyperparams['learning_rate'] = 10**hyperparams['learning_rate']
Example #20
                    degree=degree,
                    coef0=coef0,
                    class_weight='balanced')
    elif kernel == 'rbf':
        model = SVC(kernel=kernel,
                    C=C,
                    gamma=10**logGamma,
                    class_weight='balanced')
    else:
        raise ValueError("Unknown kernel function: %s" % kernel)
    model.fit(x_train, y_train)
    return model


cv_decorator = optunity.cross_validated(
    x=scaler.transform(vec.transform(x_train).toarray()),
    y=classes,
    num_folds=3)


def svm_rbf_tuned_auroc(x_train, y_train, x_test, y_test, C, logGamma):
    model = SVC(C=C, gamma=10**logGamma,
                class_weight='balanced').fit(x_train, y_train)
    decision_values = model.decision_function(x_test)
    auc = optunity.metrics.roc_auc(y_test, decision_values)
    return auc


def svm_tuned_auroc(x_train,
                    y_train,
                    x_test,
                    y_test,
y = np.stack((ya, yb), axis=1)

NUM_EPOCHS = num_epochs
NUM_FOLDS = num_folds

global main_logger
main_logger = load_logger(logdir)
#main_logger.debug('Parameters: ' + str(args))

box_constraints = load_box_constraints(box)
main_logger.debug('Box Constraints: ' + str(box_constraints))

opt_fxn = get_objective_function(NUM_EPOCHS, logdir,
                                 utils.get_optimizer_from_str(update_fn))
opt_fxn = optunity.cross_validated(x=x,
                                   y=y,
                                   num_folds=NUM_FOLDS,
                                   strata=strata)(opt_fxn)

main_logger.debug('Maximizing C-Index. Num_iterations: %d' % num_evals)
opt_params, call_log, _ = optunity.maximize(opt_fxn,
                                            num_evals=num_evals,
                                            solver_name='sobol',
                                            **box_constraints)

main_logger.debug('Optimal Parameters: ' + str(opt_params))
main_logger.debug('Saving Call log...')
print(call_log._asdict())
save_call_log(
    os.path.join(logdir, 'optunity_log_%s.pkl' % (str(uuid.uuid4()))),
    call_log._asdict())
    NUM_EPOCHS = args.num_epochs
    NUM_FOLDS = args.num_folds

    global main_logger
    main_logger = load_logger(args.logdir)
    main_logger.debug('Parameters: ' + str(args))

    main_logger.debug('Loading dataset: ' + args.dataset)
    x, y, strata = load_dataset(args.dataset)

    box_constraints = load_box_constraints(args.box)
    main_logger.debug('Box Constraints: ' + str(box_constraints))

    opt_fxn = get_objective_function(NUM_EPOCHS, args.logdir, 
        utils.get_optimizer_from_str(args.update_fn))
    opt_fxn = optunity.cross_validated(x=x, y=y, num_folds=NUM_FOLDS,
        strata=strata)(opt_fxn)

    main_logger.debug('Maximizing C-Index. Num_iterations: %d' % args.num_evals)
    opt_params, call_log, _ = optunity.maximize(opt_fxn, num_evals=args.num_evals,
        solver_name='sobol',
        **box_constraints)

    main_logger.debug('Optimal Parameters: ' + str(opt_params))
    main_logger.debug('Saving Call log...')
    print(call_log._asdict())
    save_call_log(os.path.join(args.logdir, 'optunity_log_%s.pkl' % (str(uuid.uuid4()))), call_log._asdict())
    exit(0)