Example #1
y_pred = model.predict(X_test)  # predict

y_pred = scaler.inverse_transform(y_pred)  # invert the StandardScaler applied to the targets

y_pred_pd = pd.DataFrame(data=y_pred, columns=['Toughness'])
y_pred_pd = y_pred_pd.reset_index()
y_pred_pd.to_csv('n96084094_HW4_1.csv', index=False)  # save the data
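For the inverse_transform above to be meaningful, scaler must be the same StandardScaler instance that was fit on the training targets. A minimal sketch of that assumed setup (the variable names are hypothetical):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit on the training targets only; the same instance then inverts the predictions
y_train_scaled = scaler.fit_transform(y_train.reshape(-1, 1))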

# In[7]:

# Fully connected (Dense)
from keras import models
from keras import layers

model = models.Sequential()  # build the model
model.add(layers.Dense(16, activation='relu', input_shape=(64, )))  # 16 output units
model.add(layers.Dense(16, activation='relu'))  # 16 output units
model.add(layers.Dense(1,
                       activation='linear'))  # 1 output unit; linear activation is the identity, y = wx + b

# In[8]:

model.compile(
    optimizer='rmsprop',
    loss='mean_squared_error',  # standard loss for regression problems
    metrics=['mse'])  # suitable regression metrics: MSE, MAE, MAPE, cosine; accuracy is not meaningful here

# In[9]:

history = model.fit(X_train,
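The snippet is cut off mid-call above; a hypothetical completion for illustration only, with the epoch count, batch size, and validation split all assumed:

history = model.fit(X_train, y_train,
                    epochs=100,             # assumed
                    batch_size=32,          # assumed
                    validation_split=0.2)   # assumed hold-out fraction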
Example #2
result = pd.Series(cv_model, index=alphas)
result.plot()
print(result.min())
plt.title('lasso with alphas')
plt.show()
'''
# ridge ------------------------------------------ 
alphas = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 130]  # best alpha: 80
cv_model = [rmse_cv(Ridge(alpha=n), more_train, y_train0).mean() for n in alphas]
result = pd.Series(cv_model, index=alphas)
print(result.min())
result.plot()
plt.title('ridge with alphas')
plt.show()
'''
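Both sweeps above call an rmse_cv helper that is not shown. A minimal sketch of what such a helper typically looks like, matching the call signature used above (the implementation itself is an assumption):

import numpy as np
from sklearn.model_selection import cross_val_score

def rmse_cv(model, X, y):
    # scikit-learn returns negated MSE scores, so flip the sign before the sqrt
    mse = -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    return np.sqrt(mse)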
'''
# DNN -----------------------------------------------
from keras.models import Sequential
from keras.layers import Dense
model = Sequential()
model.add(Dense(1000, input_dim=len(X_train.columns), activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(1, activation='relu'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train.values, y_train.values, epochs=200, batch_size=100)
pred = model.predict(X_test.values).reshape(len(X_test))
'''
'''
# gbrt -----------------------------------------
num = [150, 200, 250]  # best: 200
regressor = DecisionTreeRegressor()
regressor.fit(X, y)
final_prediction = regressor.predict(test)

#Run support vector regression
from sklearn.svm import SVR
regressor = SVR(kernel='linear')
regressor.fit(X, y)
final_prediction = regressor.predict(test)

#run linear regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X, y)
final_prediction = regressor.predict(test)
"""
#Run ANN
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
import os

regressor = Sequential()
regressor.add(Dense(units = 61, kernel_initializer = 'uniform', activation = 'sigmoid', input_dim = 121))
# Adding the second hidden layer
regressor.add(Dense(units= 61, kernel_initializer ='uniform', activation = 'sigmoid'))
# no activation function on the output layer for a regression problem
regressor.add(Dense(units = 1, kernel_initializer = 'uniform'))
X = np.array(X)
y = np.array(y)
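The ANN regressor above is defined but never compiled or trained in this excerpt; a minimal sketch of the missing steps, with the optimizer, epoch count, and batch size all assumed:

regressor.compile(optimizer='adam', loss='mean_squared_error')  # assumed settings
regressor.fit(X, y, batch_size=10, epochs=100, verbose=0)       # assumed settings
final_prediction = regressor.predict(np.array(test)).ravel()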
def create_model_output(model_type, input_values_combined, output_values_range,
                        count, clf, no_epochs):
    if model_type == 'regression':
        ''' ML regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        [pred_train, clf] = ML_linear(xtrain, ytrain)
        pred_train = clf.predict(xtrain)
        pred_test = clf.predict(xtest[lstm_history:, :])
        pred_train = pred_train.ravel()
        pred_test = pred_test.ravel()
        ytest = ytest[lstm_history:]
    elif model_type == 'neural net':
        ''' NN regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        [pred_train, optimal_size, optimal_alpha,
         clf] = ML_Optimizer_NN(xtrain, ytrain, xtest, ytest, range_size,
                                range_alpha)
        pred_train = clf.predict(xtrain)
        pred_test = clf.predict(xtest[lstm_history:, :])
        pred_train = pred_train.ravel()
        pred_test = pred_test.ravel()
        ytest = ytest[lstm_history:]
    elif model_type == 'RF':
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        clf = RandomForestRegressor(max_depth=10,
                                    random_state=0,
                                    n_estimators=50)
        clf.fit(xtrain, ytrain)
        print(clf.feature_importances_)
        pred_train = clf.predict(xtrain)
        pred_test = clf.predict(xtest[lstm_history:, :])
        pred_train = pred_train.ravel()
        pred_test = pred_test.ravel()
        ytest = ytest[lstm_history:]
    elif model_type == 'lstm':
        ''' LSTM regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        xtrain = xtrain.reshape((xtrain.shape[0], 1, xtrain.shape[1]))
        xtest = xtest.reshape((xtest.shape[0], 1, xtest.shape[1]))
        if count == 0:
            clf = Sequential()
            clf.add(LSTM(200, input_shape=(xtrain.shape[1], xtrain.shape[2])))
            clf.add(Dense(1))
            clf.compile(loss='mae', optimizer='adam')
            history = clf.fit(xtrain,
                              ytrain,
                              epochs=10,
                              batch_size=5,
                              validation_data=(xtrain, ytrain),
                              verbose=1,
                              shuffle=True)
            pyplot.plot(history.history['loss'], label='train')
            pyplot.plot(history.history['val_loss'], label='validation')  # note: validation_data above is the training set itself
            pyplot.legend()
            pyplot.show()
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest[lstm_history:, :])
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()
            ytest = ytest[lstm_history:]
            s = pickle.dumps(clf)
        else:
            clf = pickle.loads(s)
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest[lstm_history:, :])
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()
            ytest = ytest[lstm_history:]
    elif model_type == 'lstm_3D':
        ''' LSTM regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        [xtrain_3D, xtest_3D, ytrain_3D,
         ytest_3D] = create3D(xtrain, xtest, ytrain, ytest, lstm_history)
        xtrain = xtrain_3D
        xtest = xtest_3D
        ytrain = ytrain_3D
        ytest = ytest_3D
        if count >= 0:  # always true for non-negative counts, so the else branch below is unreachable
            clf = Sequential()
            clf.add(LSTM(200, input_shape=(xtrain.shape[1], xtrain.shape[2])))
            clf.add(Dense(1))
            clf.compile(loss='mae', optimizer='adam')
            history = clf.fit(xtrain,
                              ytrain,
                              epochs=no_epochs,
                              batch_size=10,
                              validation_data=(xtrain, ytrain),
                              verbose=1,
                              shuffle=True)
            pyplot.plot(history.history['loss'], label='train')
            pyplot.plot(history.history['val_loss'], label='validation')  # note: validation_data above is the training set itself
            pyplot.legend()
            pyplot.show()
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest)
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()
            s = pickle.dumps(clf)
        else:
            #clf = pickle.loads(s)
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest)
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()

    corr_coefficient_train = np.corrcoef(pred_train, ytrain)
    corr_coefficient_test = np.corrcoef(pred_test, ytest)
    final_performance.append([
        model_type,
        np.sum(abs(pred_test - ytest)), corr_coefficient_test[0, 1],
        np.sum(abs(pred_train - ytrain)), corr_coefficient_train[0, 1]
    ])
    print(final_performance)
    df_performance = pd.DataFrame(final_performance)
    df_performance.to_csv('df_performance.csv', index=False, header=True)
    return pred_train, pred_test, ytrain, ytest, clf
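A hypothetical call for illustration; input_values_combined and output_values_range are assumed to be arrays prepared elsewhere in the source module, alongside the module-level test_window, lstm_history, and final_performance used inside the function:

pred_train, pred_test, ytrain, ytest, clf = create_model_output(
    model_type='RF',
    input_values_combined=input_values_combined,
    output_values_range=output_values_range,
    count=0,
    clf=None,        # only reused by the 'lstm' branches
    no_epochs=10)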
Example #5
    def get_baseline(self, cv_mode=True, test_mode=False):
        """
        Computes a loss baseline for the ML-algorithm based on its default hyperparameter configuration
        (either cross validation loss or test loss after full training)
        :param cv_mode: bool
            Flag that indicates, whether to perform cross validation or simple validation
        :param test_mode: bool
            Flag that indicates, whether to compute the loss on the test set or not
        :return:
        baseline: float
             Loss of the baseline HP-configuration.
        """
        if self.is_time_series:
            # Use TimeSeriesSplit for time series data
            kf = TimeSeriesSplit(n_splits=5)
        else:
            # Create K-Folds cross validator for all other data types
            kf = KFold(n_splits=5, shuffle=self.shuffle)

        cv_baselines = []
        cv_iter = 0

        # Iterate over the cross validation splits
        for train_index, val_index in kf.split(X=self.x_train):
            cv_iter = cv_iter + 1

            # Cross validation
            if cv_mode and not test_mode:

                x_train_cv, x_val_cv = self.x_train.iloc[train_index], self.x_train.iloc[val_index]
                y_train_cv, y_val_cv = self.y_train.iloc[train_index], self.y_train.iloc[val_index]

            # Separate a validation set, but do not perform cross validation
            elif not cv_mode and not test_mode and cv_iter < 2:

                x_train_cv, x_val_cv, y_train_cv, y_val_cv = train_test_split(self.x_train, self.y_train, test_size=0.2,
                                                                              shuffle=self.shuffle, random_state=0)

            # Training on full training set and evaluation on test set
            elif not cv_mode and test_mode and cv_iter < 2:

                x_train_cv, x_val_cv = self.x_train, self.x_test
                y_train_cv, y_val_cv = self.y_train, self.y_test

            elif cv_mode and test_mode:

                raise Exception('Cross validation is not implemented for test mode.')

            # Further iterations are unnecessary when not cross-validating
            else:
                continue

            if self.ml_algorithm == 'RandomForestRegressor':
                model = RandomForestRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'RandomForestClassifier':
                model = RandomForestClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'SVR':
                model = SVR(cache_size=500)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'SVC':
                model = SVC(random_state=0, cache_size=500)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'AdaBoostRegressor':
                model = AdaBoostRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'AdaBoostClassifier':
                model = AdaBoostClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'DecisionTreeRegressor':
                model = DecisionTreeRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'DecisionTreeClassifier':
                model = DecisionTreeClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'LinearRegression':
                model = LinearRegression()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'KNNRegressor':
                model = KNeighborsRegressor()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'KNNClassifier':
                model = KNeighborsClassifier()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'LogisticRegression':
                model = LogisticRegression()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'NaiveBayes':
                model = GaussianNB()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'MLPRegressor':
                model = MLPRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'MLPClassifier':
                model = MLPClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'ElasticNet':
                model = ElasticNet(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'KerasRegressor' or self.ml_algorithm == 'KerasClassifier':

                # Use the warmstart configuration to create a baseline for Keras models

                epochs = 100

                # Initialize the neural network
                model = keras.Sequential()

                # Add input layer
                model.add(keras.layers.InputLayer(input_shape=len(x_train_cv.keys())))

                # Add first hidden layer
                if warmstart_keras['hidden_layer1_size'] > 0:
                    model.add(
                        keras.layers.Dense(warmstart_keras['hidden_layer1_size'],
                                           activation=warmstart_keras['hidden_layer1_activation']))
                    model.add(keras.layers.Dropout(warmstart_keras['dropout1']))

                # Add second hidden layer
                if warmstart_keras['hidden_layer2_size'] > 0:
                    model.add(
                        keras.layers.Dense(warmstart_keras['hidden_layer2_size'],
                                           activation=warmstart_keras['hidden_layer2_activation']))
                    model.add(keras.layers.Dropout(warmstart_keras['dropout2']))

                # Add output layer
                if self.ml_algorithm == 'KerasRegressor':

                    model.add(keras.layers.Dense(1, activation='linear'))

                    # Select optimizer and compile the model
                    adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])
                    model.compile(optimizer=adam, loss='mse', metrics=['mse'])

                elif self.ml_algorithm == 'KerasClassifier':

                    # Determine the number of different classes depending on the data format
                    if isinstance(y_train_cv, pd.Series):
                        num_classes = int(max(y_train_cv) - min(y_train_cv) + 1)

                    elif isinstance(y_train_cv, pd.DataFrame):
                        num_classes = len(y_train_cv.keys())

                    else:
                        raise Exception('Unknown data format!')

                    # Binary classification
                    if num_classes <= 2:

                        # 'Sigmoid is equivalent to a 2-element Softmax, where the second element is assumed to be zero'
                        # https://keras.io/api/layers/activations/#sigmoid-function
                        model.add(keras.layers.Dense(1, activation='sigmoid'))

                        adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])
                        model.compile(optimizer=adam, loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

                    # Multiclass classification
                    else:

                        # Use softmax activation for multiclass clf. -> 'Softmax converts a real vector to a vector of
                        # categorical probabilities.[...]the result could be interpreted as a probability distribution.'
                        # https://keras.io/api/layers/activations/#softmax-function
                        model.add(keras.layers.Dense(num_classes, activation='softmax'))

                        adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])
                        model.compile(optimizer=adam, loss=keras.losses.CategoricalCrossentropy(),
                                      metrics=[keras.metrics.CategoricalAccuracy()])

                # Learning rate schedule
                if warmstart_keras["lr_schedule"] == "cosine":
                    schedule = functools.partial(cosine, initial_lr=warmstart_keras["init_lr"], T_max=epochs)

                elif warmstart_keras["lr_schedule"] == "exponential":
                    schedule = functools.partial(exponential, initial_lr=warmstart_keras["init_lr"], T_max=epochs)

                elif warmstart_keras["lr_schedule"] == "constant":
                    schedule = functools.partial(fix, initial_lr=warmstart_keras["init_lr"])

                else:
                    raise Exception('Unknown learning rate schedule!')

                # Determine the learning rate for this iteration and pass it as callback
                lr = keras.callbacks.LearningRateScheduler(schedule)

                # Early stopping callback
                early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                               min_delta=0,
                                                               patience=10,
                                                               verbose=1,
                                                               mode='auto',
                                                               restore_best_weights=True)

                callbacks_list = [lr, early_stopping]

                # Train the model
                model.fit(x_train_cv, y_train_cv, epochs=epochs, batch_size=warmstart_keras['batch_size'],
                          validation_data=(x_val_cv, y_val_cv), callbacks=callbacks_list,
                          verbose=0)

                # Make the prediction
                y_pred = model.predict(x_val_cv)

                # In case of binary classification round to the nearest integer
                if self.ml_algorithm == 'KerasClassifier':

                    # Binary classification
                    if num_classes <= 2:

                        y_pred = np.rint(y_pred)

                    # Multiclass classification
                    else:

                        # Identify the predicted class (maximum probability) in each row
                        for row_idx in range(y_pred.shape[0]):

                            # Predicted class
                            this_class = np.argmax(y_pred[row_idx, :])

                            # Iterate over columns / classes
                            for col_idx in range(y_pred.shape[1]):

                                if col_idx == this_class:
                                    y_pred[row_idx, col_idx] = 1
                                else:
                                    y_pred[row_idx, col_idx] = 0

                # KerasRegressor
                else:
                    y_pred = np.reshape(y_pred, newshape=(-1,))

            elif self.ml_algorithm == 'XGBoostRegressor':
                model = XGBRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'XGBoostClassifier':
                model = XGBClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'LGBMRegressor' or self.ml_algorithm == 'LGBMClassifier':
                # Create lgb datasets
                train_data = lgb.Dataset(x_train_cv, label=y_train_cv)
                valid_data = lgb.Dataset(x_val_cv, label=y_val_cv)

                # Specify the ML task and the random seed
                if self.ml_algorithm == 'LGBMRegressor':
                    # Regression task
                    params = {'objective': 'regression',
                              'seed': 0}

                elif self.ml_algorithm == 'LGBMClassifier':

                    # Determine the number of classes
                    num_classes = int(max(y_train_cv) - min(y_train_cv) + 1)

                    # Binary classification task
                    if num_classes <= 2:
                        params = {'objective': 'binary',
                                  'seed': 0}

                    # Multiclass classification task
                    else:
                        params = {'objective': 'multiclass',  # uses Softmax objective function
                                  'num_class': num_classes,
                                  'seed': 0}

                lgb_clf = lgb.train(params=params, train_set=train_data, valid_sets=[valid_data], verbose_eval=False)

                # Make the prediction
                y_pred = lgb_clf.predict(data=x_val_cv)

                # Classification task
                if self.ml_algorithm == 'LGBMClassifier':

                    # Binary classification: round to the nearest integer
                    if num_classes <= 2:

                        y_pred = np.rint(y_pred)

                    # Multiclass classification: identify the predicted class based on the one-hot-encoded probabilities
                    else:

                        y_one_hot_proba = np.copy(y_pred)
                        n_rows = y_one_hot_proba.shape[0]

                        y_pred = np.zeros(shape=(n_rows, 1))

                        # Identify the predicted class for each row (highest probability)
                        for row in range(n_rows):
                            y_pred[row, 0] = np.argmax(y_one_hot_proba[row, :])

            else:
                raise Exception('Unknown ML-algorithm!')

            # Add remaining ML-algorithms here

            cv_baselines.append(self.metric(y_val_cv, y_pred))

        if cv_mode:

            # Compute the average cross validation loss
            baseline = np.mean(cv_baselines)

        else:
            baseline = cv_baselines[0]

        return baseline
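self.metric is assumed to be any callable with the scikit-learn (y_true, y_pred) signature that returns a loss; a sketch of that wiring, where experiment stands for an instance of the surrounding class (the name is hypothetical):

from sklearn.metrics import mean_squared_error

experiment.metric = mean_squared_error  # any (y_true, y_pred) -> float loss works
baseline_mse = experiment.get_baseline(cv_mode=True, test_mode=False)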
    def train(self):
        start = timeit.default_timer()
        train_x, train_y, feature_list = self.feature_extraction()
        self._feature_size = [train_x.shape[1], 1]
        self._features = feature_list

        if not self._silent:
            print "Train has %d instances " % (len(train_x))

        counts = Counter(train_y)
        expectation_ratio = 1 / float(len(counts.keys()))
        n_samples = len(train_y)
        for key, value in counts.items():
            tmp = float(expectation_ratio) / (float(value) / float(n_samples))
            if tmp > 6 or tmp < (1.0 / 6.0):
                self._data_balance = True

        extra_fit_args = dict()
        if self._weight_col is not None:
            extra_fit_args['sample_weight'] = train_x[self._weight_col].values
            del train_x[self._weight_col]

        if 0 < self._bootstrap < 1.0:
            if self._bootstrap_seed is not None:
                if not self._silent:
                    print "Setting bootstrap seed to %d" % self._bootstrap_seed
                np.random.seed(self._bootstrap_seed)
                random.seed(self._bootstrap_seed)
            bootstrap_len = int(math.floor(self._bootstrap * len(train_x)))
            bootstrap_ix = random.sample(range(len(train_x)), bootstrap_len)
            train_x = train_x.iloc[bootstrap_ix]
            train_x = train_x.reset_index(drop=True)
            train_y = train_y.iloc[bootstrap_ix]
            train_y = train_y.reset_index(drop=True)

        model = None

        if self._model_type == "RandomForestRegressor":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = RandomForestRegressor(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
                self._predict = lambda args: self.continuous_predict(x=args[1])  # args = (fitted_model, pred_x)
                self._have_feat_importance = True

        elif self._model_type == "RandomForestClassifier":
            if model is None:
                # if self._data_balance is True:
                #     self._fit_args.update({"class_weight": "balanced"})
                model = RandomForestClassifier(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
                self._predict = lambda args: self.pred_proba(x=args[1])  # args = (fitted_model, pred_x)
            self._staged_predict = lambda args: [self._predict(args)]
            self._have_feat_importance = True

        elif self._model_type == "ExtraTreesRegressor":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = ExtraTreesRegressor(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
                self._predict = lambda args: self.continuous_predict(x=args[1])  # args = (fitted_model, pred_x)
                self._have_feat_importance = True

        elif self._model_type == "ExtraTreesClassifier":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = ExtraTreesClassifier(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
            self._predict = lambda args: self.pred_proba(x=args[1])  # args = (fitted_model, pred_x)
            self._staged_predict = lambda args: [self._predict(args)]
            self._have_feat_importance = True

        elif self._model_type == "GradientBoostingRegressor":
            if model is None:
                model = GradientBoostingRegressor(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
            elif self._load_type == "fit_more":
                model.warm_start = True
                model.n_estimators += self._fit_args['n_estimators']
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda args: self.continuous_predict(x=args[1])  # args = (fitted_model, pred_x)
            self._staged_predict = lambda args: self.staged_pred_continuous(x=args[1])
            if self._load_type == "pred_at" and self._fit_args[
                    'n_estimators'] < model.n_estimators:
                if not self._silent:
                    print("Predict using %d trees" %
                          self._fit_args['n_estimators'])
                self._predict = lambda args: self.staged_pred_continuous_at_n(
                    x=args[1], n=self._fit_args['n_estimators'])
        elif self._model_type == "GradientBoostingClassifier":
            if model is None:
                model = GradientBoostingClassifier(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
            elif self._load_type == "fit_more":
                model.warm_start = True
                model.n_estimators += self._fit_args['n_estimators']
                model.fit(X=train_x, y=train_y)
                self._model = model
                self._staged_predict = lambda args: self.staged_pred_proba(x=args[1])
            self._predict = lambda args: self.pred_proba(x=args[1])  # args = (fitted_model, pred_x)
            if self._load_type == "pred_at" and self._fit_args[
                    'n_estimators'] < model.n_estimators:
                if not self._silent:
                    print("Predict using %d trees" %
                          self._fit_args['n_estimators'])
                self._predict = lambda args: self.staged_pred_proba_at_n(
                    x=args[1], n=self._fit_args['n_estimators'])
        elif self._model_type == "LogisticRegression":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = LogisticRegression(**self._fit_args)
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda args: self.pred_proba(x=args[1])  # args = (fitted_model, pred_x)
            self._staged_predict = lambda args: [self._predict(args)]

        elif self._model_type == "SVC":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = sklearn.svm.SVC(**self._fit_args)
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda args: self.pred_proba(x=args[1])  # args = (fitted_model, pred_x)
            self._staged_predict = lambda args: [self._predict(args)]
        elif self._model_type == "CNN":
            if model is None:
                train_data = load_pd_df(self._input_dir + '/train.csv')
                indices, max_len = self.process_date_list(
                    train_data['Date'].map(
                        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')))
                self._feature_size = [train_x.shape[1], max_len]

                NB_FILTER = [64, 128]
                NB_Size = [4, 3, 3]
                FULLY_CONNECTED_UNIT = 256
                model = Sequential()
                model.add(
                    Conv2D(NB_FILTER[0], (train_x.shape[1], NB_Size[0]),
                           input_shape=train_x.shape,
                           padding='valid',
                           activation='relu'))
                model.add(MaxPooling2D(pool_size=(1, 3)))
                model.add(
                    Conv2D(NB_FILTER[1], (1, NB_Size[1]), padding='valid'))
                model.add(MaxPooling2D(pool_size=(1, 3)))
                model.add(Flatten())
                model.add(
                    Dense(FULLY_CONNECTED_UNIT,
                          activation='relu',
                          kernel_constraint=maxnorm(3),
                          kernel_regularizer=regularizers.l2(0.01)))
                model.add(Dense(2, activation='softmax'))
                model.compile(loss='categorical_crossentropy',
                              optimizer=Adamax(),
                              metrics=['accuracy'])
                model.fit(train_x,
                          train_y,
                          batch_size=16,
                          epochs=50,
                          verbose=1)
        elif self._model_type == "LSTM":
            if model is None:
                train_data = load_pd_df(self._input_dir + '/train.csv')
                indices, max_len = self.process_date_list(
                    train_data['Date'].map(
                        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')))
                self._feature_size = [train_x.shape[1], max_len]

                class_weight = {
                    1:
                    np.divide(float(n_samples), float(
                        (len(counts) * counts[1]))),
                    0:
                    np.divide(float(n_samples), float(
                        (len(counts) * counts[0])))
                }
                # class_weight = {1: 10,
                #                 0: 1}
                model = CNN_LSTM(
                    (self._feature_size[0], 4),
                    (None, self._feature_size[0], self._feature_size[1], 1))
                model.fit_generator(
                    generator=self.generator(train_x, train_y, indices,
                                             max_len),
                    epochs=20,
                    class_weight=class_weight,
                    steps_per_epoch=train_x.shape[0] // self._batch_size)
                # model.fit_generator(generator=self.generator(train_x, train_y, indices, max_len),
                #                     epochs=1, class_weight=class_weight, steps_per_epoch=1)
                self._model = model

        elif self._model_type == "Pipeline":
            if model is None:
                model = Pipeline([
                    ('pre_process',
                     get_class(self._fit_args['pre_process']['name'])(
                         self._fit_args['pre_process']['args'])),
                    ('model', get_class(self._fit_args['model']['name'])(
                        self._fit_args['model']['args']))
                ])
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda args: self.pred_proba(x=args[1])  # args = (fitted_model, pred_x)
            self._staged_predict = lambda args: [self._predict(args)]

        if not self._silent:
            stop = timeit.default_timer()
            print "Train time: %d s" % (stop - start)

        del train_x, train_y
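The prediction lambdas above delegate to continuous_predict and pred_proba helpers that are not shown; a plausible sketch of what they do, inferred from how they are called (an assumption):

    def continuous_predict(self, x):
        # regression: return the fitted model's raw outputs
        return self._model.predict(x)

    def pred_proba(self, x):
        # classification: return the probability of the positive class
        return self._model.predict_proba(x)[:, 1]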
def train_data(X_train, X_test, y_train, y_test, prompt):
  
    if prompt == "RF":
        model = RandomForestRegressor(max_depth=20)
        regressor = "Random Forest"
    
    elif prompt == "KNN":
        model = KNeighborsRegressor(n_neighbors=5)
        regressor = "KNN"
        
    elif prompt == "DT":
        model = DecisionTreeRegressor(max_depth=10)
        regressor = "Decision Tree"
    
    elif prompt == "LR":
        model = LinearRegression()
        regressor = "Linear Regression"
        
    elif prompt == "LSVR":
        model = SVR(kernel='linear')
        regressor = "Linear SVR"
    
    elif prompt == "RBFSVR":
        model = SVR(kernel='rbf')
        regressor = "RBF Kernel SVR"
    
    elif prompt == "PSVR":
        model = SVR(kernel='poly')
        regressor = "Polynomial SVR"
    
    elif prompt == "ANN":
        regressor = "ANN"
        model = Sequential()
        model.add(Dense(input_dim=4, units=6, activation='tanh'))
        model.add(Dense(units=4, activation='tanh'))
        model.add(Dense(units=4, activation='tanh'))
        model.add(Dense(units=3, activation='relu'))
        model.compile(loss="mse", metrics=['mae'], optimizer='adam')

    else:
        raise ValueError("Please enter a valid regression model!")

    print(f"\nWORKING FOR {regressor.upper()} MODEL")

    # optimal_r2 = 0    
    coffs, intercepts = None, None
    
    if "SVR" in prompt:
        y_pred = []
        r2s = []
        coffs = []
        intercepts = []
        mapping = {0:"a", 1:"b", 2:"c"}
        
        for i in mapping:
            model.fit(X_train, y_train[:, i])
            y_pred.append(model.predict(X_test))
            r2s.append(round(r2_score(y_test[:, i], model.predict(X_test)), 3))
            if prompt == "LSVR":
              coffs.append(np.round(model.coef_, 3))
            else:
              coffs.append(np.round(model.dual_coef_, 3))
            intercepts.append(np.round(model.intercept_[0], 3))
            
        y_pred = np.array(y_pred).T
        result = round(np.array(r2s).mean(), 3)
        print(f"R2 Score: {result}")
            
    else:                
        if prompt == "ANN":
            model.fit(X_train, y_train, validation_split=0.1, epochs=100, verbose=0)
        else:
            model.fit(X_train, y_train)
            
        y_pred = model.predict(X_test)
        result = round(r2_score(y_test, y_pred), 3)
        print(f"R2 Score: {result}")
        
    return y_pred, regressor, model, result, coffs, intercepts
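A hypothetical call, assuming four input features (matching the ANN's input_dim=4) and three regression targets (matching its final layer), with X and y as NumPy arrays:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
y_pred, regressor, model, r2, coffs, intercepts = train_data(
    X_train, X_test, y_train, y_test, prompt="RF")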
Example #8
    def get_baseline_loss(self):
        """
        Computes the loss for the default hyperparameter configuration of the ML-algorithm (baseline).
        :return:
        baseline_loss: float
            Validation loss of the baseline HP-configuration
        """
        if self.ml_algorithm == 'RandomForestRegressor':
            model = RandomForestRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'SVR':
            model = SVR()
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'AdaBoostRegressor':
            model = AdaBoostRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'DecisionTreeRegressor':
            model = DecisionTreeRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'LinearRegression':
            model = LinearRegression()
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'KNNRegressor':
            model = KNeighborsRegressor()
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'KerasRegressor':
            # There is no canonical default architecture for a Keras model, so use the
            # baseline regression model from: https://www.tensorflow.org/tutorials/keras/regression#full_model

            model = keras.Sequential()
            model.add(keras.layers.InputLayer(input_shape=len(self.x_train.keys())))
            model.add(keras.layers.Dense(64, activation='relu'))
            model.add(keras.layers.Dense(64, activation='relu'))
            model.add(keras.layers.Dense(1))

            model.compile(loss='mse', optimizer=keras.optimizers.Adam(0.001))

            model.fit(self.x_train, self.y_train, epochs=100, validation_data=(self.x_val, self.y_val), verbose=0)

            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'XGBoostRegressor':
            model = XGBRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        else:
            raise Exception('Unknown ML-algorithm!')

        # Add remaining ML-algorithms here

        baseline_loss = self.metric(self.y_val, y_pred)

        return baseline_loss
Example #9
    def create_new(cls, algorithm, par):
        """
        Create a new untrained model.

        Parameters
        ----------
        algorithm: str (rf/pca_rf/ann/cnn)
            Which algorithm will be used.
        par: dict
            For RF:
            - n_estimators: suggest using 1000
            - max_features: suggest using 25
            For ANN:
            - neurons: suggest using [200]
            - l2_lambda: suggest using 0
            For CNN:
            - conv_nodes: suggest using [32, 128]
            - dense_nodes: suggest using [512, 512]
            - dropout_p: suggest using [0.1, 0.5]
        """
        seed = 369
        if (algorithm == 'rf') or (algorithm == 'pca_rf'):
            model = RandomForestRegressor(n_estimators=par['n_estimators'],
                                          max_features=par['max_features'],
                                          min_samples_leaf=1,
                                          random_state=seed,
                                          n_jobs=-1)
        elif algorithm == 'ann':
            model = models.Sequential()
            model.add(
                layers.Dense(par['neurons'][0],
                             activation='relu',
                             input_shape=(441, ),
                             kernel_regularizer=regularizers.l2(
                                 par['l2_lambda'])))
            if len(par['neurons']) > 1:
                # add more hidden layers if `len(neurons) > 1`
                for n in par['neurons'][1:]:
                    model.add(
                        layers.Dense(
                            n,
                            activation='relu',
                            kernel_regularizer=regularizers.l2(par['l2_lambda'])))

            # add output layer
            model.add(layers.Dense(2, activation='softmax'))

            # define optimizer = RMS
            # low learning rate avoids over shoot of correction
            optimizer = optimizers.RMSprop(learning_rate=1e-4)

            # compile model, using accuracy to fit training data
            model.compile(optimizer=optimizer,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
        elif algorithm == 'cnn':
            model = models.Sequential()

            model.add(
                Conv2D(par['conv_nodes'][0], (3, 3),
                       activation='relu',
                       input_shape=(21, 21, 1)))
            model.add(MaxPooling2D((2, 2)))
            if len(par['dropout_p']) != 0:
                model.add(Dropout(par['dropout_p'][0]))

            if len(par['conv_nodes']) > 1:
                for i in par['conv_nodes'][1:]:
                    model.add(Conv2D(i, (3, 3), activation='relu'))
                    model.add(MaxPooling2D((2, 2)))
                    if len(par['dropout_p']) != 0:
                        model.add(Dropout(par['dropout_p'][0]))
            model.add(Flatten())

            for n in par['dense_nodes']:
                model.add(Dense(n))
                model.add(Activation("relu"))
                if len(par['dropout_p']) != 0:
                    model.add(Dropout(par['dropout_p'][1]))

            model.add(Dense(2))
            model.add(Activation('softmax'))

            lr = 0.1
            decay = lr / 50
            optimizer = optimizers.SGD(learning_rate=lr, decay=decay, nesterov=True)

            model.compile(optimizer=optimizer,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])

        return cls(model, algorithm)
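Using the values suggested in the docstring, a call might look like this; the wrapper class name ModelWrapper is hypothetical, since create_new is a classmethod returning cls(model, algorithm):

rf_wrapper = ModelWrapper.create_new('rf', {'n_estimators': 1000, 'max_features': 25})
ann_wrapper = ModelWrapper.create_new('ann', {'neurons': [200], 'l2_lambda': 0})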
Example #10
def main():

    # split the data
    dummy_list, x_val, y_val, id_val, month_val, Train, Predict, model_name, month_name = read_data_info(
        read_data_file, read_col_info_file, read_model_info_file)

    # optionally work with a small sample (can be skipped)
    Train, Predict = small_sample(train_num, test_num, Train, Predict)

    # make dataset
    X_Train_df, y_Train_df, X_Predict_df, y_Predict_df = make_model_df(
        dummy_list, x_val, y_val, id_val, month_val, Train, Predict,
        month_name)

    # differencing formula
    X_Train_df, y_Train_df, X_Predict_df, y_Predict_df = get_diff_df(
        X_Train_df, X_Predict_df, y_Train_df, y_Predict_df)

    # -------------- model selection -------------------
    if model_name == 'logit':  # logistic regression

        # The current dataset is not in a form suited to logit, so it is modified arbitrarily here for checking.
        y_Train_df.loc[y_Train_df[y_val] > np.mean(y_Train_df[y_val]),
                       y_val] = 1
        y_Train_df.loc[y_Train_df[y_val] > 1, y_val] = 0

        model = sm.Logit(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'MNlogit':  # multinomial logistic regression
        model = sm.MNLogit(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'OLS':  # linear regression
        model = sm.OLS(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'Random_fore':  # random forest
        model = RandomForestRegressor(max_depth=2, random_state=0).fit(
            X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    # Automated regression modeling with auto_reg is the problem we currently need.
    # For the auto models, reg (for prediction) and classi (for classification) must be specified explicitly.
    elif model_name == 'Auto_classi':
        model = GoClassify(n_best=1).train(X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    elif model_name == 'Auto_reg':
        model = GoRegress(n_best=1).train(X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    # neural network (deep learning)
    elif model_name == 'Neural_net':

        # Another way of scaling; if applied, the transform must be inverted later. See the LSTM code.
        #sc = StandardScaler()
        #X_Train_df = sc.fit_transform(X_Train_df)
        #y_Train_df = sc.fit_transform(y_Train_df)
        #X_Predict_df = sc.fit_transform(X_Predict_df)
        #y_Predict_df = sc.fit_transform(y_Predict_df)

        # Initialising the ANN
        model = Sequential()

        # Adding the input layer and the first hidden layer
        model.add(
            Dense(10,
                  activation='relu',
                  kernel_initializer='normal',
                  input_dim=X_Train_df.shape[1]))

        # Adding the second hidden layer
        model.add(Dense(units=8, activation='relu'))
        # model.add(Dropout(0.5))

        # Adding the third hidden layer
        # model.add(Dense(units=4, activation='relu'))   # add another layer
        # model.add(Dropout(0.5))

        # Adding the output layer
        model.add(Dense(units=1, activation='relu'))
        model.compile(optimizer='rmsprop',
                      loss='mean_squared_error',
                      metrics=['mae'])  # accuracy is not meaningful for regression
        model.fit(X_Train_df, y_Train_df, batch_size=10, epochs=150,
                  verbose=0)  # no callbacks; if needed, see the LSTM code

        get_neural_results(model, X_Predict_df)

    else:
        print('Please select a valid data model')
            profit += x_test.iloc[i]['odd_home']
        profit -= 1
        
    if 0. < x_test.iloc[i]['prediction'] < 0.3 and x_test.iloc[i]['odd_away'] > 1.4:
        if x_test.iloc[i]['result'] == 0:
            profit += x_test.iloc[i]['odd_away']
        profit -= 1

print('Profit_rf: ', profit)



predict_columns = [
    'elo', 'elo_recent', 'elo_surf', 'prob_g', 'prob_g_rec', 'lose12',
    'p_gamma', 'p_gamma_rec', 'p_gamma_surf', 'p_gamma_time', 'set_score',
    'match_score', 'p_gamma_rec_p5', 'p_gamma_rec_m5', 'd_dif', 'freq_home',
    'freq_away', 'fatigue_home', 'fatigue_away', 'win_perc', 'set_perc',
    'game_perc', '1st_lose_win', '1st_win_lose', 'p_gamma_simple',
    'p_gamma_simplest', 'p_gamma_simple_surf', 'p_gamma_simplest_surf',
    'age_dif'
]
model = Sequential()

model.add(Dense(32, input_dim=29, activation='relu'))
model.add(Dense(32, activation='relu'))

model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

model.fit(x_train[predict_columns].values, y_train.values, epochs=400, batch_size=10)

scores = model.evaluate(x_train[predict_columns], y_train)

y_pred_keras = model.predict(x_test[predict_columns])  # predict_proba was removed from Keras models; with a sigmoid output, predict() already returns probabilities

print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

x_test['prediction_keras'] = y_pred_keras
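The random-forest snippet above turns its predictions into a betting profit; the same bookkeeping could be applied to the Keras probabilities. A sketch under assumed thresholds (the home-side cutoff of 0.7 mirrors the visible away-side rule and is an assumption, as is result == 1 meaning a home win):

profit_keras = 0.0
for i in range(len(x_test)):
    p = x_test.iloc[i]['prediction_keras']
    if p > 0.7 and x_test.iloc[i]['odd_home'] > 1.4:       # assumed home threshold
        if x_test.iloc[i]['result'] == 1:                  # assumed: 1 = home win
            profit_keras += x_test.iloc[i]['odd_home']
        profit_keras -= 1
    if 0. < p < 0.3 and x_test.iloc[i]['odd_away'] > 1.4:
        if x_test.iloc[i]['result'] == 0:
            profit_keras += x_test.iloc[i]['odd_away']
        profit_keras -= 1
print('Profit_keras: ', profit_keras)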