Example #1
# Fully connected (Dense)
from keras import models
from keras import layers

model = models.Sequential()  # build the model layer by layer
model.add(layers.Dense(16, activation='relu', input_shape=(64, )))  # 16 output units
model.add(layers.Dense(16, activation='relu'))  # 16 output units
model.add(layers.Dense(1, activation='linear'))  # 1 output; linear activation is the identity, y = wx + b

# In[8]:

model.compile(
    optimizer='rmsprop',
    loss='mean_squared_error',  # standard loss for regression problems
    metrics=['mse'])  # regression metrics: MSE, MAE, MAPE, cosine; accuracy does not apply here

# In[9]:

history = model.fit(X_train,
                    y_train_scaler,
                    epochs=250,
                    batch_size=1000,
                    validation_data=(X_val, y_val_scaler))

# In[10]:

import matplotlib.pyplot as plt
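A plausible reconstruction of what cell In[10] did with the dangling matplotlib import, assuming the `history` object returned by fit() above: plot the training and validation loss curves.

loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(1, len(loss) + 1)
plt.plot(epochs_range, loss, label='training loss')
plt.plot(epochs_range, val_loss, label='validation loss')
plt.xlabel('epoch')
plt.ylabel('MSE')
plt.legend()
plt.show()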
Example #2
    def train(self):
        start = timeit.default_timer()
        train_x, train_y, feature_list = self.feature_extraction()
        self._feature_size = [train_x.shape[1], 1]
        self._features = feature_list

        if not self._silent:
            print "Train has %d instances " % (len(train_x))

        counts = Counter(train_y)
        expectation_ratio = 1 / float(len(counts.keys()))
        n_samples = len(train_y)
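        # Worked example (hypothetical numbers): with two classes the expected
        # share is 0.5; a class holding only 5% of the samples gives
        # 0.5 / 0.05 = 10 > 6, so the data are flagged as imbalanced.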
        for key, value in counts.items():
            tmp = expectation_ratio / (float(value) / float(n_samples))
            if tmp > 6 or tmp < (1.0 / 6.0):
                self._data_balance = True

        extra_fit_args = dict()
        if self._weight_col is not None:
            extra_fit_args['sample_weight'] = train_x[self._weight_col].values
            del train_x[self._weight_col]

        if 0 < self._bootstrap < 1.0:
            if self._bootstrap_seed is not None:
                if not self._silent:
                    print "Setting bootstrap seed to %d" % self._bootstrap_seed
                np.random.seed(self._bootstrap_seed)
                random.seed(self._bootstrap_seed)
            bootstrap_len = int(math.floor(self._bootstrap * len(train_x)))
            bootstrap_ix = random.sample(range(len(train_x)), bootstrap_len)
            train_x = train_x.iloc[bootstrap_ix]
            train_x = train_x.reset_index(drop=True)  # reset_index returns a copy; it must be reassigned
            train_y = train_y.iloc[bootstrap_ix]
            train_y = train_y.reset_index(drop=True)

        model = None

        if self._model_type == "RandomForestRegressor":
            if model is None:
                # class_weight applies only to classifiers;
                # RandomForestRegressor rejects it, so it is not set here
                model = RandomForestRegressor(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
                # Python 3 removed tuple-parameter lambdas, so the dispatchers
                # take two plain arguments throughout
                self._predict = lambda fitted_model, pred_x: self.continuous_predict(x=pred_x)
                self._have_feat_importance = True

        elif self._model_type == "RandomForestClassifier":
            if model is None:
                # if self._data_balance is True:
                #     self._fit_args.update({"class_weight": "balanced"})
                model = RandomForestClassifier(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
                self._predict = lambda fitted_model, pred_x: self.pred_proba(x=pred_x)
            self._staged_predict = lambda fitted_model, pred_x: [self._predict(fitted_model, pred_x)]
            self._have_feat_importance = True

        elif self._model_type == "ExtraTreesRegressor":
            if model is None:
                # class_weight applies only to classifiers;
                # ExtraTreesRegressor rejects it, so it is not set here
                model = ExtraTreesRegressor(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
                self._predict = lambda fitted_model, pred_x: self.continuous_predict(x=pred_x)
                self._have_feat_importance = True

        elif self._model_type == "ExtraTreesClassifier":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = ExtraTreesClassifier(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
            self._predict = lambda fitted_model, pred_x: self.pred_proba(x=pred_x)
            self._staged_predict = lambda fitted_model, pred_x: [self._predict(fitted_model, pred_x)]
            self._have_feat_importance = True

        elif self._model_type == "GradientBoostingRegressor":
            if model is None:
                model = GradientBoostingRegressor(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
            elif self._load_type == "fit_more":
                model.warm_start = True
                model.n_estimators += self._fit_args['n_estimators']
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda fitted_model, pred_x: self.continuous_predict(x=pred_x)
            self._staged_predict = lambda fitted_model, pred_x: self.staged_pred_continuous(x=pred_x)
            if self._load_type == "pred_at" and self._fit_args[
                    'n_estimators'] < model.n_estimators:
                if not self._silent:
                    print("Predict using %d trees" %
                          self._fit_args['n_estimators'])
                self._predict = lambda fitted_model, pred_x: self.staged_pred_continuous_at_n(
                    x=pred_x, n=self._fit_args['n_estimators'])
        elif self._model_type == "GradientBoostingClassifier":
            if model is None:
                model = GradientBoostingClassifier(**self._fit_args)
                model.fit(X=train_x, y=train_y, **extra_fit_args)
                self._model = model
            elif self._load_type == "fit_more":
                model.warm_start = True
                model.n_estimators += self._fit_args['n_estimators']
                model.fit(X=train_x, y=train_y)
                self._model = model
                self._staged_predict = lambda fitted_model, pred_x: self.staged_pred_proba(x=pred_x)
            self._predict = lambda fitted_model, pred_x: self.pred_proba(x=pred_x)
            if self._load_type == "pred_at" and self._fit_args[
                    'n_estimators'] < model.n_estimators:
                if not self._silent:
                    print("Predict using %d trees" %
                          self._fit_args['n_estimators'])
                self._predict = lambda fitted_model, pred_x: self.staged_pred_proba_at_n(
                    x=pred_x, n=self._fit_args['n_estimators'])
        elif self._model_type == "LogisticRegression":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = LogisticRegression(**self._fit_args)
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda fitted_model, pred_x: self.pred_proba(x=pred_x)
            self._staged_predict = lambda fitted_model, pred_x: [self._predict(fitted_model, pred_x)]

        elif self._model_type == "SVC":
            if model is None:
                if self._data_balance is True:
                    self._fit_args.update({"class_weight": "balanced"})
                model = sklearn.svm.SVC(**self._fit_args)
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda fitted_model, pred_x: self.pred_proba(x=pred_x)
            self._staged_predict = lambda fitted_model, pred_x: [self._predict(fitted_model, pred_x)]
        elif self._model_type == "CNN":
            if model is None:
                train_data = load_pd_df(self._input_dir + '/train.csv')
                indices, max_len = self.process_date_list(
                    train_data['Date'].map(
                        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')))
                self._feature_size = [train_x.shape[1], max_len]

                NB_FILTER = [64, 128]
                NB_Size = [4, 3, 3]
                FULLY_CONNECTED_UNIT = 256
                model = Sequential()
                model.add(
                    Conv2D(NB_FILTER[0], (train_x.shape[1], NB_Size[0]),
                           # per-sample shape (features, time, channels);
                           # the batch dimension must not be in input_shape
                           input_shape=(train_x.shape[1], max_len, 1),
                           padding='valid',  # Keras 2 name for border_mode
                           activation='relu'))
                model.add(MaxPooling2D(pool_size=(1, 3)))
                model.add(
                    Conv2D(NB_FILTER[1], (1, NB_Size[1]), padding='valid'))
                model.add(MaxPooling2D(pool_size=(1, 3)))
                model.add(Flatten())
                model.add(
                    Dense(FULLY_CONNECTED_UNIT,
                          activation='relu',
                          kernel_constraint=maxnorm(3),  # Keras 2 name for W_constraint
                          kernel_regularizer=regularizers.l2(0.01)))
                model.add(Dense(2, activation='softmax'))
                model.compile(loss='categorical_crossentropy',
                              optimizer=Adamax(),
                              metrics=['accuracy'])
                model.fit(train_x,
                          train_y,
                          batch_size=16,
                          epochs=50,
                          verbose=1)
                self._model = model  # store the fitted model, as the other branches do
        elif self._model_type == "LSTM":
            if model is None:
                train_data = load_pd_df(self._input_dir + '/train.csv')
                indices, max_len = self.process_date_list(
                    train_data['Date'].map(
                        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')))
                self._feature_size = [train_x.shape[1], max_len]

                class_weight = {
                    1: n_samples / float(len(counts) * counts[1]),
                    0: n_samples / float(len(counts) * counts[0])
                }
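                # e.g. 1000 samples split 900/100 over two classes give
                # weights 1000 / (2 * 100) = 5.0 for the rare class and
                # 1000 / (2 * 900) ≈ 0.56 for the common one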
                # class_weight = {1: 10,
                #                 0: 1}
                model = CNN_LSTM(
                    (self._feature_size[0], 4),
                    (None, self._feature_size[0], self._feature_size[1], 1))
                model.fit_generator(
                    generator=self.generator(train_x, train_y, indices,
                                             max_len),
                    epochs=20,
                    class_weight=class_weight,
                    steps_per_epoch=train_x.shape[0] // self._batch_size)  # integer division: Keras expects an int
                # model.fit_generator(generator=self.generator(train_x, train_y, indices, max_len),
                #                     epochs=1, class_weight=class_weight, steps_per_epoch=1)
                self._model = model

        elif self._model_type == "Pipeline":
            if model is None:
                model = Pipeline([
                    ('pre_process',
                     get_class(self._fit_args['pre_process']['name'])(
                         self._fit_args['pre_process']['args'])),
                    ('model', get_class(self._fit_args['model']['name'])(
                        self._fit_args['model']['args']))
                ])
                model.fit(X=train_x, y=train_y)
                self._model = model
            self._predict = lambda fitted_model, pred_x: self.pred_proba(x=pred_x)
            self._staged_predict = lambda fitted_model, pred_x: [self._predict(fitted_model, pred_x)]

        if not self._silent:
            stop = timeit.default_timer()
            print "Train time: %d s" % (stop - start)

        del train_x, train_y
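A minimal usage sketch (the constructor and class name are hypothetical; the class definition is not part of the snippet), showing how the stored dispatch lambdas are meant to be called after train():

trainer = Trainer(model_type="RandomForestClassifier")  # hypothetical constructor
trainer.train()
proba = trainer._predict(trainer._model, X_new)  # X_new: features to score; dispatches to pred_proba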
Example #3
def create_model_output(model_type, input_values_combined, output_values_range,
                        count, clf, no_epochs):
    if model_type == 'regression':
        ''' ML regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        [pred_train, clf] = ML_linear(xtrain, ytrain)
        pred_train = clf.predict(xtrain)
        pred_test = clf.predict(xtest[lstm_history:, :])
        pred_train = pred_train.ravel()
        pred_test = pred_test.ravel()
        ytest = ytest[lstm_history:]
    elif model_type == 'neural net':
        ''' NN regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        [pred_train, optimal_size, optimal_alpha,
         clf] = ML_Optimizer_NN(xtrain, ytrain, xtest, ytest, range_size,
                                range_alpha)
        pred_train = clf.predict(xtrain)
        pred_test = clf.predict(xtest[lstm_history:, :])
        pred_train = pred_train.ravel()
        pred_test = pred_test.ravel()
        ytest = ytest[lstm_history:]
    elif model_type == 'RF':
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        clf = RandomForestRegressor(max_depth=10,
                                    random_state=0,
                                    n_estimators=50)
        clf.fit(xtrain, ytrain)
        print(clf.feature_importances_)
        pred_train = clf.predict(xtrain)
        pred_test = clf.predict(xtest[lstm_history:, :])
        pred_train = pred_train.ravel()
        pred_test = pred_test.ravel()
        ytest = ytest[lstm_history:]
    elif model_type == 'lstm':
        ''' LSTM regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        xtrain = xtrain.reshape((xtrain.shape[0], 1, xtrain.shape[1]))
        xtest = xtest.reshape((xtest.shape[0], 1, xtest.shape[1]))
        if count == 0:
            clf = Sequential()
            clf.add(LSTM(200, input_shape=(xtrain.shape[1], xtrain.shape[2])))
            clf.add(Dense(1))
            clf.compile(loss='mae', optimizer='adam')
            history = clf.fit(xtrain,
                              ytrain,
                              epochs=10,
                              batch_size=5,
                              validation_data=(xtrain, ytrain),
                              verbose=1,
                              shuffle=True)
            # note: validation_data above is the training set itself, so this
            # 'val_loss' curve is not an independent test estimate
            pyplot.plot(history.history['loss'], label='train')
            pyplot.plot(history.history['val_loss'], label='validation')
            pyplot.legend()
            pyplot.show()
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest[lstm_history:, :])
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()
            ytest = ytest[lstm_history:]
            s = pickle.dumps(clf)  # NOTE: `s` must persist across calls (e.g. at module level) for the branch below
        else:
            clf = pickle.loads(s)  # relies on `s` surviving from the count == 0 call
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest[lstm_history:, :])
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()
            ytest = ytest[lstm_history:]
    elif model_type == 'lstm_3D':
        ''' LSTM regression Model '''
        [xtrain, xtest, ytrain,
         ytest] = splitPreProcess(input_values_combined, output_values_range,
                                  test_window, lstm_history)
        [xtrain_3D, xtest_3D, ytrain_3D,
         ytest_3D] = create3D(xtrain, xtest, ytrain, ytest, lstm_history)
        xtrain = xtrain_3D
        xtest = xtest_3D
        ytrain = ytrain_3D
        ytest = ytest_3D
        # (count == 0) or (count > 0) holds for every count >= 0, so the else
        # branch below is effectively dead and the model is retrained each call
        if (count == 0) or (count > 0):
            clf = Sequential()
            clf.add(LSTM(200, input_shape=(xtrain.shape[1], xtrain.shape[2])))
            clf.add(Dense(1))
            clf.compile(loss='mae', optimizer='adam')
            history = clf.fit(xtrain,
                              ytrain,
                              epochs=no_epochs,
                              batch_size=10,
                              validation_data=(xtrain, ytrain),
                              verbose=1,
                              shuffle=True)
            # as above, validation_data is the training set itself
            pyplot.plot(history.history['loss'], label='train')
            pyplot.plot(history.history['val_loss'], label='validation')
            pyplot.legend()
            pyplot.show()
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest)
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()
            s = pickle.dumps(clf)
        else:
            #clf = pickle.loads(s)
            pred_train = clf.predict(xtrain)
            pred_test = clf.predict(xtest)
            pred_train = pred_train.ravel()
            pred_test = pred_test.ravel()

    corr_coefficient_train = np.corrcoef(pred_train, ytrain)
    corr_coefficient_test = np.corrcoef(pred_test, ytest)
    final_performance.append([
        model_type,
        np.sum(abs(pred_test - ytest)), corr_coefficient_test[0, 1],
        np.sum(abs(pred_train - ytrain)), corr_coefficient_train[0, 1]
    ])
    print(final_performance)
    df_performance = pd.DataFrame(final_performance)
    df_performance.to_csv('df_performance.csv', index=False, header=True)
    return pred_train, pred_test, ytrain, ytest, clf
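A minimal driver sketch (data loading omitted; variable names taken from the signature, and the function appends to a module-level final_performance list), showing how the returned clf could be threaded through repeated calls:

clf = None
for count, mt in enumerate(['regression', 'RF', 'lstm_3D']):
    pred_train, pred_test, ytrain, ytest, clf = create_model_output(
        mt, input_values_combined, output_values_range, count, clf, no_epochs=10)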
Example #4
def train_data(X_train, X_test, y_train, y_test, prompt):
  
    if prompt == "RF":
        model = RandomForestRegressor(max_depth=20)
        regressor = "Random Forest"
    
    elif prompt == "KNN":
        model = KNeighborsRegressor(n_neighbors=5)
        regressor = "KNN"
        
    elif prompt == "DT":
        model = DecisionTreeRegressor(max_depth=10)
        regressor = "Decision Tree"
    
    elif prompt == "LR":
        model = LinearRegression()
        regressor = "Linear Regression"
        
    elif prompt == "LSVR":
        model = SVR(kernel='linear')
        regressor = "Linear SVR"
    
    elif prompt == "RBFSVR":
        model = SVR(kernel='rbf')
        regressor = "RBF Kernel SVR"
    
    elif prompt == "PSVR":
        model = SVR(kernel='poly')
        regressor = "Polynomial SVR"
    
    elif prompt == "ANN":
        regressor = "ANN"
        model = Sequential()
        model.add(Dense(input_dim=4, units=6, activation='tanh'))
        model.add(Dense(units=4, activation='tanh'))
        model.add(Dense(units=4, activation='tanh'))
        model.add(Dense(units=3, activation='relu'))
        model.compile(loss="mse", metrics=['mae'], optimizer='adam')

    else:
        raise ValueError("Please enter a valid regression model!")

    print(f"\nWORKING FOR {regressor.upper()} MODEL")

    # optimal_r2 = 0    
    coffs, intercepts = None, None
    
    if "SVR" in prompt:
        y_pred = []
        r2s = []
        coffs = []
        intercepts = []
        mapping = {0:"a", 1:"b", 2:"c"}
        
        for i in mapping:
            model.fit(X_train, y_train[:, i])
            y_pred.append(model.predict(X_test))
            r2s.append(round(r2_score(y_test[:, i], model.predict(X_test)), 3))
            if prompt == "LSVR":
              coffs.append(np.round(model.coef_, 3))
            else:
              coffs.append(np.round(model.dual_coef_, 3))
            intercepts.append(np.round(model.intercept_[0], 3))
            
        y_pred = np.array(y_pred).T
        result = round(np.array(r2s).mean(), 3)
        print(f"R2 Score: {result}")
            
    else:                
        if prompt == "ANN":
            model.fit(X_train, y_train, validation_split=0.1, epochs=100, verbose=0)
        else:
            model.fit(X_train, y_train)
            
        y_pred = model.predict(X_test)
        result = round(r2_score(y_test, y_pred), 3)
        print(f"R2 Score: {result}")
        
    return y_pred, regressor, model, result, coffs, intercepts
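A minimal usage sketch, assuming arrays shaped as the function expects (the SVR loop assumes three target columns, mapped to "a", "b", "c"):

y_pred, regressor, model, result, coffs, intercepts = train_data(
    X_train, X_test, y_train, y_test, prompt="RF")
print(regressor, result)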
Example #5
    def get_baseline(self, cv_mode=True, test_mode=False):
        """
        Computes a loss baseline for the ML-algorithm based on its default hyperparameter configuration
        (either cross validation loss or test loss after full training)
        :param cv_mode: bool
            Flag indicating whether to perform cross validation or simple validation
        :param test_mode: bool
            Flag indicating whether to compute the loss on the test set
        :return:
        baseline: float
             Loss of the baseline HP-configuration.
        """
        if self.is_time_series:
            # Use TimeSeriesSplit for time series data
            kf = TimeSeriesSplit(n_splits=5)
        else:
            # Create K-Folds cross validator for all other data types
            kf = KFold(n_splits=5, shuffle=self.shuffle)

        cv_baselines = []
        cv_iter = 0

        # Iterate over the cross validation splits
        for train_index, val_index in kf.split(X=self.x_train):
            cv_iter = cv_iter + 1

            # Cross validation
            if cv_mode and not test_mode:

                x_train_cv, x_val_cv = self.x_train.iloc[train_index], self.x_train.iloc[val_index]
                y_train_cv, y_val_cv = self.y_train.iloc[train_index], self.y_train.iloc[val_index]

            # Separate a validation set, but do not perform cross validation
            elif not cv_mode and not test_mode and cv_iter < 2:

                x_train_cv, x_val_cv, y_train_cv, y_val_cv = train_test_split(self.x_train, self.y_train, test_size=0.2,
                                                                              shuffle=self.shuffle, random_state=0)

            # Training on full training set and evaluation on test set
            elif not cv_mode and test_mode and cv_iter < 2:

                x_train_cv, x_val_cv = self.x_train, self.x_test
                y_train_cv, y_val_cv = self.y_train, self.y_test

            elif cv_mode and test_mode:

                raise Exception('Cross validation is not implemented for test mode.')

            # Further folds are unnecessary when not cross validating
            else:
                continue
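            # Exactly one of three splits is in place past this point: a CV
            # fold (cv_mode), a single 80/20 validation split, or the full
            # train/test split (test_mode)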

            if self.ml_algorithm == 'RandomForestRegressor':
                model = RandomForestRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'RandomForestClassifier':
                model = RandomForestClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'SVR':
                model = SVR(cache_size=500)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'SVC':
                model = SVC(random_state=0, cache_size=500)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'AdaBoostRegressor':
                model = AdaBoostRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'AdaBoostClassifier':
                model = AdaBoostClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'DecisionTreeRegressor':
                model = DecisionTreeRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'DecisionTreeClassifier':
                model = DecisionTreeClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'LinearRegression':
                model = LinearRegression()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'KNNRegressor':
                model = KNeighborsRegressor()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'KNNClassifier':
                model = KNeighborsClassifier()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'LogisticRegression':
                model = LogisticRegression()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'NaiveBayes':
                model = GaussianNB()
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'MLPRegressor':
                model = MLPRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'MLPClassifier':
                model = MLPClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'ElasticNet':
                model = ElasticNet(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'KerasRegressor' or self.ml_algorithm == 'KerasClassifier':

                # Use the warmstart configuration to create a baseline for Keras models

                epochs = 100

                # Initialize the neural network
                model = keras.Sequential()

                # Add input layer
                model.add(keras.layers.InputLayer(input_shape=(len(x_train_cv.keys()),)))

                # Add first hidden layer
                if warmstart_keras['hidden_layer1_size'] > 0:
                    model.add(
                        keras.layers.Dense(warmstart_keras['hidden_layer1_size'],
                                           activation=warmstart_keras['hidden_layer1_activation']))
                    model.add(keras.layers.Dropout(warmstart_keras['dropout1']))

                # Add second hidden layer
                if warmstart_keras['hidden_layer2_size'] > 0:
                    model.add(
                        keras.layers.Dense(warmstart_keras['hidden_layer2_size'],
                                           activation=warmstart_keras['hidden_layer2_activation']))
                    model.add(keras.layers.Dropout(warmstart_keras['dropout2']))

                # Add output layer
                if self.ml_algorithm == 'KerasRegressor':

                    model.add(keras.layers.Dense(1, activation='linear'))

                    # Select optimizer and compile the model
                    adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])
                    model.compile(optimizer=adam, loss='mse', metrics=['mse'])

                elif self.ml_algorithm == 'KerasClassifier':

                    # Determine the number of different classes depending on the data format
                    if isinstance(y_train_cv, pd.Series):
                        num_classes = int(max(y_train_cv) - min(y_train_cv) + 1)

                    elif isinstance(y_train_cv, pd.DataFrame):
                        num_classes = len(y_train_cv.keys())

                    else:
                        raise Exception('Unknown data format!')

                    # Binary classification
                    if num_classes <= 2:

                        # 'Sigmoid is equivalent to a 2-element Softmax, where the second element is assumed to be zero'
                        # https://keras.io/api/layers/activations/#sigmoid-function
                        model.add(keras.layers.Dense(1, activation='sigmoid'))

                        adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])
                        model.compile(optimizer=adam, loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

                    # Multiclass classification
                    else:

                        # Use softmax activation for multiclass clf. -> 'Softmax converts a real vector to a vector of
                        # categorical probabilities.[...]the result could be interpreted as a probability distribution.'
                        # https://keras.io/api/layers/activations/#softmax-function
                        model.add(keras.layers.Dense(num_classes, activation='softmax'))

                        adam = keras.optimizers.Adam(learning_rate=warmstart_keras['init_lr'])
                        model.compile(optimizer=adam, loss=keras.losses.CategoricalCrossentropy(),
                                      metrics=[keras.metrics.CategoricalAccuracy()])

                # Learning rate schedule
                if warmstart_keras["lr_schedule"] == "cosine":
                    schedule = functools.partial(cosine, initial_lr=warmstart_keras["init_lr"], T_max=epochs)

                elif warmstart_keras["lr_schedule"] == "exponential":
                    schedule = functools.partial(exponential, initial_lr=warmstart_keras["init_lr"], T_max=epochs)

                elif warmstart_keras["lr_schedule"] == "constant":
                    schedule = functools.partial(fix, initial_lr=warmstart_keras["init_lr"])

                else:
                    raise Exception('Unknown learning rate schedule!')

                # Determine the learning rate for this iteration and pass it as callback
                lr = keras.callbacks.LearningRateScheduler(schedule)

                # Early stopping callback
                early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                               min_delta=0,
                                                               patience=10,
                                                               verbose=1,
                                                               mode='auto',
                                                               restore_best_weights=True)

                callbacks_list = [lr, early_stopping]

                # Train the model
                model.fit(x_train_cv, y_train_cv, epochs=epochs, batch_size=warmstart_keras['batch_size'],
                          validation_data=(x_val_cv, y_val_cv), callbacks=callbacks_list,
                          verbose=0)

                # Make the prediction
                y_pred = model.predict(x_val_cv)

                # In case of binary classification round to the nearest integer
                if self.ml_algorithm == 'KerasClassifier':

                    # Binary classification
                    if num_classes <= 2:

                        y_pred = np.rint(y_pred)

                    # Multiclass classification
                    else:

                        # Identify the predicted class (maximum probability) in each row
                        for row_idx in range(y_pred.shape[0]):

                            # Predicted class
                            this_class = np.argmax(y_pred[row_idx, :])

                            # Iterate over columns / classes
                            for col_idx in range(y_pred.shape[1]):

                                if col_idx == this_class:
                                    y_pred[row_idx, col_idx] = 1
                                else:
                                    y_pred[row_idx, col_idx] = 0

                # KerasRegressor
                else:
                    y_pred = np.reshape(y_pred, newshape=(-1,))

            elif self.ml_algorithm == 'XGBoostRegressor':
                model = XGBRegressor(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'XGBoostClassifier':
                model = XGBClassifier(random_state=0)
                model.fit(x_train_cv, y_train_cv)
                y_pred = model.predict(x_val_cv)

            elif self.ml_algorithm == 'LGBMRegressor' or self.ml_algorithm == 'LGBMClassifier':
                # Create lgb datasets
                train_data = lgb.Dataset(x_train_cv, label=y_train_cv)
                valid_data = lgb.Dataset(x_val_cv, label=y_val_cv)

                # Specify the ML task and the random seed
                if self.ml_algorithm == 'LGBMRegressor':
                    # Regression task
                    params = {'objective': 'regression',
                              'seed': 0}

                elif self.ml_algorithm == 'LGBMClassifier':

                    # Determine the number of classes
                    num_classes = int(max(y_train_cv) - min(y_train_cv) + 1)

                    # Binary classification task
                    if num_classes <= 2:
                        params = {'objective': 'binary',
                                  'seed': 0}

                    # Multiclass classification task
                    else:
                        params = {'objective': 'multiclass',  # uses Softmax objective function
                                  'num_class': num_classes,
                                  'seed': 0}

                lgb_clf = lgb.train(params=params, train_set=train_data, valid_sets=[valid_data], verbose_eval=False)

                # Make the prediction
                y_pred = lgb_clf.predict(data=x_val_cv)

                # Classification task
                if self.ml_algorithm == 'LGBMClassifier':

                    # Binary classification: round to the nearest integer
                    if num_classes <= 2:

                        y_pred = np.rint(y_pred)

                    # Multiclass classification: identify the predicted class based on the one-hot-encoded probabilities
                    else:

                        y_one_hot_proba = np.copy(y_pred)
                        n_rows = y_one_hot_proba.shape[0]

                        y_pred = np.zeros(shape=(n_rows, 1))

                        # Identify the predicted class for each row (highest probability)
                        for row in range(n_rows):
                            y_pred[row, 0] = np.argmax(y_one_hot_proba[row, :])

            else:
                raise Exception('Unknown ML-algorithm!')

            # Add remaining ML-algorithms here

            cv_baselines.append(self.metric(y_val_cv, y_pred))

        if cv_mode:

            # Compute the average cross validation loss
            baseline = np.mean(cv_baselines)

        else:
            baseline = cv_baselines[0]

        return baseline
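A minimal usage sketch (assuming `trial` is an instance of the surrounding class with x_train, y_train, x_test, y_test and metric already set):

cv_loss = trial.get_baseline(cv_mode=True)                     # mean 5-fold CV loss
test_loss = trial.get_baseline(cv_mode=False, test_mode=True)  # test loss after full training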
Example #6
def run_feature_selector_algo(args, S, X_train, X_test, T_train, T_test, i,
                              model_fpsr, model_fnsr, model_msfe, model_mspe,
                              model_card, model_nme_train, model_nme_test):
    log_params = False
    file_path_prefix = "./parameters/"
    feature_percentage = args.feature_percentage

    start_time = time.time()
    if args.algo == "RF":
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"

        model = RandomForestRegressor(n_estimators=100)
        model = create_model(args, file_path, model, X_train, T_train)
        importance_vals = model.feature_importances_

        # Keep features with more than 1% importance, following this paper:
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6660200/
        S_hat = np.argwhere(importance_vals > 0.01).flatten()

        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "_percent_features-" + str(
                i) + ".joblib"
            n_sub_feat_size = int(X_train.shape[1] * int(feature_percentage) /
                                  100)
            # For RF use this because of the already trained saved model in Sandipan's laptop
            # n_sub_feat_size = 315
            S_hat = np.argsort(
                importance_vals)[::-1][:n_sub_feat_size].flatten(
                )  #40% features
            model = RandomForestRegressor(n_estimators=100)
            model = create_model(args, file_path, model, X_train[:, S_hat],
                                 T_train)
            X_train = X_train[:, S_hat]
            X_test = X_test[:, S_hat]
        log_params = True

    elif args.algo == "DEEPLIFT":
        # Implemented using DeepExplain in SHAP: https://github.com/slundberg/shap
        #-------------------------------------------------------------------------#
        x_train = X_train
        x_test = X_test

        X_train = X_train.reshape(X_train.shape[0], 28, 28)
        X_test = X_test.reshape(X_test.shape[0], 28, 28)
        # Make sure images have shape (28, 28, 1)
        X_train = np.expand_dims(X_train, -1)
        X_test = np.expand_dims(X_test, -1)
        print("X_train shape:", X_train.shape)
        print(X_train.shape[0], "train samples")
        print(X_test.shape[0], "test samples")

        # Model / data parameters
        num_classes = 10
        input_shape = (28, 28, 1)
        """
        ## Build the model
        """

        model = CNNModel(num_classes, input_shape).create_cnn_model()
        model.summary()

        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".h5"
        """
        ## Train the model
        """

        batch_size = 128
        epochs = 15

        model.compile(loss="categorical_crossentropy",
                      optimizer="adam",
                      metrics=["accuracy"])
        model = create_model(args, file_path, model, X_train, T_train)

        # Sanity checks
        score_train = model.evaluate(X_train, T_train, verbose=0)
        score_test = model.evaluate(X_test, T_test, verbose=0)
        print("Test loss:", score_test[0])
        print("Test accuracy:", score_test[1])

        background = X_train[np.random.choice(X_train.shape[0],
                                              100,
                                              replace=False)]
        # explain predictions of the model on 10 images
        e = shap.DeepExplainer(model, background)

        x_test_sample = X_test[np.random.choice(
            X_test.shape[0], int(args.deeplift_sample_size), replace=False), :]

        shap_values = e.shap_values(x_test_sample)

        total_val = np.sum(np.sum(np.abs(shap_values), axis=0),
                           axis=0).flatten()
        S_hat = total_val.argsort()[::-1]

        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            X_train = x_train[:, S_hat]
            X_test = x_test[:, S_hat]
            X_train = X_train.reshape(X_train.shape[0], 28, 28)
            X_test = X_test.reshape(X_test.shape[0], 28, 28)
            # Make sure images have shape (28, 28, 1)
            X_train = np.expand_dims(X_train, -1)
            X_test = np.expand_dims(X_test, -1)
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "percent_features-" + str(
                i) + ".h5"
            # use the flat copy x_train here: X_train was just reshaped to
            # (n, 28, 28, 1), so X_train.shape[1] would be 28, not 784
            n_sub_feat_size = int(x_train.shape[1] * int(feature_percentage) /
                                  100)
            S_hat = total_val.argsort()[::-1][:n_sub_feat_size]  # top n_sub_feat_size features
            model_new = CNNModel(num_classes, input_shape).create_cnn_model()
            model_new.compile(loss="categorical_crossentropy",
                              optimizer="adam",
                              metrics=["accuracy"])
            model = create_model(args, file_path, model_new, X_train, T_train)

        # Just to compare what global features SHAP with DeepLift choose
        # X_train_ori =  loadmat("./mat_files/MNIST.mat")["train_x"].astype(np.float32)
        # show_image([X_train_ori[:,1],X_train_ori[:,20],X_train_ori[:,30]],S_hat[0:len(S)], (args.algo+str(i)))

        # show_image(x_train[1,:].flatten(),x_train[20,:].flatten(),x_train[30,:].flatten(),S_hat, (args.algo+str(i)))

        log_params = True

    elif args.algo == "BART":
        # Implemented using XBART: https://github.com/JingyuHe/XBART
        #----------------------------------------------------------#
        x_train = X_train
        x_test = X_test

        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)

        # Ugly hack otherwise xbart fit does not work
        T_train = T_train.flatten()
        T_test = T_test.flatten()

        file_path = file_path_prefix + args.data + "/" + args.algo + str(
            args.tree_size) + "-" + str(i) + ".joblib"
        # model = XBART(num_trees = int(args.tree_size), num_sweeps = 20, burnin = 15, verbose = True, parallel = True)
        model = XBART(num_trees=int(args.tree_size),
                      num_sweeps=20,
                      burnin=15,
                      verbose=True,
                      parallel=True)
        model = create_model(args, file_path, model, X_train, T_train)

        # features sorted by importance, most important first
        sorted_feats = sorted(model.importance, key=model.importance.get)[::-1]
        # keep features whose importance value exceeds 0.01
        S_hat = np.array([f for f in sorted_feats if model.importance[f] > 0.01])

        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "_percent_features-" + str(
                i) + ".joblib"
            n_sub_feat_size = int(X_train.shape[1] * int(feature_percentage) /
                                  100)
            S_hat = sorted(
                model.importance,
                key=model.importance.get)[::-1][:n_sub_feat_size]  # top n_sub_feat_size features
            model = XBART(num_trees=int(args.tree_size),
                          num_sweeps=20,
                          burnin=15,
                          verbose=True,
                          parallel=True)
            X_train = pd.DataFrame(x_train[:, S_hat])
            X_test = pd.DataFrame(x_test[:, S_hat])
            model = create_model(args, file_path, model, X_train, T_train)

        # Ugly hack otherwise xbart predict does not work
        T_train = T_train.reshape(X_train.shape[0], 1)
        T_test = T_test.reshape(X_test.shape[0], 1)

        log_params = True

    elif args.algo == "POINTNET":
        import torch
        from torch.utils.data import DataLoader
        import kaolin as kal
        from kaolin import ClassificationEngine
        from kaolin.datasets import ModelNet
        from kaolin.models.PointNet import PointNetClassifier as PointNet
        import kaolin.transforms as tfs

        modelnet_path = './mat_files/ModelNet10'
        categories = ['chair', 'sofa']
        num_points = 1024
        device = 'cuda'

        transform = tfs.Compose([
            tfs.TriangleMeshToPointCloud(num_samples=num_points),
            tfs.NormalizePointCloud()
        ])

        train_loader = DataLoader(ModelNet(modelnet_path,
                                           categories=categories,
                                           split='train',
                                           transform=transform,
                                           device=device),
                                  batch_size=12,
                                  shuffle=True)
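        # Note: this branch only builds the data loader; it trains no model and
        # selects no S_hat, so log_params stays False and the metrics below are skipped.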

    elif args.algo == "GAM":  # Note GAM doesn't work on MNIST properly
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"
        threshold = 0.01

        gam_fn_form = s(0, n_splines=5)
        for feature in range(1, X_train.shape[1]):
            gam_fn_form += s(feature, n_splines=5)
        # Regression in GAM
        # https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html#Regression
        model = GAM(gam_fn_form,
                    distribution='normal',
                    link='identity',
                    max_iter=10,
                    tol=0.001)
        model = create_model(args, file_path, model, X_train, T_train)

        feature_vals = np.array(model.statistics_['p_values'])
        # indices of the features whose p-value exceeds the threshold
        S_hat = np.argwhere(feature_vals > threshold).flatten()

        #S_hat = np.argsort(model.statistics_['p_values'])

        log_params = True

    elif args.algo == "LASSO":
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"

        threshold = 0.01
        #T_train = np.argmax(T_train, axis=1)
        #T_test = np.argmax(T_test, axis=1)

        model = linear_model.Lasso(alpha=0.01, max_iter=5000)
        model = create_model(args, file_path, model, X_train, T_train)

        # indices of the coefficients above the threshold
        S_hat = np.argwhere(model.coef_ > threshold).flatten()
        if args.data == "MNIST" or args.data == "CIFAR-10":  # Take 40% features of MNIST only
            file_path = file_path_prefix + args.data + "/" + args.algo + "-" + feature_percentage + "_percent_features-" + str(
                i) + ".joblib"
            n_sub_feat_size = int(X_train.shape[1] * int(feature_percentage) /
                                  100)
            S_hat = np.argsort(
                model.coef_)[::-1][:n_sub_feat_size].flatten()  #40% features
            model = linear_model.Lasso(alpha=0.01, max_iter=5000)
            model = create_model(args, file_path, model, X_train[:, S_hat],
                                 T_train)
            X_train = X_train[:, S_hat]
            X_test = X_test[:, S_hat]

        # Ugly hack otherwise vector norm not calculated
        #T_train = T_train.reshape(X_train.shape[0], 1)
        #T_test = T_test.reshape(X_test.shape[0], 1)

        log_params = True

    elif args.algo == "E-NET":
        file_path = file_path_prefix + args.data + "/" + args.algo + "-" + str(
            i) + ".joblib"

        T_train = np.argmax(T_train, axis=1)
        T_test = np.argmax(T_test, axis=1)

        model = ElasticNet(alpha=0.01, l1_ratio=0.7)
        model = create_model(args, file_path, model, X_train, T_train)

        S_hat = np.argsort(model.coef_)

        log_params = False

    elif args.algo == "CORR":
        threshold = 0.01
        importance_vals = abs(np.dot(X_train.T, T_train).T)[::-1]
        # indices of the correlation scores above the threshold
        S_hat = np.argwhere(importance_vals > threshold).flatten()
        model_fpsr[0, i] = FPSR(S, S_hat)
        model_fnsr[0, i] = FNSR(S, S_hat)

        log_params = False
    elif args.algo == "SPINN":
        # https://github.com/jjfeng/spinn
        log_params = False
        print("Not yet implemented!")

    else:
        print("Sorry! No such evaluation exists.")

    if log_params:
        # Mean squared errors
        model_msfe[0, i] = compute_mse_compare(
            model.predict(X_train).reshape(T_train.shape), T_train)
        model_mspe[0, i] = compute_mse_compare(
            model.predict(X_test).reshape(T_test.shape), T_test)
        # Selection rate errors
        model_fpsr[0, i] = FPSR(S, S_hat)
        model_fnsr[0, i] = FNSR(S, S_hat)
        # Cardinality of the model
        model_card[0, i] = len(S_hat)
        # Normalized Error (NME)
        model_nme_train[0, i] = compute_nme(
            model.predict(X_train).reshape(T_train.shape), T_train)
        model_nme_test[0, i] = compute_nme(
            model.predict(X_test).reshape(T_test.shape), T_test)

        if args.algo == "BART":
            val = model.predict(X_train)
            normalized = (val - min(val)) / (max(val) - min(val))
            accuracy = np.sum([
                abs(0.9 * normalized - T_train.flatten()) < 0.2
            ]) / len(T_train.flatten())
            print("**********The train accuracy is: ", accuracy)
        else:
            print(
                "**********The train accuracy is: ",
                calculate_accuracy(
                    model.predict(X_train).reshape(T_train.shape).T,
                    T_train.T))

        if args.algo == "BART":
            val = model.predict(X_test)
            normalized = (val - min(val)) / (max(val) - min(val))
            accuracy = np.sum([abs(0.9 * normalized - T_test.flatten()) < 0.2
                               ]) / len(T_test.flatten())
            print("**********The test accuracy is: ", accuracy)
        else:
            print(
                "**********The test accuracy is: ",
                calculate_accuracy(
                    model.predict(X_test).reshape(T_test.shape).T, T_test.T))

    print("Time taken for this MC iteration: ", time.time() - start_time)
Example #7
    def get_baseline_loss(self):
        """
        Computes the loss for the default hyperparameter configuration of the ML-algorithm (baseline).
        :return:
        baseline_loss: float
            Validation loss of the baseline HP-configuration
        """
        if self.ml_algorithm == 'RandomForestRegressor':
            model = RandomForestRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'SVR':
            model = SVR()
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'AdaBoostRegressor':
            model = AdaBoostRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'DecisionTreeRegressor':
            model = DecisionTreeRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'LinearRegression':
            model = LinearRegression()
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'KNNRegressor':
            model = KNeighborsRegressor()
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'KerasRegressor':
            # >>> What are default parameters for a keras model?
            # Baseline regression model from: https://www.tensorflow.org/tutorials/keras/regression#full_model

            model = keras.Sequential()
            model.add(keras.layers.InputLayer(input_shape=(len(self.x_train.keys()),)))
            model.add(keras.layers.Dense(64, activation='relu'))
            model.add(keras.layers.Dense(64, activation='relu'))
            model.add(keras.layers.Dense(1))

            model.compile(loss='mse', optimizer=keras.optimizers.Adam(0.001))

            model.fit(self.x_train, self.y_train, epochs=100, validation_data=(self.x_val, self.y_val), verbose=0)

            y_pred = model.predict(self.x_val)

        elif self.ml_algorithm == 'XGBoostRegressor':
            model = XGBRegressor(random_state=0)
            model.fit(self.x_train, self.y_train)
            y_pred = model.predict(self.x_val)

        else:
            raise Exception('Unknown ML-algorithm!')

        # Add remaining ML-algorithms here

        baseline_loss = self.metric(self.y_val, y_pred)

        return baseline_loss
Example #8
    def create_new(cls, algorithm, par):
        """
                Create new untrained model.

                Parameters:
                ----------
                algorithm: str (rf/pca_rf/ann/cnn)
                        which algorithm will be used
                par: dict
                        For RF:
                        - n_estimators: suggest using 1000
                        - max_features: suggest using 25
                        For ANN:
                        - neurons: suggest using [200]
                        - l2_lambda: suggest using 0
                        For CNN:
                        - conv_nodes: suggest using [32, 128]
                        - dense_nodes: suggest using [512, 512]
                        - dropout_p: suggest using [0.1, 0.5]
                """
        seed = 369
        if (algorithm == 'rf') or (algorithm == 'pca_rf'):
            model = RandomForestRegressor(n_estimators=par['n_estimators'],
                                          max_features=par['max_features'],
                                          min_samples_leaf=1,
                                          random_state=seed,
                                          n_jobs=-1)
        elif algorithm == 'ann':
            model = models.Sequential()
            model.add(
                layers.Dense(par['neurons'][0],
                             activation='relu',
                             input_shape=(441, ),
                             kernel_regularizer=regularizers.l2(
                                 par['l2_lambda'])))
            if len(par['neurons']) > 1:
                # add more hidden layers if `len(neurons) > 1`
                for n in par['neurons'][1:]:
                    model.add(
                        layers.Dense(
                            n,
                            activation='relu',
                            kernel_regularizer=regularizers.l2(
                                par['l2_lambda'])))

            # add output layer
            model.add(layers.Dense(2, activation='softmax'))

            # define optimizer = RMS
            # low learning rate avoids over shoot of correction
            optimizer = optimizers.RMSprop(lr=1e-4)

            # compile model, using accuracy to fit training data
            model.compile(optimizer=optimizer,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
        elif algorithm == 'cnn':
            model = models.Sequential()

            model.add(
                Conv2D(par['conv_nodes'][0], (3, 3),
                       activation='relu',
                       input_shape=(21, 21, 1)))  # Keras 2 API: kernel as a tuple, channels-last by default
            model.add(MaxPooling2D((2, 2)))
            if len(par['dropout_p']) != 0:
                model.add(Dropout(par['dropout_p'][0]))

            if len(par['conv_nodes']) > 1:
                for i in par['conv_nodes'][1:]:
                    model.add(Conv2D(i, (3, 3), activation='relu'))
                    model.add(MaxPooling2D((2, 2)))
                    if len(par['dropout_p']) != 0:
                        model.add(Dropout(par['dropout_p'][0]))
            model.add(Flatten())

            for n in par['dense_nodes']:
                model.add(Dense(n))
                model.add(Activation("relu"))
                if len(par['dropout_p']) != 0:
                    model.add(Dropout(par['dropout_p'][1]))

            model.add(Dense(2))
            model.add(Activation('softmax'))

            lr = 0.1
            decay = lr / 50
            optimizer = optimizers.SGD(lr=lr, decay=decay, nesterov=True)

            model.compile(optimizer=optimizer,
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
        else:
            raise ValueError("unknown algorithm: %s" % algorithm)

        return cls(model, algorithm)
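A minimal usage sketch (the enclosing class is not shown in the snippet; `ModelFactory` is a placeholder for it):

par = {'n_estimators': 1000, 'max_features': 25}
rf_model = ModelFactory.create_new('rf', par)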
Example #9
def main():

    # Split the data
    dummy_list, x_val, y_val, id_val, month_val, Train, Predict, model_name, month_name = read_data_info(
        read_data_file, read_col_info_file, read_model_info_file)

    # Draw a small sample to speed things up (optional)
    Train, Predict = small_sample(train_num, test_num, Train, Predict)

    # make dataset
    X_Train_df, y_Train_df, X_Predict_df, y_Predict_df = make_model_df(
        dummy_list, x_val, y_val, id_val, month_val, Train, Predict,
        month_name)

    # Apply the differencing formula
    X_Train_df, y_Train_df, X_Predict_df, y_Predict_df = get_diff_df(
        X_Train_df, X_Predict_df, y_Train_df, y_Predict_df)

    # -------------- Model selection -------------------
    if model_name == 'logit':  # logistic regression

        # The current dataset is not in a form suited to logit, so the target
        # is binarized here ad hoc for a sanity check: 1 if above the mean, 0 otherwise
        y_Train_df[y_val] = (y_Train_df[y_val] >
                             y_Train_df[y_val].mean()).astype(int)

        model = sm.Logit(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'MNlogit':  # multinomial logistic regression
        model = sm.MNLogit(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'OLS':  # linear regression
        model = sm.OLS(y_Train_df, X_Train_df).fit()
        get_simple_results(model, X_Predict_df, y_val, Predict)

    elif model_name == 'Random_fore':  # random forest
        model = RandomForestRegressor(max_depth=2, random_state=0).fit(
            X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    # Automated regression modelling with Auto_reg is what we need here.
    # For the auto models you must explicitly choose reg (for prediction) or classifi (for classification).
    elif model_name == 'Auto_classi':
        model = GoClassify(n_best=1).train(X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    elif model_name == 'Auto_reg':
        model = GoRegress(n_best=1).train(X_Train_df, y_Train_df)
        get_model_results(model, X_Predict_df)

    # Neural network (deep learning)
    elif model_name == 'Neural_net':

        # Another way of scaling; if applied, the transform must be inverted afterwards. See the LSTM code.
        #sc = StandardScaler()
        #X_Train_df = sc.fit_transform(X_Train_df)
        #y_Train_df = sc.fit_transform(y_Train_df)
        #X_Predict_df = sc.fit_transform(X_Predict_df)
        #X_Predict_df = sc.fit_transform(y_Predict_df)

        # Initialising the ANN
        model = Sequential()

        # Adding the input layer and the first hidden layer
        model.add(
            Dense(10,
                  activation='relu',
                  kernel_initializer='normal',
                  input_dim=X_Train_df.shape[1]))

        # Adding the second hidden layer
        model.add(Dense(units=8, activation='relu'))
        # model.add(Dropout(0.5))

        # Adding the third hidden layer
        # model.add(Dense(units = 4, activation = 'relu'))   # add another layer
        # model.add(Dropout(0.5))

        # Adding the output layer
        model.add(Dense(units=1, activation='relu'))
        model.compile(optimizer='rmsprop',
                      loss='mean_squared_error',
                      metrics=['mae'])  # accuracy is meaningless for regression; track MAE instead
        model.fit(X_Train_df, y_Train_df, batch_size=10, epochs=150,
                  verbose=0)  # no callbacks; add them if needed (see the LSTM code)

        get_neural_results(model, X_Predict_df)

    else:
        print('Please select your data model')
Example #10
            profit += x_test.iloc[i]['odd_away']
        profit -= 1

print('Profit_rf: ', profit)



predict_columns = ['elo', 'elo_recent', 'elo_surf', 'prob_g', 'prob_g_rec', 'lose12', 'p_gamma', 'p_gamma_rec', 'p_gamma_surf', 'p_gamma_time', 'set_score', 'match_score', 'p_gamma_rec_p5', 'p_gamma_rec_m5', 'd_dif', 'freq_home', 'freq_away', 'fatigue_home', 'fatigue_away', 'win_perc', 'set_perc', 'game_perc', '1st_lose_win', '1st_win_lose', 'p_gamma_simple', 'p_gamma_simplest', 'p_gamma_simple_surf', 'p_gamma_simplest_surf', 'age_dif']
model = Sequential()

model.add(Dense(32, input_dim=29, activation='relu'))
model.add(Dense(32, activation='relu'))

model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

model.fit(x_train[predict_columns].values, y_train.values, epochs=400, batch_size=10)

scores = model.evaluate(x_train[predict_columns], y_train)

y_pred_keras = model.predict(x_test[predict_columns])  # with a sigmoid output, predict() already returns probabilities

print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

x_test['prediction_keras'] = y_pred_keras

x_test.to_csv('x_test.csv', sep = ';', decimal=",")

model_json = model.to_json()
with open("model.json", "w") as json_file: