Example #1
    def gridSearch(self, regressor, X_train, y_train):
        parameters = {
            'depth': sp_randInt(6, 10),          # tree depth sampled from 6..9
            'learning_rate': sp_randFloat(),     # uniform over [0, 1)
            'iterations': sp_randInt(600, 900)   # boosting iterations, 600..899
        }

        randm = RandomizedSearchCV(estimator=regressor,
                                   param_distributions=parameters,
                                   cv=3,
                                   n_iter=4,
                                   n_jobs=8)
        randm.fit(X_train, y_train)

        #Results from Random Search
        print("\n========================================================")
        print(" Results from Random Search ")
        print("========================================================")

        print("\n s:\n", randm.best_estimator_)

        print("\n The best score across ALL searched params:\n",
              randm.best_score_)

        print("\n The best parameters across ALL searched params:\n",
              randm.best_params_)

        #new catboost model using best parameters
        regressor = CatBoostRegressor(
            iterations=randm.best_params_['iterations'],
            learning_rate=randm.best_params_['learning_rate'],
            depth=randm.best_params_['depth'],
            od_type='IncToDec')
        return regressor, randm.best_params_
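
A minimal usage sketch for the gridSearch method above; the ModelTuner class name and the X_train / y_train arrays are assumptions, not from the original snippet:

from catboost import CatBoostRegressor

tuner = ModelTuner()                             # hypothetical class that owns gridSearch
base = CatBoostRegressor(od_type='IncToDec')     # same overfitting detector as above
tuned_regressor, best_params = tuner.gridSearch(base, X_train, y_train)
tuned_regressor.fit(X_train, y_train)            # refit the tuned model on the training split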
Example #2
# Preprocessing
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, test_size=0.2, random_state=42)
print(x_train.shape)    # (3628, 110336)
print(x_test.shape)     # (908, 110336)
print(y_train.shape)    # (3628,)
print(y_test.shape)     # (908,)

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model setup
model = HistGradientBoostingClassifier(verbose=1, random_state=42)
parameters = {"learning_rate": sp_randFloat(),
              "max_iter"    : [1000,1200,1500],
              "l2_regularization" : [1.5, 0.5, 0, 1],
              "max_depth"    : sp_randInt(4, 10)
            }
randm = RandomizedSearchCV(estimator=model, param_distributions = parameters,
                            cv = 2, n_iter = 10, n_jobs=-1)
randm.fit(x_train, y_train)

print(" Results from Random Search " )
print("The best estimator across ALL searched params:", randm.best_estimator_)
print("The best score across ALL searched params:", randm.best_score_)
print(" The best parameters across ALL searched params:", randm.best_params_)


end_now = datetime.datetime.now()
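
The dangling end_now presumably pairs with a start timestamp taken before the search; a minimal sketch of that timing pattern (start_now is an assumed name, not from the original):

import datetime

start_now = datetime.datetime.now()   # assumed: captured before randm.fit(...)
# ... randomized search runs here ...
end_now = datetime.datetime.now()
print("elapsed:", end_now - start_now)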
Example #3
model_list.append(('RF_4', 'Random Forest Algorithm: PS-4',
                  trained_model, accuracy, conf_matrix,
                  class_report, kappa_score))

###############################################################################
######  Automatic tuning of parameter settings using RandomizedSearchCV   #####
###############################################################################

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Model setup
model = RandomForestClassifier()
parameters = {'max_depth' : sp_randInt(4, 10),
              'criterion' : ['gini', 'entropy'],
              'max_features' : ['auto', 'sqrt', 'log2'],
              'n_estimators' : sp_randInt(100, 1000),
              'min_impurity_decrease' : sp_randFloat(),
              }


random = RandomizedSearchCV(estimator = model,
                            param_distributions = parameters,
                            cv = KFold(n_splits=5),  # 5-fold CV splitter
                            n_iter = 10,
                            verbose = 1, n_jobs = 10)

random.fit(X_train, y_train)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Randomized search results
print("\n =========================================================")
print(" Random Search Results ")
print("============================================================")
Example #4
def main():

    dataset = pd.read_csv('tcd-ml-1920-group-income-train.csv')
    train = dataset.copy()
    train = train[:1044560]
    train = train.drop_duplicates(subset='Instance',
                                  keep='first',
                                  inplace=False)
    train = train.drop(columns=['Instance'])
    train = train.drop_duplicates(inplace=False)
    y = np.log(train['Total Yearly Income [EUR]'])

    train = train.drop(columns=[
        'Total Yearly Income [EUR]',
        'Hair Color',
        'Housing Situation',
        'Wears Glasses',
    ])

    train = changeSizeOfCity(train)
    train = degree(train)
    train = genderCleaning(train)
    train = bodyHeight(train)
    train = profession(train)
    train = satisfaction(train)
    train = work_experience(train)
    train = processAdditionToSalary(train)
    train = crime(train)

    #Encode categorical columns with target encoding
    print("Start Dummies")
    te = TargetEncoder()
    train[['Gender', 'Country', 'Profession',
           'University Degree']] = te.fit_transform(
               train[['Gender', 'Country', 'Profession', 'University Degree']],
               y)
    print("End Dummies")

    #catboost regressor creation
    regressor = CatBoostRegressor(od_type='IncToDec')

    #split data 80/20
    X_train, X_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.2)

    #create an evaluation dataset to stop overfitting
    eval_dataset = Pool(X_test, y_test)

    #randomized search for optimal parameters
    print('Fitting')
    parameters = {
        'depth': sp_randInt(3, 14),
        'learning_rate': sp_randFloat(),
        'iterations': sp_randInt(800, 1200)
    }

    randm = RandomizedSearchCV(estimator=regressor,
                               param_distributions=parameters,
                               cv=4,
                               n_iter=10,
                               n_jobs=5)
    randm.fit(X_train, y_train)

    # Results from Random Search
    print("\n========================================================")
    print(" Results from Random Search ")
    print("========================================================")

    print("\n s:\n", randm.best_estimator_)

    print("\n The best score across ALL searched params:\n", randm.best_score_)

    print("\n The best parameters across ALL searched params:\n",
          randm.best_params_)

    # new catboost model using best parameters
    regressor = CatBoostRegressor(
        iterations=randm.best_params_['iterations'],
        learning_rate=randm.best_params_['learning_rate'],
        depth=randm.best_params_['depth'],
        od_type='IncToDec',
        use_best_model=True)

    test_dataset = pd.read_csv('tcd-ml-1920-group-income-test.csv')

    predict_X = test_dataset
    X_train = train
    y_train = y
    predict_y = predict_X.pop('Total Yearly Income [EUR]')

    predict_X = year(predict_X)
    predict_X = predict_X.drop(columns=[
        'Instance',
        'Hair Color',
        'Housing Situation',
        'Wears Glasses',
    ])

    #clean test data
    predict_X = changeSizeOfCity(predict_X)
    predict_X = degree(predict_X)
    predict_X = genderCleaning(predict_X)
    predict_X = bodyHeight(predict_X)
    predict_X = profession(predict_X)
    predict_X = work_experience(predict_X)
    predict_X = processAdditionToSalary(predict_X)

    predict_X = satisfaction(predict_X)
    predict_X = crime(predict_X)

    predict_X[['Gender', 'Country', 'Profession',
               'University Degree']] = te.transform(
                   predict_X[[
                       'Gender', 'Country', 'Profession', 'University Degree'
                   ]], predict_y)
    X_train, predict_X = train.align(predict_X,
                                     join='outer',
                                     axis=1,
                                     fill_value=0)

    print('Fitting Test Data')
    regressor.fit(X_train, y_train, eval_set=eval_dataset)

    #predict on the trained model
    pred2 = regressor.predict(predict_X)
    output = pd.read_csv('tcd-ml-1920-group-income-submission.csv')
    instance = output['Instance']
    output.pop('Instance')
    a = pd.DataFrame.from_dict({
        'Instance': instance,
        'Total Yearly Income [EUR]': np.exp(pred2)
    })
    a.to_csv("tcd-ml-1920-group-income-submission.csv", index=False)

    # evaluate on the held-out split; np.exp undoes the log-transform applied to the target
    y_pred = regressor.predict(X_test)
    print('MAE is: {}'.format(
        mean_absolute_error(np.exp(y_test), np.exp(y_pred))))
Example #5
# parameters = {'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16],
#                   'subsample'    : [0.9, 0.5, 0.2, 0.1],
#                   'n_estimators' : [100,500,1000, 1500],
#                   'max_depth'    : [4,6,8,10]
#              }
GBR = GradientBoostingRegressor(random_state=1, verbose=1)
# grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = 2, n_jobs=-1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

score = 0
while score < 0.8:  # keep re-sampling parameters until the best CV score reaches 0.8
    # model = GradientBoostingRegressor()
    parameters = {'learning_rate': sp_randFloat(),
                  'subsample'    : sp_randFloat(),
                  'n_estimators' : sp_randInt(100, 1000),
                  'max_depth'    : sp_randInt(4, 10)
                 }

    grid_GBR = RandomizedSearchCV(estimator=GBR, param_distributions=parameters,
                                  cv=2, n_iter=10, n_jobs=-1)
    # features are shifted 24 steps behind POWER, presumably a 24-step-ahead forecast target
    grid_GBR.fit(df1_train[vars][:-24], df1_train['POWER'][24:])

    print(" Results from Random Search ")
    print("\n The best estimator across ALL searched params:\n", grid_GBR.best_estimator_)
    print("\n The best score across ALL searched params:\n", grid_GBR.best_score_)
    print("\n The best parameters across ALL searched params:\n", grid_GBR.best_params_)
    score = grid_GBR.best_score_
Example #6
                  class_report, kappa_score))

###############################################################################
######  Automatic tuning of parameter settings using RandomizedSearchCV   #####
###############################################################################

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Model setup
model = MLPClassifier()
parameters = {'activation' : ['relu', 'tanh'],
              'solver' : ['sgd', 'adam'],
              'learning_rate' : ['constant', 'adaptive'],
              'hidden_layer_sizes' : [100, 200, 300],
              'max_iter' : sp_randInt(50, 300),
              'batch_size' : [10, 30, 50, 70],
              'learning_rate_init' : sp_randFloat()
              }

random = RandomizedSearchCV(estimator = model, param_distributions = parameters,
                            cv = KFold(n_splits=5), n_iter = 10, verbose = 1, n_jobs = 10)
random.fit(X_train, y_train)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Randomized search results
print("\n =========================================================")
print(" Random Search Results ")
print("============================================================")
print("\n The best estimator :\n",
      grid.best_estimator_)
print("\n The best score :\n",
      grid.best_score_)
Example #7
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Collect performance results
model_list.append(('XGBoost_4', 'XGBoost Algorithm: PS-4',
                  trained_model, accuracy, conf_matrix,
                  class_report, kappa_score))

###############################################################################
######  Automatic tuning of parameter settings using RandomizedSearchCV   #####
###############################################################################

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Model setup
model = xgb.XGBClassifier()
parameters = {'max_depth' : sp_randInt(4, 10),
              'gamma' :sp_randFloat(),
              'learning_rate' : sp_randFloat(),
              'n_estimators' : sp_randInt(100, 1000)
              }

random = RandomizedSearchCV(estimator = model,
                            param_distributions = parameters,
                            cv = KFold(n_splits=5), verbose = 1, n_iter = 10)

random.fit(X_train, y_train)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Randomized search results
print("\n =========================================================")
print(" Random Search Results ")
print("============================================================")
Example #8
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    random_state=42)
print(x_train.shape)  # (3628, 110336)
print(x_test.shape)  # (908, 110336)
print(y_train.shape)  # (3628,)
print(y_test.shape)  # (908,)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model setup
model = GradientBoostingClassifier(verbose=1, random_state=42)
parameters = {
    "learning_rate": sp_randFloat(),
    "subsample": sp_randFloat(),
    "n_estimators": sp_randInt(100, 1000),
    "max_depth": sp_randInt(4, 10)
}
randm = RandomizedSearchCV(estimator=model,
                           param_distributions=parameters,
                           cv=2,
                           n_iter=10,
                           n_jobs=-1)
randm.fit(x_train, y_train)

print(" Results from Random Search ")
print("The best estimator across ALL searched params:", randm.best_estimator_)
print("The best score across ALL searched params:", randm.best_score_)
print(" The best parameters across ALL searched params:", randm.best_params_)
Example #9
    def run(self, trainingDataset, plotting):
        dataset = trainingDataset
        accuracy = 0
        train = dataset.copy()
        y = train['int_rate']
        train = train.drop(columns=[
            'int_rate',
        ])
        regressor = CatBoostRegressor(od_type='IncToDec')
        #split data 80/20
        X_train, X_test, y_train, y_test = train_test_split(train,
                                                            y,
                                                            test_size=0.2)

        #randomized-search parameter distributions (only used if the gridSearch call below is re-enabled)
        print('Fitting')
        parameters = {
            'depth': sp_randInt(6, 10),
            'learning_rate': sp_randFloat(),
            'iterations': sp_randInt(600, 1000)
        }

        regressor = CatBoostRegressor(iterations=643,
                                      learning_rate=0.9600690303599169,
                                      depth=6,
                                      od_type='IncToDec')

        bestParams = None
        #regressor,bestParams = self.gridSearch(regressor,X_train, y_train)

        if plotting == True:
            print('Fitting Test Data')
            regressor.fit(X_train, y_train)

            y_pred = regressor.predict(X_test)
            print(
                "###################################CatBoost#############################"
            )
            print('MAE is: {}'.format(
                mean_absolute_error(np.exp(y_test), np.exp(y_pred))))
            accuracy = r2_score(y_test, y_pred)
            if bestParams is not None:
                print(bestParams)
            #accuracy = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

        #predict test data
        else:
            regressor.fit(train, y)
            testData = pd.read_csv(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/SiameseTrainingData.csv"
            )
            predictions = regressor.predict(testData)
            np.savetxt(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/CatBoostPredictions.csv",
                predictions,
                delimiter=",")

            testData = pd.read_csv(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/OverallTestingData.csv"
            )
            predictions = regressor.predict(testData)
            np.savetxt(
                "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/CatBoostPredictionsTestData.csv",
                predictions,
                delimiter=",")

        return accuracy
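
A minimal usage sketch for run(); the CatBoostModel class name and the CSV path are assumptions, and the file must contain the int_rate target column used above:

import pandas as pd

wrapper = CatBoostModel()                       # hypothetical class that owns run()
training_df = pd.read_csv("training_data.csv")  # assumed path with an 'int_rate' column
r2 = wrapper.run(training_df, plotting=True)    # 80/20 split, prints MAE, returns R^2
print("R^2 on the held-out split:", r2)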
Example #10
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from util import load_data, preprocess, mk_trainset, metric
from make_var import make_variable
from clustering import clustering

# Run the parameter-related code for whichever model you are going to use (or just run all of it! : >)
# For the real run, set category=False and PCA=True in mk_trainset for every model except lightgbm.
# Set the model name and parameter names in random_search (also check the name the parameters are saved under inside random_search, so nothing gets overwritten!)
# Load the parameters the way shown at the very bottom.
# (A hedged sketch of wiring these distributions into a search follows the definitions below.)

n_estimators = [int(x) for x in range(10000, 50000, 5000)]
importance_type = ['split', 'gain']
lambda_l1 = sp_randFloat()
lambda_l2 = sp_randFloat()
max_depth = sp_randInt(3, 30)
depth = sp_randInt(3, 30)
min_child_samples = sp_randInt(1, 7)
min_data_in_leaf = sp_randInt(1, 7)
min_sum_hessian_in_leaf = sp_randInt(1, 10)
num_leaves = sp_randInt(10, 50)
bagging_fraction = sp_randFloat()
feature_fraction = sp_randFloat()
learning_rate = sp_randFloat()
max_bin = sp_randInt(low=0, high=30)
min_gain_to_split = sp_randFloat()
max_leaf_nodes = sp_randInt(10, 50)
min_samples_leaf = sp_randInt(2, 30)
min_samples_split = sp_randInt(2, 30)
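
A minimal sketch of wiring a subset of these distributions into a randomized search; the LGBMRegressor choice and the selected keys are illustrative, since the original delegates this to its own random_search helper:

from lightgbm import LGBMRegressor          # assumed model choice for this sketch

param_distributions = {
    'n_estimators': n_estimators,
    'learning_rate': learning_rate,
    'max_depth': max_depth,
    'num_leaves': num_leaves,
    'min_child_samples': min_child_samples,
    'subsample': bagging_fraction,
    'colsample_bytree': feature_fraction,
    'reg_lambda': lambda_l2,
}
search = RandomizedSearchCV(LGBMRegressor(), param_distributions,
                            n_iter=10, cv=3, n_jobs=-1)
# search.fit(X_train, y_train)  # training data comes from mk_trainset in the original project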
Example #11
# Visualize distribution of target attribute-
num, bins, patches = plt.hist(y, bins = int(np.ceil(np.sqrt(y.size))))
plt.show()

# Visualize distributions of all numeric attributes in features-
sns.boxplot(data = X_df)
plt.title("Pima diabetes: Boxplot distribution - numeric columns")
plt.show()


# Initialize a GradientBoostingRegressor model-
gbr = GradientBoostingRegressor()

# Specify parameters for hyper-parameter tuning-
parameters = {
    'learning_rate': sp_randFloat(),
    'subsample'    : sp_randFloat(),
    'n_estimators' : sp_randInt(100, 1000),
    'max_depth'    : sp_randInt(4, 10)
    }


'''
RandomizedSearchCV parameters-

1. estimator: here, we input the metric or the model for which we need to optimize the parameters.

2. param_distributions: here, we have to pass the dictionary of parameters that we need to optimize.

3. cv: here, we have to pass an integer value signifying the number of splits needed for cross
validation. By default it's 5.
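
A minimal sketch that puts the pieces above together; the scoring default, the random_state, and the fit call are assumptions, since the original snippet is cut off before its own search call:

from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(estimator=gbr, param_distributions=parameters,
                        cv=5, n_iter=10, n_jobs=-1, random_state=42)
# rs.fit(X_df, y)  # the Pima features/target loaded earlier in the script
# print(rs.best_params_, rs.best_score_)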
Example #12
import catboost as cb
from scipy.stats import randint as sp_randInt
from scipy.stats import uniform as sp_randFloat
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Integer, Real

SCORING_LIST = ["accuracy", "roc_auc", "f1"]

XGBOOST_RANDOMSEARCH_PARAMS = {
    "silent": [False],
    "max_depth": sp_randInt(6, 20),
    "learning_rate": sp_randFloat(0.01, 0.3),
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bylevel": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    "gamma": [0, 0.25, 0.5, 1.0],
    "reg_lambda": [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    "n_estimators": [200],
}
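
A minimal sketch of consuming the random-search constants above; the xgboost estimator, the scoring choice, and the fit data are assumptions (the original module presumably wires these up elsewhere):

import xgboost as xgb

search = RandomizedSearchCV(xgb.XGBClassifier(),
                            param_distributions=XGBOOST_RANDOMSEARCH_PARAMS,
                            n_iter=20, cv=3, scoring=SCORING_LIST[0],
                            n_jobs=-1, random_state=42)
# search.fit(X_train, y_train)  # training data assumed to come from the caller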

XGBOOST_BAYESSEARCH_PARAMS = {
    "silent": [False],
    "max_depth": Integer(6, 20),
    "learning_rate": Real(0.01, 0.3),
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bylevel": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    "gamma": [0, 0.25, 0.5, 1.0],