Example #1
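This and the following examples appear to rely on a common preamble that the source page does not show. A minimal sketch of the assumed imports is below; the rmse, mae, and mape metric helpers are likewise not shown, so the definitions here are plausible stand-ins rather than the originals.

# Assumed preamble -- not shown in the original source.
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import chi2_contingency
from sklearn.ensemble import (GradientBoostingClassifier,
                              GradientBoostingRegressor,
                              RandomForestClassifier,
                              RandomForestRegressor)


# rmse / mae / mape are project-level helpers; these are assumed stand-ins.
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2))


def mae(y_true, y_pred):
    return np.mean(np.abs(np.asarray(y_pred) - np.asarray(y_true)))


def mape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    return np.mean(np.abs((np.asarray(y_pred) - y_true) / y_true))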
def gbt_n_estimatior(maxnum, X, Y, xtest, ytest, fix_lr, bool_clf):

    # flatten the target into the 1-d shape sklearn expects
    tmpy = Y.reshape((len(Y),))
    score = []
    
    for trial_n in range(10,maxnum+1,10):
        
        if bool_clf == False:
            clf = GradientBoostingRegressor(n_estimators=trial_n, learning_rate=fix_lr)
        else:
            clf = GradientBoostingClassifier(n_estimators=trial_n, learning_rate=fix_lr)

        clf.fit( X, tmpy )
        
        pytest = clf.predict(xtest)

        if bool_clf == False:
            
            score.append((trial_n, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest)))
                
        else:
            # RMSE over the predicted labels, matching the original's choice
            score.append((trial_n, np.sqrt(np.mean((pytest - ytest) ** 2))))
            
    
    return min(score, key = lambda x: x[1]), score
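A hypothetical call to gbt_n_estimatior on synthetic regression data; the data and values below are illustrative only.

# Illustrative usage only -- synthetic data, assumed values.
rng = np.random.RandomState(0)
X, xtest = rng.randn(200, 5), rng.randn(50, 5)
Y = (2.0 * X[:, 0] + 0.1 * rng.randn(200)).reshape(-1, 1)
ytest = 2.0 * xtest[:, 0] + 0.1 * rng.randn(50)

# sweep n_estimators over {10, 20, ..., 100} at a fixed learning rate
best, all_scores = gbt_n_estimatior(100, X, Y, xtest, ytest,
                                    fix_lr=0.1, bool_clf=False)
print(best)   # (n_estimators, rmse, mae, mape) of the best trial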
Example #2
def rf_n_depth_estimatior(maxnum, maxdep, X, Y, xtest, ytest, bool_clf):
        
    score = []
        
    for n_trial in range(10, maxnum + 1, 10):
        for dep_trial in range(2, maxdep + 1):
            
            if bool_clf == True:
                clf = RandomForestClassifier(n_estimators = n_trial, max_depth = dep_trial, max_features = "sqrt")
            else:
                clf = RandomForestRegressor(n_estimators = n_trial, max_depth = dep_trial, max_features = "sqrt")
            
            clf.fit( X, Y )
        
            pytest = clf.predict(xtest)
            
            if bool_clf == False:
                
                score.append((n_trial, dep_trial, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest) ))
                
            else:
                # record mean accuracy on the test set for classifiers
                score.append((n_trial, dep_trial, clf.score(xtest, ytest)))
    
    return (min(score, key=lambda x: x[2]) if bool_clf == False
            else max(score, key=lambda x: x[2])), score
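A hypothetical call to this version of rf_n_depth_estimatior, here on a synthetic binary classification task (names and values illustrative):

# Illustrative usage only -- synthetic data, assumed values.
rng = np.random.RandomState(0)
X, xtest = rng.randn(200, 5), rng.randn(50, 5)
Y = (X[:, 0] + X[:, 1] > 0).astype(int)
ytest = (xtest[:, 0] + xtest[:, 1] > 0).astype(int)

best, all_scores = rf_n_depth_estimatior(50, 6, X, Y, xtest, ytest,
                                         bool_clf=True)
print(best)   # (n_estimators, max_depth, accuracy) of the best trial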
Example #3
def gbt_tree_para( X, Y, xtest, ytest, depth_range, fix_lr, fix_n_est, bool_clf ):
    
    tmpy = Y.reshape( (len(Y),) )
    score = []
    
    for trial_depth in depth_range:
        
        if bool_clf == False:
            clf = GradientBoostingRegressor(n_estimators=fix_n_est, learning_rate=fix_lr, max_depth=trial_depth)
        else:
            clf = GradientBoostingClassifier(n_estimators=fix_n_est, learning_rate=fix_lr, max_depth=trial_depth)
            
        clf.fit( X, tmpy )
        
        pytest = clf.predict(xtest)
        
        if bool_clf == False:
            
            score.append((trial_depth, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest)))
            
        else:
            score.append((trial_depth, clf.score(xtest, ytest)))
            
        
    return (min(score, key=lambda x: x[1]) if bool_clf == False
            else max(score, key=lambda x: x[1])), score
Example #4
def chi2(train, y):
    variable = []
    score = []
    p = []
    # X = train.select_dtypes(exclude=[np.number])
    X = train
    for i in X.columns.values.tolist():
        table = pd.crosstab(X[i], y['Attrition'])
        # run the test once per variable instead of twice
        stat, pval, _, _ = chi2_contingency(table)
        variable.append(i)
        score.append(stat)
        p.append(pval)
    result = pd.DataFrame({'Variable': variable, 'Score': score, 'P': p})
    print(result)
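A hypothetical call to chi2; note that the 'Attrition' column name is hard-coded inside the function, so the toy frame below mirrors it.

# Toy categorical data -- illustrative only; 'Attrition' matches the
# column name hard-coded in the function above.
train = pd.DataFrame({'OverTime': ['Yes', 'No', 'Yes', 'No', 'Yes', 'No'],
                      'Gender':   ['M', 'F', 'F', 'M', 'F', 'M']})
y = pd.DataFrame({'Attrition': ['Yes', 'No', 'Yes', 'No', 'No', 'No']})
chi2(train, y)   # prints Variable / Score / P for each column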
Example #5
def xgt_l2( fix_lr, fix_depth, fix_round, xtrain, ytrain, xtest, ytest, l2_range, bool_clf, num_class ):
    
    score = []
    xg_train = xgb.DMatrix(xtrain, label = ytrain)
    xg_test  = xgb.DMatrix(xtest,  label = ytest)

    # set up parameters for xgboost
    param = {}

    # softmax multi-class classification vs. a linear regression objective
    if bool_clf == True:
        param['objective'] = 'multi:softmax'
        param['num_class'] = num_class
    else:
        param['objective'] = "reg:linear"

    param['eta'] = fix_lr
    param['max_depth'] = fix_depth
    param['silent'] = 1
    param['nthread'] = 8

    param['lambda'] = 0.0
    # param['alpha']
    
    
    for l2_trial in l2_range:
        
        param['lambda'] = l2_trial
        
        bst = xgb.train(param, xg_train, fix_round )
        pytest = bst.predict( xg_test )
        
        if bool_clf == True:
            # fraction of correctly predicted labels
            tmp_accur = np.mean(pytest == ytest)
            score.append((l2_trial, tmp_accur))
        else:
            score.append((l2_trial, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest)))

    return (min(score, key=lambda x: x[1]) if bool_clf == False
            else max(score, key=lambda x: x[1])), score
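A hypothetical call to this version of xgt_l2 on synthetic regression data; parameter values are illustrative, and the objective/silent settings follow the original snippet (newer xgboost releases rename or drop them).

# Illustrative usage only -- synthetic data, assumed values.
rng = np.random.RandomState(0)
xtrain, xtest = rng.randn(300, 4), rng.randn(100, 4)
ytrain = 2.0 * xtrain[:, 0] + 0.1 * rng.randn(300)
ytest = 2.0 * xtest[:, 0] + 0.1 * rng.randn(100)

# sweep the L2 penalty (lambda) at fixed eta, depth and boosting rounds
best, all_scores = xgt_l2(0.1, 4, 50, xtrain, ytrain, xtest, ytest,
                          l2_range=[0.0, 0.1, 1.0, 10.0],
                          bool_clf=False, num_class=None)
print(best)   # (lambda, rmse, mae, mape) of the best trial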
Example #6
def gbt_tree_para(X, Y, xtest, ytest, depth_range, fix_lr, fix_n_est,
                  bool_clf):

    tmpy = Y.reshape((len(Y), ))
    score = []

    # best score so far: accuracy is maximized, RMSE minimized
    tmp_err = -np.inf if bool_clf else np.inf

    for i in depth_range:

        if bool_clf == False:

            clf = GradientBoostingRegressor(n_estimators=fix_n_est,
                                            learning_rate=fix_lr,
                                            max_depth=i,
                                            max_features='sqrt')
        else:
            clf = GradientBoostingClassifier(n_estimators=fix_n_est,
                                             learning_rate=fix_lr,
                                             max_depth=i,
                                             max_features='sqrt')

        clf.fit(X, tmpy)
        pytest = clf.predict(xtest)

        # regression
        if bool_clf == False:

            tmp_ts = np.sqrt(np.mean((pytest - ytest) ** 2))
            score.append((i, tmp_ts))

            if tmp_ts < tmp_err:
                best_pytest = pytest
                best_model = clf

                tmp_err = tmp_ts

        # classification
        else:

            tmp_ts = clf.score(xtest, ytest)
            score.append((i, tmp_ts))

            if tmp_ts > tmp_err:
                best_pytest = pytest
                best_model = clf

                tmp_err = tmp_ts

    return min(score, key = lambda x: x[1]) if bool_clf == False else max(score, key = lambda x: x[1]),\
           best_model, utils_evaluation_score(X, Y, bool_clf, best_model)
Example #7
def rf_n_depth_estimatior(maxnum, maxdep, X, Y, xtest, ytest, bool_clf):

    tmpy = Y
    score = []

    # best score so far: accuracy is maximized, RMSE minimized
    tmp_err = -np.inf if bool_clf else np.inf

    for n_trial in range(10, maxnum + 1, 10):
        for dep_trial in range(2, maxdep + 1):

            if bool_clf == True:
                clf = RandomForestClassifier(n_estimators=n_trial,
                                             max_depth=dep_trial,
                                             max_features="sqrt")
            else:
                clf = RandomForestRegressor(n_estimators=n_trial,
                                            max_depth=dep_trial,
                                            max_features="sqrt")

            clf.fit(X, tmpy)
            pytest = clf.predict(xtest)

            if bool_clf == False:
                tmp_ts = np.sqrt(np.mean((pytest - ytest) ** 2))
                score.append((n_trial, dep_trial, tmp_ts))

                if tmp_ts < tmp_err:
                    best_pytest = pytest
                    best_model = clf

                    tmp_err = tmp_ts

            else:
                tmp_ts = clf.score(xtest, ytest)
                score.append((n_trial, dep_trial, tmp_ts))

                if tmp_ts > tmp_err:
                    best_pytest = pytest
                    best_model = clf

                    tmp_err = tmp_ts

    return min(score, key = lambda x: x[2]) if bool_clf==False else max(score, key = lambda x: x[2]),\
           best_pytest, best_model, utils_evaluation_score(X, Y, bool_clf, best_model)
Example #8
def xgt_l2(fix_lr, fix_depth, fix_round, xtrain, ytrain, xtest, ytest,
           l2_range, bool_clf, num_class):

    score = []
    xg_train = xgb.DMatrix(xtrain, label=ytrain)
    xg_test = xgb.DMatrix(xtest, label=ytest)

    # setup parameters for xgboost
    param = {}

    # use softmax multi-class classification
    if bool_clf == True:
        param['objective'] = 'multi:softmax'
        param['num_class'] = num_class

    else:
        param['objective'] = "reg:linear"

    # scale weight of positive examples
    param['eta'] = fix_lr
    param['max_depth'] = fix_depth
    param['silent'] = 1
    param['nthread'] = 8

    param['lambda'] = 0.0
    # param['alpha']

    # best score so far: accuracy is maximized, RMSE minimized
    tmp_err = -np.inf if bool_clf else np.inf

    for l2_trial in l2_range:

        param['lambda'] = l2_trial

        model = xgb.train(param, xg_train, fix_round)

        pred = model.predict(xg_test)

        if bool_clf == True:

            tmplen = len(ytest)
            tmpcnt = 0.0

            for i in range(tmplen):
                if ytest[i] == pred[i]:
                    tmpcnt += 1

            tmp_accur = tmpcnt * 1.0 / tmplen

            if tmp_accur > tmp_err:

                best_model = model
                best_pytest = pred

                tmp_err = tmp_accur
        else:
            tmp_accur = np.sqrt(
                np.mean([(pred[i] - ytest[i])**2 for i in range(len(ytest))]))

            if tmp_accur < tmp_err:

                best_model = model
                best_pytest = pred

                tmp_err = tmp_accur

        score.append((l2_trial, tmp_accur))

    return min(score, key = lambda x: x[1]) if bool_clf == False else max(score, key = lambda x: x[1]),\
           best_model,\
           xgt_evaluation_score(xg_train, ytrain, bool_clf, best_model, False)
Example #9
def xgt_n_depth(lr, max_depth, max_round, xtrain, ytrain, xtest, ytest,
                bool_clf, num_class):

    score = []
    xg_train = xgb.DMatrix(xtrain, label=ytrain)
    xg_test = xgb.DMatrix(xtest, label=ytest)

    # setup parameters for xgboost
    param = {}

    # use softmax multi-class classification

    if bool_clf == True:
        param['objective'] = 'multi:softmax'
        param['num_class'] = num_class
    else:
        param['objective'] = "reg:linear"

    # learning rate; max_depth is a placeholder, set per trial below
    # param['gamma']
    param['eta'] = lr
    param['max_depth'] = 0
    param['silent'] = 1
    param['nthread'] = 8

    # best score so far: accuracy is maximized, RMSE minimized
    tmp_err = -np.inf if bool_clf else np.inf

    for depth_trial in range(2, max_depth):

        for num_round_trial in range(2, max_round):

            param['max_depth'] = depth_trial
            bst = xgb.train(param, xg_train, num_round_trial)
            pred = bst.predict(xg_test)

            if bool_clf == True:
                tmplen = len(ytest)
                tmpcnt = 0.0
                for i in range(tmplen):
                    if ytest[i] == pred[i]:
                        tmpcnt += 1
                tmp_accur = tmpcnt * 1.0 / tmplen

                if tmp_accur > tmp_err:
                    best_model = bst
                    best_pytest = pred

                    tmp_err = tmp_accur
            else:
                tmp_accur = np.sqrt(
                    np.mean([(pred[i] - ytest[i])**2 for i in range(len(ytest))]))

                if tmp_accur < tmp_err:
                    best_model = bst
                    best_pytest = pred

                    tmp_err = tmp_accur

            score.append((depth_trial, num_round_trial, tmp_accur))

    return min(score, key = lambda x: x[2]) if bool_clf == False else max(score, key = lambda x: x[2]),\
           best_model, xgt_evaluation_score(xg_train, ytrain, bool_clf, best_model, False)