import numpy as np
import pandas as pd
import xgboost as xgb

from math import sqrt
from scipy.stats import chi2_contingency
from sklearn.ensemble import (GradientBoostingRegressor, GradientBoostingClassifier,
                              RandomForestRegressor, RandomForestClassifier)


def gbt_n_estimatior(maxnum, X, Y, xtest, ytest, fix_lr, bool_clf):
    # Sweep the number of boosting stages (10, 20, ..., maxnum) at a fixed learning rate.
    tmpy = Y.reshape((len(Y), ))
    score = []
    cnt = len(xtest)

    for trial_n in range(10, maxnum + 1, 10):
        if bool_clf == False:
            clf = GradientBoostingRegressor(n_estimators = trial_n, learning_rate = fix_lr)
        else:
            clf = GradientBoostingClassifier(n_estimators = trial_n, learning_rate = fix_lr)

        clf.fit(X, tmpy)
        pytest = clf.predict(xtest)

        if bool_clf == False:
            score.append((trial_n, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest)))
        else:
            # label-space RMSE used as a proxy error for the classifier
            score.append((trial_n, np.sqrt(np.mean([(pytest[i] - ytest[i])**2 for i in range(cnt)]))))

    return min(score, key = lambda x: x[1]), score
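# The tuning helpers in this file call rmse / mae / mape, which are not defined here.
# Below is a minimal sketch of what they are assumed to compute (standard regression
# metrics on array-like inputs); the project's own helpers may differ.
def rmse(ytrue, ypred):
    # root mean squared error
    return np.sqrt(np.mean((np.asarray(ytrue) - np.asarray(ypred))**2))

def mae(ytrue, ypred):
    # mean absolute error
    return np.mean(np.abs(np.asarray(ytrue) - np.asarray(ypred)))

def mape(ytrue, ypred):
    # mean absolute percentage error (assumes no zero targets)
    ytrue = np.asarray(ytrue, dtype = float)
    ypred = np.asarray(ypred, dtype = float)
    return np.mean(np.abs((ytrue - ypred) / ytrue))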
def rf_n_depth_estimatior(maxnum, maxdep, X, Y, xtest, ytest, bool_clf):
    # Grid search over the number of trees and the maximum depth of a random forest.
    score = []

    for n_trial in range(10, maxnum + 1, 10):
        for dep_trial in range(2, maxdep + 1):
            if bool_clf == True:
                clf = RandomForestClassifier(n_estimators = n_trial, max_depth = dep_trial, max_features = "sqrt")
            else:
                clf = RandomForestRegressor(n_estimators = n_trial, max_depth = dep_trial, max_features = "sqrt")

            clf.fit(X, Y)
            pytest = clf.predict(xtest)

            if bool_clf == False:
                score.append((n_trial, dep_trial, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest)))
            else:
                score.append((n_trial, dep_trial, clf.score(xtest, ytest)))

    # minimize the error for regression, maximize the accuracy for classification
    return (min(score, key = lambda x: x[2]) if bool_clf == False else max(score, key = lambda x: x[2])), score
def gbt_tree_para(X, Y, xtest, ytest, depth_range, fix_lr, fix_n_est, bool_clf):
    # Sweep tree depth at a fixed learning rate and a fixed number of estimators.
    tmpy = Y.reshape((len(Y), ))
    score = []

    for trial_depth in depth_range:
        if bool_clf == False:
            clf = GradientBoostingRegressor(n_estimators = fix_n_est, learning_rate = fix_lr, max_depth = trial_depth)
        else:
            clf = GradientBoostingClassifier(n_estimators = fix_n_est, learning_rate = fix_lr, max_depth = trial_depth)

        clf.fit(X, tmpy)
        pytest = clf.predict(xtest)

        if bool_clf == False:
            score.append((trial_depth, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest)))
        else:
            score.append((trial_depth, clf.score(xtest, ytest)))

    # minimize the error for regression, maximize the accuracy for classification
    return (min(score, key = lambda x: x[1]) if bool_clf == False else max(score, key = lambda x: x[1])), score
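# A minimal, hypothetical usage sketch for the gradient-boosting sweep above. The synthetic
# regression data and parameter values are illustrative assumptions, not part of the
# original code; it relies on the rmse / mae / mape helpers sketched earlier.
def _gbt_tuning_example():
    rng = np.random.RandomState(0)
    w = rng.rand(4)
    X = rng.rand(200, 4)
    Y = (X.dot(w) + 0.1 * rng.randn(200)).reshape((-1, 1))
    xtest = rng.rand(50, 4)
    ytest = xtest.dot(w) + 0.1 * rng.randn(50)

    # sweep the number of boosting stages up to 100 for a regressor (bool_clf = False)
    best, all_scores = gbt_n_estimatior(100, X, Y, xtest, ytest, fix_lr = 0.1, bool_clf = False)
    print("best (n_estimators, rmse, mae, mape):", best)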
def chi2(train, y):
    # Chi-squared test of independence between each column of train and the 'Attrition' label.
    variable = []
    score = []
    p = []

    # X = train.select_dtypes(exclude=[np.number])
    X = train

    for i in X.columns.values.tolist():
        table = pd.crosstab(X[i], y['Attrition'])
        stat, pval = chi2_contingency(table)[:2]
        variable.append(i)
        score.append(stat)
        p.append(pval)

    result = pd.DataFrame({'Variable': variable, 'Score': score, 'P': p})
    print(result)
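# A minimal, hypothetical usage sketch for the chi-squared helper above. The frame and
# column names below are illustrative assumptions; only 'Attrition' comes from the
# function body.
def _chi2_usage_example():
    train_df = pd.DataFrame({'OverTime':   ['Yes', 'No', 'Yes', 'No'],
                             'Department': ['Sales', 'R&D', 'Sales', 'R&D']})
    label_df = pd.DataFrame({'Attrition': [1, 0, 1, 0]})
    chi2(train_df, label_df)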
def xgt_l2(fix_lr, fix_depth, fix_round, xtrain, ytrain, xtest, ytest, l2_range, bool_clf, num_class):
    # Sweep the L2 regularization term (lambda) of xgboost at fixed learning rate, depth and rounds.
    score = []
    xg_train = xgb.DMatrix(xtrain, label = ytrain)
    xg_test = xgb.DMatrix(xtest, label = ytest)

    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    if bool_clf == True:
        param['objective'] = 'multi:softmax'
        param['num_class'] = num_class
    else:
        param['objective'] = "reg:linear"

    # scale weight of positive examples
    param['eta'] = fix_lr
    param['max_depth'] = fix_depth
    param['silent'] = 1
    param['nthread'] = 8
    param['lambda'] = 0.0
    # param['alpha']

    for l2_trial in l2_range:
        param['lambda'] = l2_trial
        bst = xgb.train(param, xg_train, fix_round)
        pytest = bst.predict(xg_test)

        if bool_clf == True:
            tmplen = len(ytest)
            tmpcnt = 0.0
            for i in range(tmplen):
                if ytest[i] == pytest[i]:
                    tmpcnt += 1
            tmp_accur = tmpcnt * 1.0 / tmplen
            score.append((l2_trial, tmp_accur))
        else:
            score.append((l2_trial, rmse(ytest, pytest), mae(ytest, pytest), mape(ytest, pytest)))

    # minimize the error for regression, maximize the accuracy for classification
    return (min(score, key = lambda x: x[1]) if bool_clf == False else max(score, key = lambda x: x[1])), score
def gbt_tree_para(X, Y, xtest, ytest, depth_range, fix_lr, fix_n_est, bool_clf):
    # Sweep tree depth, track the best model, and return its evaluation scores.
    tmpy = Y.reshape((len(Y), ))
    score = []
    tmp_err = 0.0 if bool_clf else np.inf

    for i in depth_range:
        if bool_clf == False:
            clf = GradientBoostingRegressor(n_estimators = fix_n_est, learning_rate = fix_lr, max_depth = i,
                                            max_features = 'sqrt')
        else:
            clf = GradientBoostingClassifier(n_estimators = fix_n_est, learning_rate = fix_lr, max_depth = i,
                                             max_features = 'sqrt')

        clf.fit(X, tmpy)
        pytest = clf.predict(xtest)

        # regression: keep the depth with the lowest test RMSE
        if bool_clf == False:
            tmp_ts = sqrt(sum((pytest - ytest) * (pytest - ytest)) / len(ytest))
            score.append((i, tmp_ts))
            if tmp_ts < tmp_err:
                best_pytest = pytest
                best_model = clf
                tmp_err = tmp_ts
        # classification: keep the depth with the highest test accuracy
        else:
            tmp_ts = clf.score(xtest, ytest)
            score.append((i, tmp_ts))
            if tmp_ts > tmp_err:
                best_pytest = pytest
                best_model = clf
                tmp_err = tmp_ts

    return (min(score, key = lambda x: x[1]) if bool_clf == False else max(score, key = lambda x: x[1])), \
           best_model, utils_evaluation_score(X, Y, bool_clf, best_model)
def rf_n_depth_estimatior(maxnum, maxdep, X, Y, xtest, ytest, bool_clf):
    # Grid search over the number of trees and maximum depth, tracking the best random forest.
    tmpy = Y
    score = []
    tmp_err = 0.0 if bool_clf else np.inf

    for n_trial in range(10, maxnum + 1, 10):
        for dep_trial in range(2, maxdep + 1):
            if bool_clf == True:
                clf = RandomForestClassifier(n_estimators=n_trial, max_depth=dep_trial, max_features="sqrt")
            else:
                clf = RandomForestRegressor(n_estimators=n_trial, max_depth=dep_trial, max_features="sqrt")

            clf.fit(X, tmpy)
            pytest = clf.predict(xtest)

            # regression: keep the setting with the lowest test RMSE
            if bool_clf == False:
                tmp_ts = sqrt(sum((pytest - ytest) * (pytest - ytest)) / len(ytest))
                score.append((n_trial, dep_trial, tmp_ts))
                if tmp_ts < tmp_err:
                    best_pytest = pytest
                    best_model = clf
                    tmp_err = tmp_ts
            # classification: keep the setting with the highest test accuracy
            else:
                tmp_ts = clf.score(xtest, ytest)
                score.append((n_trial, dep_trial, tmp_ts))
                if tmp_ts > tmp_err:
                    best_pytest = pytest
                    best_model = clf
                    tmp_err = tmp_ts

    return (min(score, key = lambda x: x[2]) if bool_clf == False else max(score, key = lambda x: x[2])), \
           best_pytest, best_model, utils_evaluation_score(X, Y, bool_clf, best_model)
def xgt_l2(fix_lr, fix_depth, fix_round, xtrain, ytrain, xtest, ytest, l2_range, bool_clf, num_class):
    # Sweep the L2 regularization term (lambda), tracking the best boosted model.
    score = []
    xg_train = xgb.DMatrix(xtrain, label=ytrain)
    xg_test = xgb.DMatrix(xtest, label=ytest)

    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    if bool_clf == True:
        param['objective'] = 'multi:softmax'
        param['num_class'] = num_class
    else:
        param['objective'] = "reg:linear"

    # scale weight of positive examples
    param['eta'] = fix_lr
    param['max_depth'] = fix_depth
    param['silent'] = 1
    param['nthread'] = 8
    param['lambda'] = 0.0
    # param['alpha']

    tmp_err = 0.0 if bool_clf else np.inf

    for l2_trial in l2_range:
        param['lambda'] = l2_trial
        model = xgb.train(param, xg_train, fix_round)
        pred = model.predict(xg_test)

        # classification: keep the lambda with the highest test accuracy
        if bool_clf == True:
            tmplen = len(ytest)
            tmpcnt = 0.0
            for i in range(tmplen):
                if ytest[i] == pred[i]:
                    tmpcnt += 1
            tmp_accur = tmpcnt * 1.0 / tmplen
            if tmp_accur > tmp_err:
                best_model = model
                best_pytest = pred
                tmp_err = tmp_accur
        # regression: keep the lambda with the lowest test RMSE
        else:
            tmp_accur = np.sqrt(np.mean([(pred[i] - ytest[i])**2 for i in range(len(ytest))]))
            if tmp_accur < tmp_err:
                best_model = model
                best_pytest = pred
                tmp_err = tmp_accur

        score.append((l2_trial, tmp_accur))

    return (min(score, key = lambda x: x[1]) if bool_clf == False else max(score, key = lambda x: x[1])), \
           best_model, \
           xgt_evaluation_score(xg_train, ytrain, bool_clf, best_model, False)
def xgt_n_depth(lr, max_depth, max_round, xtrain, ytrain, xtest, ytest, bool_clf, num_class):
    # Grid search over tree depth and the number of boosting rounds for xgboost.
    score = []
    xg_train = xgb.DMatrix(xtrain, label=ytrain)
    xg_test = xgb.DMatrix(xtest, label=ytest)

    # setup parameters for xgboost
    param = {}
    # use softmax multi-class classification
    if bool_clf == True:
        param['objective'] = 'multi:softmax'
        param['num_class'] = num_class
    else:
        param['objective'] = "reg:linear"

    # scale weight of positive examples
    # param['gamma']
    param['eta'] = lr
    param['max_depth'] = 0
    param['silent'] = 1
    param['nthread'] = 8

    tmp_err = 0.0 if bool_clf else np.inf

    for depth_trial in range(2, max_depth):
        for num_round_trial in range(2, max_round):
            param['max_depth'] = depth_trial
            bst = xgb.train(param, xg_train, num_round_trial)
            pred = bst.predict(xg_test)

            # classification: keep the setting with the highest test accuracy
            if bool_clf == True:
                tmplen = len(ytest)
                tmpcnt = 0.0
                for i in range(tmplen):
                    if ytest[i] == pred[i]:
                        tmpcnt += 1
                tmp_accur = tmpcnt * 1.0 / tmplen
                if tmp_accur > tmp_err:
                    best_model = bst
                    best_pytest = pred
                    tmp_err = tmp_accur
            # regression: keep the setting with the lowest test RMSE
            else:
                tmp_accur = np.sqrt(np.mean([(pred[i] - ytest[i])**2 for i in range(len(ytest))]))
                if tmp_accur < tmp_err:
                    best_model = bst
                    best_pytest = pred
                    tmp_err = tmp_accur

            score.append((depth_trial, num_round_trial, tmp_accur))

    return (min(score, key = lambda x: x[2]) if bool_clf == False else max(score, key = lambda x: x[2])), \
           best_model, xgt_evaluation_score(xg_train, ytrain, bool_clf, best_model, False)
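# A minimal, hypothetical usage sketch for the xgboost tuning helpers above. The synthetic
# data, split sizes and hyper-parameter ranges are illustrative assumptions only, and the
# call assumes xgt_evaluation_score is defined elsewhere in the project.
def _xgt_tuning_example():
    rng = np.random.RandomState(0)
    X = rng.rand(200, 5)
    y = X.dot(rng.rand(5)) + 0.1 * rng.randn(200)
    xtrain, xtest = X[:150], X[150:]
    ytrain, ytest = y[:150], y[150:]

    # regression setting: bool_clf = False, num_class is ignored
    best_cfg, best_model, train_scores = xgt_n_depth(lr = 0.1, max_depth = 6, max_round = 20,
                                                     xtrain = xtrain, ytrain = ytrain,
                                                     xtest = xtest, ytest = ytest,
                                                     bool_clf = False, num_class = 0)
    print("best (depth, rounds, rmse):", best_cfg)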