def ModelPredict(para):
    """Score flat feature records with the stacked ensemble.

    ``para`` is consumed four values at a time as
    ``(R, angle, occusion, score)``; for example
    ``[7.16, 0.4, 0.31, 0.9, 2.7, 1.48, 0.78, 0.86]`` holds two records.
    Values after the last complete group of four are ignored.

    The four base models returned by ``LoadModel()`` predict from the three
    feature columns, their outputs become the inputs of the stacking model,
    and the stacked predictions are returned as a plain list. The mean
    absolute error against the supplied ``score`` values is printed.
    """
    record_width = 4
    data = pd.DataFrame(columns=('R', 'angle', 'occusion', 'score'))
    print("预测开始:")
    for row in range(int(len(para) / record_width)):
        offset = row * record_width
        print(offset)
        data.loc[row] = para[offset:offset + record_width]
    print(data)

    # Split the target column off; what remains in `data` is the feature set.
    y_test = data.pop('score')
    x_test = data
    print(x_test)
    print(y_test)

    cab, lgb, xgb, gbdt, stack_lr = LoadModel()
    print("加载完毕:")

    # Level-0 predictions, in the column order the stacking model expects.
    base_models = (
        ('Method_1', cab),
        ('Method_2', lgb),
        ('Method_3', xgb),
        ('Method_4', gbdt),
    )
    base_predictions = [
        (column, estimator.predict(x_test)) for column, estimator in base_models
    ]

    print("stack")
    stack_x_test = pd.DataFrame()
    for column, values in base_predictions:
        stack_x_test[column] = values

    stack_pred = stack_lr.predict(stack_x_test)
    print("stack_mae:", mean_absolute_error(y_test, stack_pred))
    result = stack_pred.tolist()
    print(result)
    return result
def main_gbmclassifier(datastruct, experiment_id=None):
    """Train a binary LightGBM model and log the run to MLflow.

    Args:
        datastruct: 5-tuple ``(df, train_x, train_y, test_x, test_y)``.
            ``df`` is only used to log its shape.
        experiment_id: Accepted for backward compatibility; not used.

    The run logs the training parameters, the ROC AUC on the test split
    and the elapsed training + prediction time.
    """
    print("Light GBM model")
    mlflow.set_experiment("Light GBM Experiments")

    df, train_x, train_y, test_x, test_y = datastruct
    train_data = lightgbm.Dataset(train_x, label=train_y)
    test_data = lightgbm.Dataset(test_x, label=test_y)

    metrics = {}
    with mlflow.start_run():
        print("Training model")
        start_timer = time.time()

        parameters = {
            'application': 'binary',
            'objective': 'binary',
            'metric': 'auc',
            'is_unbalance': 'true'
        }

        # lightgbm.train returns the fitted Booster. Predictions must come
        # from that object: the lightgbm module has no predict() function.
        model = lightgbm.train(parameters, train_data, valid_sets=test_data)
        pred_y = model.predict(test_x)

        stop_timer = time.time()
        print("Model trained")

        # The former 'roc_auc_logit' metric read a variable that only the
        # removed multi-model variant defined, so it raised NameError here.
        metrics['roc_auc'] = roc_auc_score(test_y, pred_y)
        metrics['elapsed_time'] = (stop_timer - start_timer)

        # mlflow logging
        mlflow.log_param('model_type', "LightGBM")
        mlflow.log_param('features', train_x.columns)
        mlflow.log_param('sample_size', df.shape)
        # Log the parameters this model was really trained with, instead of
        # the AdaBoost settings that are not defined in this function.
        for param_name, param_value in parameters.items():
            mlflow.log_param(param_name, param_value)
        mlflow.log_metrics(metrics)

    print("Completed")
def test_model(lgbm_model, data_dir):
    """Predict on the labelled EMBER test rows with a trained LightGBM model.

    Args:
        lgbm_model: Trained model exposing ``predict(features)``.
        data_dir: Directory passed to ``read_vectorized_features``.

    Returns:
        Predictions for every test row whose label is not ``-1``.
    """
    # Read data
    X_test, y_test = read_vectorized_features(data_dir, subset="test")

    # Filter unlabeled data (label -1)
    test_rows = (y_test != -1)
    test_features = X_test[test_rows]

    # Use the model that was passed in, not a module-level `lgb` name.
    return lgbm_model.predict(test_features)
def style_predict(palette):
    """Classify a colour palette as 'cute', 'fresh' or 'technology'.

    ``palette`` is sorted in place from lightest to darkest (HLS
    lightness). The channels of its first five colours are flattened into
    one feature vector and scored with the module-level ``gbm`` model.

    Returns the style name with the highest score, or ``-1`` when the
    best score is below ``EPS``.
    """
    def lightness(rgb):
        # colorsys returns (hue, lightness, saturation).
        return colorsys.rgb_to_hls(*rgb)[1]

    # build a color palette
    palette.sort(key=lightness, reverse=True)
    flattened = [
        channel for position in range(5) for channel in palette[position]
    ]
    features = pd.Series(flattened, dtype='float64')

    y_pred = gbm.predict(features, num_iteration=gbm.best_iteration)
    print("Probability:", y_pred)

    class_scores = y_pred.tolist()[0]
    best_score = max(class_scores)
    if best_score < EPS:
        return -1

    style_names = ['cute', 'fresh', 'technology']
    return style_names[class_scores.index(best_score)]
def evaluate_cb(**params):
    """Train LightGBM with the given hyper-parameters and return the AUC.

    The keyword arguments are merged into the module-level
    ``start_params`` (which is updated in place). The integer-valued
    parameters are cast to ``int`` first, since they may arrive as floats.

    Returns:
        ROC AUC of the trained model on the validation data, scored
        against ``y_cv``.
    """
    print('=' * 100)
    warnings.simplefilter('ignore')

    integer_params = ('max_depth', 'max_bin', 'min_data_in_leaf',
                      'bagging_freq', 'num_leaves')
    for name in integer_params:
        params[name] = int(params[name])

    start_params.update(params)
    print('Training with params: {}'.format(params))

    # lgb.train returns the fitted Booster; the lightgbm module itself has
    # no predict(), so the return value must be kept.
    booster = lgb.train(start_params,
                        trn_data,
                        20000,
                        valid_sets=[val_data],
                        early_stopping_rounds=300,
                        verbose_eval=False)

    # Booster.predict needs raw features, not a Dataset.
    # NOTE(review): get_data() requires val_data to have been built with
    # free_raw_data=False - confirm where val_data is constructed.
    val_pred = booster.predict(val_data.get_data())

    val_score = roc_auc_score(y_cv, val_pred)
    print("Val score: {:<8.5f}".format(val_score))
    return val_score
def predict(self):
    """Return predictions for the rows of ``self.test_df``.

    NOTE(review): this calls ``predict`` on the name ``lightgbm``. If that
    name is the LightGBM package, it has no module-level ``predict`` and
    the trained model object should be used instead - confirm what
    ``lightgbm`` is bound to in this file.
    """
    features = self.test_df.values
    return lightgbm.predict(features)
def m5_predict():
    """Produce the M5 submission by recursive day-by-day forecasting.

    For each multiplier in ``alphas`` the 28 forecast days starting at
    ``fday`` are predicted one after another: features for the day are
    rebuilt from the trailing ``max_lags`` days (which include earlier
    predictions) and the scaled prediction is written back into ``sales``.
    Each run is saved as ``submission_m5_<n>.csv``. The equally weighted
    average of all runs is duplicated with ``validation`` ids renamed to
    ``evaluation`` and saved as ``m5_18_submission.csv``.
    """
    df = create_dt(False)
    df.to_pickle('m5_test_data.pkl')

    useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]
    train_cols = df.columns[~df.columns.isin(useless_cols)]

    # NOTE: unpickling executes arbitrary code; only load trusted model files.
    model_path = '/pfs/auto-hpo/auto-hpo/output/model/gbm/18_model_train.pkl'
    with open(model_path, 'rb') as model_file:
        lgb = pickle.load(model_file)

    alphas = [1.035, 1.03, 1.025, 1.02]
    weights = [1 / len(alphas)] * len(alphas)
    cols = [f"F{i}" for i in range(1, 29)]
    sub = 0.

    for icount, (alpha, weight) in enumerate(zip(alphas, weights)):
        te = create_dt(False)

        for tdelta in range(0, 28):
            day = fday + timedelta(days=tdelta)
            print(icount, day)
            # Only the trailing window is needed to build lag features.
            window = (te.date >= day - timedelta(days=max_lags)) & (te.date <= day)
            tst = te[window].copy()
            create_fea(tst)
            tst = tst.loc[tst.date == day, train_cols]
            # magic multiplier by kyakovlev
            te.loc[te.date == day, "sales"] = alpha * lgb.predict(tst)

        # Reshape long (id, day) rows into one row per id with F1..F28.
        te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
        te_sub["F"] = [
            f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount() + 1
        ]
        te_sub = te_sub.set_index(["id", "F"]).unstack()["sales"][cols].reset_index()
        te_sub.fillna(0., inplace=True)
        te_sub.sort_values("id", inplace=True)
        te_sub.reset_index(drop=True, inplace=True)
        te_sub.to_csv(f"submission_m5_{icount}.csv", index=False)

        if icount == 0:
            sub = te_sub
            sub[cols] *= weight
        else:
            sub[cols] += te_sub[cols] * weight
        print(icount, alpha, weight)

    sub2 = sub.copy()
    # "validation$" is a regular expression; regex=True is required because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
    sub = pd.concat([sub, sub2], axis=0, sort=False)
    sub.to_csv("m5_18_submission.csv", index=False)
# X_score =np.delete(X_score,0,1)

# Combine the per-fold stacked predictions as a geometric mean: multiply the
# meta-model output of every entry in `models`, then take the nF-th root.
M = X_score.shape[0]
scores_fin = np.ones(M)
for m in models:
    # Each entry holds five base models followed by the meta-model (las2).
    ger, las, gbr, Enet, lgb, las2 = (m[position] for position in range(6))

    ger_predict = ger.predict(X_score)
    las_predict = las.predict(X_score)
    gbr_predict = gbr.predict(X_score)
    Enet_predict = Enet.predict(X_score)
    lgb_predict = lgb.predict(X_score)

    # Base predictions side by side; the leading placeholder column "A" is
    # removed again right after the conversion to an array.
    base_frames = [
        pd.DataFrame(values)
        for values in (ger_predict, las_predict, gbr_predict,
                       Enet_predict, lgb_predict)
    ]
    X_stack = pd.concat([pd.DataFrame({"A": []})] + base_frames, axis=1)
    X_stack = np.delete(np.array(X_stack), 0, 1)

    scores_fin = scores_fin * las2.predict(X_stack)

scores_fin = scores_fin ** (1 / nF)

# """
# ######################################################### Build the models #######################################################
# """
# x_train = all_data[:ntrain]
# x_test = all_data[ntrain:]
# n_folds = 5
#
# def rmsle_cv(model):
#     kf = KFold(n_folds, shuffle=True, random_state=42)
print(grid.best_params_) print(grid.best_score_) lgb_params['reg_alpha'] = grid.best_params['reg_alpha'] lgb_params['reg_lambda'] = grid.best_params['reg_lambda'] lgb_params['colsample_bytree'] = grid.best_params['colsample_bytree'] lgb_params['colsample_bytree'] = grid.best_params['colsample_bytree'] lgb_params['n_estimators'] = grid.best_params_['n_estimators'] lgb.set_params(**lgb_params) ''' X = train.drop(['target'],axis=1) test = test.drop(['target'],axis=1) Y = train['target'].values lgb.fit(X,Y,verbose=False) pred = lgb.predict(test) print(len(pred)) submission = pd.DataFrame({'ID' : range(0,len(pred)),'item_cnt_month': pred}) submission.to_csv(SUBMISSION_FILE,index=False) print('Process Complete {:.4f}'.format((time.time() - start_time)/60))
def main():
    """Train several regressors on the house-price data and write submissions.

    Steps:
      1. Load ``../data/train.csv`` and ``../data/test.csv``, drop the known
         outlier rows and log-transform ``SalePrice``.
      2. Run the combined train + test features through the transform
         pipeline.
      3. Grid-search seven regressors and write one
         ``submission_<name>.csv`` per model.
      4. Fit a stacking regressor and write ``submission_stacking.csv``.
      5. Find blend weights that minimise the training MSE and write the
         weighted blend to ``submission_average.csv``.
    """
    from scipy.optimize import minimize

    # --- load data ---
    train_set = pd.read_csv('../data/train.csv')
    test_set = pd.read_csv('../data/test.csv')
    # Without outlier remover, with basic nanRemover 0.12416413124809748

    # --- remove outliers ---
    # The detected rows are only printed; the hard-coded list is what is
    # actually dropped.
    detected_outliers = train_set[train_set['GrLivArea'] > 4500].index
    print(detected_outliers)
    outliers = [197, 523, 691, 854, 1182, 1298]
    train_set.drop(outliers, inplace=True)
    # With outlier remover 0.10970218665126451

    # --- fix SalePrice skewness ---
    train_set["SalePrice"] = np.log1p(train_set["SalePrice"])
    y_train_values = train_set["SalePrice"].values

    # --- prepare combined data ---
    test_set_id = test_set['Id']
    train_set_rows = train_set.shape[0]
    train_set.drop('Id', axis=1, inplace=True)
    test_set.drop('Id', axis=1, inplace=True)
    train_set.drop('SalePrice', axis=1, inplace=True)
    combined_data = pd.concat((train_set, test_set))

    # --- create data transform pipeline ---
    transform_pipeline = Pipeline(steps=[
        ('OutlierRemover', OutlierRemover()),
        ('NaNImputer', NaNImputer()),
        ('NaNRemover', NaNRemover()),
        ('AdditionalFeatureGenerator', AdditionalFeatureGenerator()),
        ('TypeTransformer', TypeTransformer()),
        ('ErrorImputer', ErrorImputer()),
        ('SkewFixer', SkewFixer()),
        ('Scaler', Scaler()),
        ('FeatureDropper', FeatureDropper()),
        ('Dummyfier', Dummyfier()),
    ])
    transformed_data = transform_pipeline.transform(combined_data)
    train_data = transformed_data[:train_set_rows]
    predict_data = transformed_data[train_set_rows:]

    # --- try various regressors ---
    rf_param = {
        # 'bootstrap': [True],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [3, 4, 5],
        'n_estimators': [5, 7, 10]
    }
    ls_param = {
        'alpha': [0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008],
        'max_iter': [10000],
        "normalize": [False]
    }
    elnet_param = {
        'alpha': [0.0003, 0.0004, 0.0005],
        'l1_ratio': [0.9, 0.95, 0.99, 1],
        'max_iter': [10000]
    }
    ridge_param = {'alpha': [10, 10.1, 10.2, 10.3, 10.4, 10.5]}
    svr_param = {
        'gamma': [1e-08, 1e-09],
        'C': [100000, 110000],
        'epsilon': [1, 0.1, 0.01]
    }
    gbm_param = {
        "n_estimators": [1000],
        'min_child_weight': [1, 5],
        'gamma': [0.1, 0.2],
        'subsample': [0.6],
        'colsample_bytree': [0.6],
        'max_depth': [3, 4],
        'eta': [0.01],
        'eval_metric': ['mae']
    }
    lgb_params = {
        'objective': ['regression'],
        'num_leaves': [255],
        'max_depth': [8],
        'bagging_seed': [3],
        'boosting_type': ['gbdt'],
        'min_sum_hessian_in_leaf': [100],
        'learning_rate': np.linspace(0.05, 0.1, 2),
        'bagging_fraction': np.linspace(0.7, 0.9, 2),
        'bagging_freq': np.linspace(30, 50, 3, dtype='int'),
        'max_bin': [15, 63],
    }

    rf = get_best_estimator(train_data, y_train_values,
                            estimator=RandomForestRegressor(),
                            params=rf_param, n_jobs=4)
    elnet = get_best_estimator(train_data, y_train_values,
                               estimator=ElasticNet(),
                               params=elnet_param, n_jobs=4)
    lso = get_best_estimator(train_data, y_train_values,
                             estimator=Lasso(),
                             params=ls_param, n_jobs=4)
    rdg = get_best_estimator(train_data, y_train_values,
                             estimator=Ridge(),
                             params=ridge_param, n_jobs=4)
    svr = get_best_estimator(train_data, y_train_values,
                             estimator=SVR(),
                             params=svr_param, n_jobs=4)
    gbm = get_best_estimator(train_data, y_train_values,
                             estimator=xgb.XGBRegressor(),
                             params=gbm_param, n_jobs=4)
    lbm = get_best_estimator(train_data, y_train_values,
                             estimator=lgb.LGBMRegressor(),
                             params=lgb_params, n_jobs=4)

    def cv_rmse(model):
        """Return the per-fold RMSE of ``model`` under 5-fold shuffled CV."""
        kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse = np.sqrt(-cross_val_score(model,
                                        train_data,
                                        y_train_values,
                                        scoring="neg_mean_squared_error",
                                        cv=kfolds))
        return rmse

    def write_submission(filename, sale_prices):
        """Write an Id / SalePrice submission file."""
        submission = pd.DataFrame({"Id": test_set_id, "SalePrice": sale_prices})
        submission.to_csv(filename, index=False)

    # One submission per tuned model. Test-set predictions (converted back
    # from log scale) are computed once and reused for the blend below.
    named_models = [('rf', rf), ('elnet', elnet), ('lso', lso), ('rdg', rdg),
                    ('svr', svr), ('gbm', gbm), ('lbm', lbm)]
    test_prices = []
    for name, regressor in named_models:
        prices = np.expm1(regressor.predict(predict_data))
        test_prices.append(prices)
        write_submission('submission_{}.csv'.format(name), prices)

    # --- stacking ---
    model = StackingRegressor(regressors=[rf, elnet, lso, rdg, svr],
                              meta_regressor=Lasso(alpha=0.0005))
    # Fit the model on our data
    model.fit(train_data, y_train_values)
    print("StackingRegressor model rmse : ", cv_rmse(model).mean())

    # Predict test set and export submission data
    ensembled = np.expm1(model.predict(predict_data))
    write_submission('submission_stacking.csv', ensembled)

    # --- ensemble weights ---
    regressors = [regressor for _, regressor in named_models]
    predictions = [clf.predict(train_data) for clf in regressors]

    def mse_func(weights):
        # scipy minimize will pass the weights as a numpy array
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight * prediction
        return mean_squared_error(y_train_values, final_prediction)

    starting_values = [0.5] * len(predictions)  # minimize needs a starting value
    bounds = [(0, 1)] * len(predictions)  # weights are bound between 0 and 1
    res = minimize(mse_func, starting_values, bounds=bounds, method='SLSQP')

    print('Result Assessment: {message_algo}'.format(
        message_algo=res['message']))
    print('Ensemble Score: {best_score}'.format(best_score=res['fun']))
    print('Best Weights: {weights}'.format(weights=res['x']))

    # Weighted blend of all seven models. The last term previously called
    # predict() on the lightgbm module (`lgb`) instead of the fitted `lbm`.
    sale_price_ensemble = sum(
        prices * weight for prices, weight in zip(test_prices, res['x']))
    write_submission('submission_average.csv', sale_price_ensemble)
def _lgb_engineer_features(df):
    """Return a copy of ``df`` with the cleaned and derived feature columns."""
    df = df.copy()

    # Missing categorical codes become -1 so they can be cast to int.
    for column in ('bodyType', 'model', 'fuelType', 'gearbox'):
        df[column] = df[column].replace(np.nan, -1).astype(int)
    df['notRepairedDamage'] = df['notRepairedDamage'].replace('-', -1)

    df['name_count'] = df.groupby(['name'])['SaleID'].transform('count')

    # Keep only the year (first four characters) of both dates.
    df['creatDate'] = df['creatDate'].astype(str).str[0:4]
    df['regDate'] = df['regDate'].astype(str).str[0:4]
    df['used_year'] = df['creatDate'].astype(int) - df['regDate'].astype(int)

    # Cap power at 600.
    df['power'] = df['power'].map(lambda x: 600 if x > 600 else x)

    # Manual one-hot encoding of the two categorical codes.
    for code in (0, 1, 2, 3, 4, 5, 6, 7, -1):
        df['bodyType_{}'.format(code)] = (df['bodyType'] == code).astype(int)
    for code in (0, 1, 2, 3, 4, 5, 6, -1):
        df['fuelType_{}'.format(code)] = (df['fuelType'] == code).astype(int)
    return df


def _lgb_add_kilometer_power_stats(df):
    """Left-join per-``kilometer`` aggregate statistics of ``power`` onto ``df``."""
    key, value = 'kilometer', 'power'
    prefix = key + '_' + value + '_'
    # Named aggregation: the dict-renaming form of SeriesGroupBy.agg was
    # removed in pandas 1.0.
    stats = df.groupby(key)[value].agg(**{
        prefix + 'count': 'count',
        prefix + 'max': 'max',
        prefix + 'median': 'median',
        prefix + 'min': 'min',
        prefix + 'sum': 'sum',
        prefix + 'std': 'std',
        prefix + 'mean': 'mean',
    }).reset_index()
    return pd.merge(df, stats, on=key, how='left')


def lgb(df_train, df_test):
    """Train a LightGBM price regressor and predict prices for ``df_test``.

    Both frames get the same feature engineering. The anonymous ``v_*``
    features, ``used_year`` and ``name_count`` are standardised with a
    scaler fitted on the training data only, and the target ``price`` is
    standardised for training and transformed back for the output.

    Args:
        df_train: Training frame including the ``price`` column.
        df_test: Test frame with the same feature columns.

    Returns:
        DataFrame with columns ``SaleID`` and ``price1`` (predicted price).
    """
    import lightgbm as lgbm

    base_cols = [
        'SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
        'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
        'seller', 'offerType', 'creatDate', 'bodyType_0', 'bodyType_1',
        'bodyType_2', 'bodyType_3', 'bodyType_4', 'bodyType_5', 'bodyType_6',
        'bodyType_7', 'bodyType_-1', 'fuelType_0', 'fuelType_1', 'fuelType_2',
        'fuelType_3', 'fuelType_4', 'fuelType_5', 'fuelType_6', 'fuelType_-1'
    ]
    scaled_cols = [
        'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9',
        'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'used_year', 'name_count'
    ]
    non_feature_cols = [
        'SaleID', 'regDate', 'creatDate', 'regionCode', 'name', 'offerType',
        'seller'
    ]

    train = _lgb_engineer_features(df_train)
    test = _lgb_engineer_features(df_test)

    X_scaler = StandardScaler()
    Y_scaler = StandardScaler()

    # Fit the feature scaler on the training data only and reuse it for the
    # test data; re-fitting on the test set would put the two sets on
    # different scales.
    train_scaled = pd.DataFrame(X_scaler.fit_transform(train[scaled_cols]),
                                columns=scaled_cols,
                                index=train.index)
    test_scaled = pd.DataFrame(X_scaler.transform(test[scaled_cols]),
                               columns=scaled_cols,
                               index=test.index)
    train_Y = Y_scaler.fit_transform(train[['price']]).ravel()

    # Passing index= above keeps the concat aligned even when the input
    # frames do not carry a default RangeIndex.
    train = pd.concat([train[base_cols], train_scaled], axis=1)
    test = pd.concat([test[base_cols], test_scaled], axis=1)

    train = _lgb_add_kilometer_power_stats(train)
    test = _lgb_add_kilometer_power_stats(test)

    train_X = train.drop(labels=non_feature_cols, axis=1).values
    test_X = test.drop(labels=non_feature_cols, axis=1).values

    # Only 80% of the data is used for fitting, as before; the held-out
    # part is not evaluated.
    x_train, _, y_train, _ = train_test_split(train_X, train_Y, test_size=0.2)

    model_lgbm = lgbm.LGBMRegressor(n_estimators=10000,
                                    learning_rate=0.02,
                                    boosting_type='gbdt',
                                    objective='regression_l1',
                                    max_depth=-1,
                                    num_leaves=31,
                                    min_child_samples=20,
                                    feature_fraction=0.8,
                                    bagging_freq=1,
                                    bagging_fraction=0.8,
                                    lambda_l2=2,
                                    random_state=2020,
                                    metric='mae')
    model_lgbm.fit(x_train, y_train)

    # StandardScaler.inverse_transform expects a 2-D array, so reshape the
    # 1-D predictions to a column and flatten the result again.
    scaled_pred = model_lgbm.predict(test_X)
    prices = Y_scaler.inverse_transform(scaled_pred.reshape(-1, 1)).ravel()

    df_out = pd.DataFrame(data=None)
    df_out['SaleID'] = df_test['SaleID']
    df_out['price1'] = prices
    return df_out[['SaleID', 'price1']]
def testLightGBM(self):
    """Predict on ``self.val_data`` and print the RMSE against ``self.val_labels``.

    The predictions are stored on ``self.predicted_labels``.

    NOTE(review): ``lgb`` is not defined in this method; if it is the
    LightGBM package rather than a fitted model, ``lgb.predict`` does not
    exist - confirm what the name is bound to.
    """
    self.predicted_labels = lgb.predict(self.val_data)
    score = rmse(self.predicted_labels, self.val_labels)
    print("LightGBM score " + str(score))
'max_depth': 7, 'learning_rate': 0.05, 'max_bin': 200 } param['metric'] = ['auc', 'binary_logloss'] num_round = 50 from datetime import datetime start = datetime.now() lgb = lgb.train(param, train_dataset, num_round) stop = datetime.now() execution_time_lgb = stop - start print('--' * 20, execution_time_lgb, '--' * 20) ypred2 = lgb.predict(x_test) print(ypred2) for i in range(ypred2.shape[0]): if ypred2[i] > 0.5: ypred2[i] = 1 else: ypred2[i] = 0 lgb_xgb = accuracy_score(y_test, ypred2) print(confusion_matrix(y_test, ypred2)) #||---------------------------------------------------------------------------------------------------------------- from sklearn.metrics import roc_auc_score xgb_auc = roc_auc_score(y_test, ypred)
list(trains['comment_text']) + list(tests['comment_text'])) word_index = tokenizer.word_index train_X = tokenizer.texts_to_sequences(train['comment_text']) test_X = tokenizer.texts_to_sequences(tests['comment_text']) train_X = pad_sequences(train_X, maxlen=220) test_X = pad_sequences(test_X, maxlen=220) train_X = np.hstack([train_X, other_trains_1]) test_X = np.hstack([test_X, other_trains_2]) from sklearn.model_selection import StratifiedKFold params = { 'max_depth': -1, 'n_estimators': 1000, 'learning_rate': 0.05, 'num_leaves': 2**9 - 1, 'colsample_bytree': 0.28, 'objective': 'binary', 'n_jobs': -1, 'eval_metric': 'auc' } import lightgbm as lgb xtrain = lgb.Dataset(train_X, label_train) num_round = 10000 lgb = lgb.train(params, xtrain, num_round) yp = lgb.predict(test_X) from sklearn.metrics import roc_auc_score, f1_score print(roc_auc_score(list(label_test.values), list(yp)))
y, test_size=0.2, random_state=42) # Dataset lgb_train = lgb.Dataset(data=x_train, label=y_train) lgb_eval = lgb.Dataset(data=x_val, label=y_val) lgb = lgb.train(train_set=lgb_train, valid_sets=lgb_eval, params=params, verbose_eval=200, early_stopping_rounds=early_stopping_rounds, num_boost_round=num_boost_round) y_pred = lgb.predict(x_val) score = log_loss(y_val, y_pred) logger.info(f'Fold No: {n_fold} | {metric}: {score}') logger.info(f"Train Shape: {x_train.shape}") for thresh in np.arange(0.1, 0.301, 0.01): thresh = np.round(thresh, 2) f1 = f1_score(y_val, (y_pred > thresh).astype(int)) logger.info(f"F1 score at threshold {thresh} is {f1}") test_pred = lgb.predict(test) if len(prediction) == 0: prediction = test_pred else: prediction += test_pred
print(best_params)

### Training
params['learning_rate'] = 0.01
# lgb.train returns the fitted Booster. Keep it: the lightgbm module itself
# has neither predict() nor best_iteration.
booster = lgb.train(
    params,  # parameter dict
    lgb_train,  # training set
    valid_sets=lgb_eval,  # validation set
    num_boost_round=2000,  # maximum boosting rounds
    early_stopping_rounds=50  # early-stopping patience
)

### Offline prediction
print("线下预测")
# Predicted probabilities at the best iteration
preds_offline = booster.predict(offline_test_X,
                                num_iteration=booster.best_iteration)
# .copy() so the new column is written to an independent frame, not to a
# slice of offline_test.
offline = offline_test[['instance_id', 'is_trade']].copy()
offline['preds'] = preds_offline
offline['is_trade'] = offline['is_trade'].astype(np.float64)
print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))

### Online prediction
print("线上预测")
# Predicted probabilities at the best iteration
preds_online = booster.predict(online_test_X,
                               num_iteration=booster.best_iteration)
online = online_test[['instance_id']].copy()
online['preds'] = preds_online
online.rename(columns={'preds': 'predicted_score'}, inplace=True)  # rename column
online.to_csv("./data/20180405.txt", index=False, sep=' ')  # save the result

### Save the model
verbose=100, eval_set=(X_test[predictors], y_test)) from sklearn.metrics import mean_squared_error print("线下误差:{}".format(0.5 * mean_squared_error( y_test, self.model.predict(X_test[predictors])))) return self def predict(self, X): # 对测试集进行预测,传入模型,和测试数据 fea_test = pd.read_csv("./feature/fea_test.csv") fea_test1 = pd.read_csv("./feature/fea_test_1.csv") fea_test2 = pd.read_csv("./feature/fea_test_2.csv") X = pd.merge(X, fea_test, how="left", on="id") X = pd.merge(X, fea_test1, how="left", on="id") X = pd.merge(X, fea_test2, how="left", on="id") X = self.__make_feature(test=X, train=pd.DataFrame()) predictors = [f for f in list(X.columns) if f not in self.no_use] test_pred = self.model.predict(X[predictors]) print("最大值:{}".format(test_pred.max())) return test_pred def get_params(self): return self.params biorad = b_model() train = pd.read_csv("../raw_data/d_train.csv", encoding="gbk") test = pd.read_csv("../raw_data/d_test_A.csv", encoding="gbk") lgb = biorad.fit(train) lgb.predict(test)
def train():
    """Fit every model of ``ML_model`` and print its macro-averaged test scores.

    The models are built one after another in the order rf, gboost, svm,
    xgboost, lgb, stacking. Each predicts on the test set once, and the
    macro F1, precision and recall are printed, followed by a separator
    line.
    """
    import warnings

    train_data, train_label, valid, valid_label, test, test_label = get_allData()
    model = ML_model(train_data, valid, train_label, valid_label)
    warnings.filterwarnings("ignore")

    separator = "----------------------------------------------------------------------------------------"
    # (name used in the printed report, factory that returns the fitted model)
    model_builders = (
        ("rf", model.rf),
        ("gboost", model.gboost),
        ("svm", model.svm),
        ("xbg", model.xgboost),
        ("lgb", model.lgb),
        ("stack", model.stacking),
    )
    metric_functions = (
        ("f1", f1_score),
        ("precision_score", precision_score),
        ("recall_score", recall_score),
    )

    for model_name, build in model_builders:
        fitted = build()
        # Predict once and reuse the result for all three metrics.
        predicted = fitted.predict(test)
        for metric_name, metric in metric_functions:
            print(
                "the model is {} and the test's {} is: ".format(
                    model_name, metric_name),
                metric(test_label, predicted, average="macro"))
        print(separator)