def test_plot_importance(self):
    """Exercise ``lgb.plot_importance`` on a Booster and on an LGBMClassifier.

    Three plots are produced and checked: default labels, custom labels with a
    single bar colour, and suppressed labels with a list of bar colours.
    """

    def check_axes(axes, title, xlabel, ylabel):
        # Assertions shared by every plot: type, the three labels, bar count.
        self.assertIsInstance(axes, matplotlib.axes.Axes)
        self.assertEqual(axes.get_title(), title)
        self.assertEqual(axes.get_xlabel(), xlabel)
        self.assertEqual(axes.get_ylabel(), ylabel)
        self.assertLessEqual(len(axes.patches), 30)

    # 1) Booster with the default title and axis labels.
    booster = lgb.train(self.params, self.train_data, num_boost_round=10)
    default_axes = lgb.plot_importance(booster)
    check_axes(default_axes, 'Feature importance', 'Feature importance', 'Features')

    # 2) scikit-learn wrapper, custom labels, every bar drawn in red.
    classifier = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    classifier.fit(self.X_train, self.y_train)
    red_axes = lgb.plot_importance(classifier, color='r', title='t', xlabel='x', ylabel='y')
    check_axes(red_axes, 't', 'x', 'y')
    for bar in red_axes.patches:
        self.assertTupleEqual(bar.get_facecolor(), (1., 0, 0, 1.))  # red

    # 3) Labels disabled with None; the first four bars follow the colour list.
    cycled_axes = lgb.plot_importance(booster, color=['r', 'y', 'g', 'b'],
                                      title=None, xlabel=None, ylabel=None)
    check_axes(cycled_axes, '', '', '')
    expected_rgba = [
        (1., 0, 0, 1.),       # r
        (.75, .75, 0, 1.),    # y
        (0, .5, 0, 1.),       # g
        (0, 0, 1., 1.),       # b
    ]
    for position, rgba in enumerate(expected_rgba):
        self.assertTupleEqual(cycled_axes.patches[position].get_facecolor(), rgba)
def test_plot_importance(self):
    """Exercise ``lgb.plot_importance`` on a Booster and on an LGBMClassifier.

    Both models are trained on a 90% split of the breast-cancer dataset. Three
    plots are checked: default labels, custom labels with a single bar colour,
    and suppressed labels with a list of bar colours.
    """
    # ``return_X_y`` is passed by keyword: positional use is rejected by
    # current scikit-learn releases, while the keyword form works everywhere.
    X_train, _, y_train, _ = train_test_split(*load_breast_cancer(return_X_y=True),
                                              test_size=0.1, random_state=1)
    train_data = lgb.Dataset(X_train, y_train)
    params = {
        "objective": "binary",
        "verbose": -1,
        "num_leaves": 3
    }

    def check_axes(axes, title, xlabel, ylabel):
        # Assertions shared by every plot: type, the three labels, bar count.
        self.assertIsInstance(axes, matplotlib.axes.Axes)
        self.assertEqual(axes.get_title(), title)
        self.assertEqual(axes.get_xlabel(), xlabel)
        self.assertEqual(axes.get_ylabel(), ylabel)
        self.assertLessEqual(len(axes.patches), 30)

    # 1) Booster with the default title and axis labels.
    gbm0 = lgb.train(params, train_data, num_boost_round=10)
    ax0 = lgb.plot_importance(gbm0)
    check_axes(ax0, 'Feature importance', 'Feature importance', 'Features')

    # 2) scikit-learn wrapper, custom labels, every bar drawn in red.
    gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
    gbm1.fit(X_train, y_train)
    ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
    check_axes(ax1, 't', 'x', 'y')
    for patch in ax1.patches:
        self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red

    # 3) Labels disabled with None; the first four bars follow the colour list.
    ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'],
                              title=None, xlabel=None, ylabel=None)
    check_axes(ax2, '', '', '')
    expected_rgba = [
        (1., 0, 0, 1.),       # r
        (.75, .75, 0, 1.),    # y
        (0, .5, 0, 1.),       # g
        (0, 0, 1., 1.),       # b
    ]
    for position, rgba in enumerate(expected_rgba):
        self.assertTupleEqual(ax2.patches[position].get_facecolor(), rgba)
def GBDT_test(data, fold_n, num_rounds=100000, bf=1, ff=1):
    """Train one fold with LiteMORT or LightGBM and score it.

    The backend is chosen by the module-level ``isMORT`` flag. The LightGBM
    path also saves a feature-importance plot under ``./results/`` (file name
    built from the module-level ``dataset``) and collects per-feature
    importances.

    Args:
        data: object exposing ``X_train``/``y_train``, ``X_valid``/``y_valid``,
            ``X_test``/``y_test`` and a ``problem()`` method.
        fold_n: value written to the ``"fold"`` column of the importance frame.
        num_rounds: passed to the model as ``n_estimators``.
        bf: passed as ``bagging_fraction``.
        ff: passed as ``feature_fraction``.

    Returns:
        ``(acc_, fold_importance)``. ``acc_`` is the mean squared error on the
        test split when ``data.X_test`` is not None; otherwise it is the
        LightGBM best validation score. ``fold_importance`` is a DataFrame on
        the LightGBM path and ``None`` on the LiteMORT path.
    """
    # NOTE(review): the original indentation was lost; this layout assumes the
    # plotting/importance code belongs to the LightGBM branch because it uses
    # lgb-only APIs and the 'training'/'valid_1' eval-set names.
    model_type = "mort" if isMORT else "lgb"
    nFeatures = data.X_train.shape[1]
    early_stop = 100
    verbose_eval = 20

    # Both problem types share every setting except objective and metric.
    if data.problem() == "classification":
        objective, metric = "binary", 'auc'
    else:
        objective, metric = "regression", 'l2'
    params = {
        "objective": objective, "metric": metric, 'n_estimators': num_rounds,
        "bagging_fraction": bf, "feature_fraction": ff, 'verbose_eval': verbose_eval,
        "early_stopping_rounds": early_stop, 'n_jobs': -1,
    }
    print(f"====== GBDT_test\tparams={params}")

    X_train, y_train = data.X_train, data.y_train
    X_valid, y_valid = data.X_valid, data.y_valid
    if not np.isfortran(X_train):
        # Very important: LiteMORT needs COLUMN-MAJOR input.
        X_train = np.asfortranarray(X_train)
        X_valid = np.asfortranarray(X_valid)
    print(f"GBDT_test\ttrain={X_train.shape} valid={X_valid.shape}")

    # Bound up front so the final ``return`` cannot hit an unbound name on the
    # LiteMORT path or when there is no test split.
    acc_ = None
    fold_importance = None

    if model_type == 'mort':
        params['verbose'] = 667
        model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

    if model_type == 'lgb':
        if data.problem() == "classification":
            model = lgb.LGBMClassifier(**params)
        else:
            model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  verbose=min(num_rounds // 10, 1000))
        # The test prediction that used to sit here ran before the
        # ``X_test is not None`` guard below and was then recomputed.

        lgb.plot_importance(model, max_num_features=32)
        # NOTE(review): title text looks garbled ("Feature importances"?); left as-is.
        plt.title("Featurertances")
        plt.savefig(f"./results/{dataset}_feat_importance_.jpg")
        plt.close()

        fold_importance = pd.DataFrame()
        fold_importance["importance"] = model.feature_importances_
        fold_importance["feature"] = list(range(nFeatures))
        fold_importance["fold"] = fold_n
        print('best_score', model.best_score_)
        # 'training' / 'valid_1' are the names of the two eval sets passed to fit().
        acc_ = model.best_score_['valid_1'][metric]

    if data.X_test is not None:
        pred_val = model.predict(data.X_test)
        # Mean squared error; the disabled relative-error variant was
        # ||y - pred|| / ||y||.
        acc_ = ((data.y_test - pred_val) ** 2).mean()
        print(f'====== Best step: test={data.X_test.shape} ACCU@Test={acc_:.5f}')
    return acc_, fold_importance
# Build the submission: one row per visitor with the log of summed revenue.
sub_df = pd.DataFrame({"fullVisitorId":test_id})
pred_test[pred_test<0] = 0  # clamp negative predictions to zero
# Undo the log1p scale before summing per visitor, then re-apply it.
sub_df["PredictedLogRevenue"] = np.expm1(pred_test)
sub_df = sub_df.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
sub_df.columns = ["fullVisitorId", "PredictedLogRevenue"]
sub_df["PredictedLogRevenue"] = np.log1p(sub_df["PredictedLogRevenue"])
sub_df.to_csv("baseline_lgb.csv", index=False)

# In[ ]:

sub_df.head()

# **Feature Importance:**
#
# Now let us have a look at the important features of the LightGBM model.

# In[ ]:

fig, ax = plt.subplots(figsize=(12,18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize=15)
plt.show()

# "totals.pageviews" turns out to be the most important feature, followed by
# "totals.hits" and "visitStartTime".

# **More to come. Stay tuned!**
train_y, test_y = y[tr_idx], y[val_idx] # Datasetに入れて学習させる lgb_train = lgb.Dataset(train_x, train_y) lgb_valid = lgb.Dataset(test_x, test_y, reference=lgb_train) # Training model = lgb.train(light_params, lgb_train, num_boost_round=3000, early_stopping_rounds=50, valid_sets=[lgb_train, lgb_valid], verbose_eval=50) test_pred = model.predict(test_df) oof = model.predict(test_x) rmse = np.sqrt(mean_squared_error(test_y, oof)) print(f"RMSE : {rmse}") print(rmse) lgb.plot_importance(model, importance_type="gain", max_num_features=40, figsize=(12, 12)) # max_num_features=20, sub = pd.read_csv("submission.csv").iloc[:, 1:] sub["units_sold_month"] = test_pred.round(3) sub.to_csv("baseline.csv", index=False)
def make_predictions_gkf(train_df, test_df, feature_cols, target, param, NFOLDS=2):
    """Train LightGBM with GroupKFold and return fold-averaged test predictions.

    Folds are grouped by the ``DT_M`` column of ``train_df``. Each fold's
    model predicts its validation rows (collected as out-of-fold predictions)
    and the test rows (averaged over folds). Per-fold and overall log-loss and
    AUC are printed, and a feature-importance plot is drawn for every fold.

    Args:
        train_df: training frame containing ``feature_cols``, ``isFraud`` and
            ``DT_M``.
        test_df: test frame containing ``feature_cols`` and ``TransactionID``;
            an ``isFraud`` column is written into it in place.
        feature_cols: feature column names.
        target: currently unused; the label column is hard-coded to
            ``'isFraud'``.
        param: LightGBM parameter dict passed to ``lgb.train``.
        NFOLDS: number of GroupKFold splits.

    Returns:
        ``test_df[['TransactionID', 'isFraud']]``.

    Relies on the module-level ``cate_cols`` list of categorical features.
    """
    gkf = GroupKFold(n_splits=NFOLDS)
    split_groups = train_df['DT_M']

    # Sized from the frames themselves rather than from module-level counters;
    # the metric calls and column assignment below need these lengths anyway.
    test_pred_prob = np.zeros(len(test_df))
    oof_pred_prob = np.zeros(len(train_df))

    train_values = train_df[feature_cols]
    test_values = test_df[feature_cols]
    labels = train_df['isFraud']

    for i, (train_idx, valid_idx) in enumerate(
            gkf.split(train_values, labels, groups=split_groups)):
        print(i, 'fold...')
        start_time = time.time()

        # split() yields positional indices, so the labels must be taken with
        # .iloc too; plain ``labels[idx]`` is a label lookup and picks wrong
        # rows (or raises) when the frame's index is not 0..n-1.
        train_x, train_y = train_values.iloc[train_idx], labels.iloc[train_idx]
        valid_x, valid_y = train_values.iloc[valid_idx], labels.iloc[valid_idx]

        # Construct the dataset
        train_data = lgb.Dataset(train_x, label=train_y,
                                 categorical_feature=cate_cols, free_raw_data=True)
        valid_data = lgb.Dataset(valid_x, label=valid_y,
                                 categorical_feature=cate_cols,
                                 reference=train_data, free_raw_data=True)

        # Training
        bst = lgb.train(param, train_data,
                        valid_sets=[train_data, valid_data], verbose_eval=200)

        # Prediction
        valid_pred_prob = bst.predict(valid_x, num_iteration=bst.best_iteration)
        oof_pred_prob[valid_idx] = valid_pred_prob
        print('val logloss: ', log_loss(valid_y, valid_pred_prob))
        print('val auc: ', roc_auc_score(valid_y, valid_pred_prob))

        # Each fold contributes an equal share of the final test prediction.
        test_pred_prob += bst.predict(
            test_values, num_iteration=bst.best_iteration) / gkf.n_splits
        print('runtime: {}\n'.format(time.time() - start_time))

        # Plotting
        lgb.plot_importance(bst, max_num_features=30)

    print('oof logloss: ', log_loss(labels, oof_pred_prob))
    print('oof auc: ', roc_auc_score(labels, oof_pred_prob))

    test_df['isFraud'] = test_pred_prob
    return test_df[['TransactionID', 'isFraud']]
def plot_lgb_importance(lgbm):
    """Show the feature-importance chart for the model held in ``lgbm.model``."""
    booster = lgbm.model
    chart_options = {"height": 0.5, "figsize": (4, 8)}
    lgb.plot_importance(booster, **chart_options)
    plt.show()
'scale_pos_weight': 2, 'drop_rate': 0.02 } cv_results = lgbm.cv(train_set=lgbm_train, params=lgbm_params, nfold=5, num_boost_round=600, early_stopping_rounds=50, verbose_eval=50, metrics=['auc']) optimum_boost_rounds = np.argmax(cv_results['auc-mean']) print('Optimum boost rounds = {}'.format(optimum_boost_rounds)) print('Best CV result = {}'.format(np.max(cv_results['auc-mean']))) clf = lgbm.train(train_set=lgbm_train, params=lgbm_params, num_boost_round=optimum_boost_rounds) """ Predict on test set and create submission """ y_pred = clf.predict(fin_test) out_df = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'], 'TARGET': y_pred}) out_df.to_csv('submission_lgbm.csv', index=False) fig, (ax, ax1) = plt.subplots(1, 2, figsize=[11, 7]) lgbm.plot_importance(clf, ax=ax, max_num_features=20, importance_type='split') lgbm.plot_importance(clf, ax=ax1, max_num_features=20, importance_type='gain') ax.set_title('Importance by splits') ax1.set_title('Importance by gain') plt.tight_layout() plt.savefig('feature_importance.png')
# NOTE(review): original indentation lost; the statements down to the
# per-fold prints presumably run inside the fold loop — confirm.
y_pred[y_pred < 0] = 0  # clamp negative predictions to zero before scoring
y_true = _valid_df["likes"].values
rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))
rmsles.append(rmsle)
mlflow.log_metric(f"rmsle_{fold}", rmsle)
print(f"------------------------ fold {fold} -----------------------")
print(f"------------------- rmsle {rmsle} -----------------------")
print()
print("")
# Average of the scores collected in rmsles.
print(
    f"------------------- average rmsle {np.mean(rmsles)} -----------------------"
)
mlflow.log_metric(f"rmsle_avg", np.mean(rmsles))
if "lgbm" in models:
    lgb.plot_importance(lgb_model, figsize=(16, 16))
    plt.show()
mlflow.end_run()

# %%
# raw.train, raw.test = target_encoding(raw.train, raw.test)
# Fit on raw.train with the "likes_log" column as target.
cat_train_dataset = Pool(raw.train[features], raw.train["likes_log"], cat_features=cat_features)
lgb_train_dataset = lgb.Dataset(raw.train[features], raw.train["likes_log"])
cat_model = CatBoostRegressor(**Config.cat_params, iterations=2000)
cat_model.fit(
    cat_train_dataset,
    verbose_eval=100,
    eval_set=[cat_train_dataset],
)
# Train on the raw + TF-IDF feature matrix with early stopping on the
# validation set.
dataset_train = lgb.Dataset(X_train_tfidf, y_train)
dataset_valid = lgb.Dataset(X_valid_tfidf, y_valid)
booster = lgb.train(
    params,
    dataset_train,
    # 93 "feat_*" names followed by 93 "tfidf_*" names.
    feature_name=([f"feat_{i}" for i in range(1, 94)] + [f"tfidf_{i}" for i in range(1, 94)]),
    num_boost_round=500,
    valid_sets=dataset_valid,
    early_stopping_rounds=20,
)
best_iteration = booster.best_iteration
print(best_iteration)
lgb.plot_importance(
    booster,
    max_num_features=30,
    figsize=(12, 10),
    dpi=300,
)
df_test = pd.read_csv(
    "/kaggle/input/otto-group-product-classification-challenge/test.csv",
    dtype=dtypes).set_index("id")
# Refit the TF-IDF transform on the whole training frame and stack it next to
# the raw feature columns.
tfidf = TfidfTransformer()
tfidf_feature_train_all = tfidf.fit_transform(
    df_train[feature_columns]).toarray().astype("float32")
X_train_all_tfidf = np.hstack(
    (df_train[feature_columns].values, tfidf_feature_train_all))
dataset_train_all = lgb.Dataset(X_train_all_tfidf, df_train[target_column])
# (call continues past the end of this fragment)
booster = lgb.train(
    params,
    dataset_train_all,
def cboost_feature_importance(model):
    """Draw and show a tall bar chart of up to 50 feature importances.

    ``model`` is handed straight to ``lgb.plot_importance``.
    """
    _, axes = plt.subplots(figsize=(12, 18))
    plot_options = dict(max_num_features=50, height=0.8, ax=axes)
    lgb.plot_importance(model, **plot_options)
    axes.grid(False)  # hide the grid lines
    plt.title("LightGBM - Feature Importance", fontsize=15)
    plt.show()
early_stop_rounds = 10
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # 'l2' (letter L, mean squared error); the previous value '12' (digits)
    # is not a LightGBM metric name.
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

# Filled by lgb.train with the per-iteration metric values of each valid set.
results = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=boost_round,
                valid_sets=(lgb_eval, lgb_train),
                valid_names=('validate', 'train'),
                early_stopping_rounds=early_stop_rounds,
                evals_result=results)

y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Learning curves, split-count feature importance, then the first tree.
lgb.plot_metric(results)
plt.show()
lgb.plot_importance(gbm, importance_type='split')
plt.show()
lgb.plot_tree(gbm, tree_index=0)
plt.show()
# Train with early stopping on the validation set and report validation AUC.
bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid], early_stopping_rounds=30)
valid_pred = bst.predict(val_X)
valid_score = metrics.roc_auc_score(val_y, valid_pred)
print(f"Validation AUC score: {valid_score:.4f}")

import matplotlib.pyplot as plt
from lightgbm import plot_importance
from lightgbm import plot_split_value_histogram

# Feature-importance bar chart.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(bst, ax=ax)
# Histogram of the split values used for the 'Forecast' feature.
fig, ax = plt.subplots(figsize=(10, 8))
plot_split_value_histogram(bst, 'Forecast', ax=ax)
plt.show()
# Render the tree at index 3, annotating nodes with their split gain.
ax = lgb.plot_tree(bst, tree_index=3, figsize=(200, 200), show_info=['split_gain'])
""" -------------------------------------------------------------------------- -------------------------------------------------------------------------- -------------------------------------------------------------------------- """
# Fitting classifier to the Training set
# Create your classifier here
markersize=12, color='lightgreen', linewidth=2, label="Label") pyplot.plot('date', 'Sunspots_trended', data=data_test_dates, marker='', color='olive', linewidth=2, label="Forecast") pyplot.legend() pyplot.xlabel('date') pyplot.title("Full dataset + Forecast") lgb.plot_importance(models_dict[3]) # get feature importance from all 24 models feature_importance_df = pd.DataFrame() for index, model in models_dict.items(): iter = pd.DataFrame(data=model.feature_importance()).T iter.columns = model.feature_name() iter.index = [index] feature_importance_df = feature_importance_df.append(iter) # plot feature importance for each model (model 1 is for month 1, model 24 is for month 24 respectively) feature_importance_df.plot() pyplot.title("Model Feature importance") pyplot.xlabel('model (month i)') pyplot.ylabel('feature importance')
# NOTE(review): the first statements are the tail of a function whose ``def``
# is above this view (run_lgb, judging by the call below) — confirm.
    evals_result = {}
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20, evals_result=evals_result)
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

# Splitting the data for model training#
# The last 200000 rows are held out for validation.
dev_X = train_X.iloc[:-200000, :]
val_X = train_X.iloc[-200000:, :]
dev_y = train_y[:-200000]
val_y = train_y[-200000:]
print(dev_X.shape, val_X.shape, test_X.shape)

# Training the model #
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

# Making a submission file #
# Clip predictions into [0, 1] before writing them out.
pred_test[pred_test > 1] = 1
pred_test[pred_test < 0] = 0
sub_df = pd.DataFrame({"item_id": test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("baseline_lgb.csv", index=False)

# Feature-importance chart for the trained model.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize=15)
plt.show()
}

evals_result = {}  # to record eval results for plotting

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_train, lgb_test],
                # names f1 .. f28
                feature_name=['f' + str(i + 1) for i in range(28)],
                categorical_feature=[21],
                evals_result=evals_result,
                verbose_eval=10)

print('Plot metrics during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # one tree use categorical feature to split
# tree_index is 0-based, so index 83 is the 84th tree.
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
def DO(frm, to, fileno):
    """Build features, train LightGBM and predict one slice of the click data.

    Reads rows ``frm``..``to`` of ``../input/train.csv`` plus the test file,
    stacks them so features are computed over both, adds count / unique-count /
    cumulative-count features, a ``nextClick`` feature (cached on disk) and
    several variance/mean aggregates, then trains with ``lgb_modelfit_nocv``
    and predicts the test rows.

    Args:
        frm: first training row to read (rows ``1..frm-1`` are skipped).
        to: end of the training slice; ``to - frm`` rows are read.
        fileno: number used in the output name ``sub_it<fileno>.csv.gz``.

    Returns:
        DataFrame with ``click_id`` and predicted ``is_attributed``.

    Relies on module-level names: ``debug``, ``naddfeat``, ``val_size``,
    ``do_countuniq``, ``do_cumcount``, ``do_count`` and ``lgb_modelfit_nocv``.
    When ``debug`` is true only 100000 test rows are read and neither the
    nextClick cache nor the submission file is written.
    """
    dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }

    print('loading train data...', frm, to)
    train_df = pd.read_csv("../input/train.csv",
                           parse_dates=['click_time'],
                           skiprows=range(1, frm),
                           nrows=to - frm,
                           dtype=dtypes,
                           usecols=['ip', 'app', 'device', 'os', 'channel',
                                    'click_time', 'is_attributed'])

    print('loading test data...')
    # nrows=None (the pandas default) reads the whole file.
    test_df = pd.read_csv("../input/test.csv",
                          nrows=100000 if debug else None,
                          parse_dates=['click_time'],
                          dtype=dtypes,
                          usecols=['ip', 'app', 'device', 'os', 'channel',
                                   'click_time', 'click_id'])

    len_train = len(train_df)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # with default arguments the result is the same.
    train_df = pd.concat([train_df, test_df])
    del test_df
    gc.collect()

    print('Extracting new features...')
    train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
    train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
    gc.collect()

    # (helper, positional arguments after the frame); order is preserved and
    # memory is reclaimed after every step, exactly as before.
    feature_steps = [
        (do_countuniq, (['ip'], 'channel', 'X0')),
        (do_cumcount, (['ip', 'device', 'os'], 'app', 'X1')),
        (do_countuniq, (['ip', 'day'], 'hour', 'X2')),
        (do_countuniq, (['ip'], 'app', 'X3')),
        (do_countuniq, (['ip', 'app'], 'os', 'X4')),
        (do_countuniq, (['ip'], 'device', 'X5')),
        (do_countuniq, (['app'], 'channel', 'X6')),
        (do_cumcount, (['ip'], 'os', 'X7')),
        (do_countuniq, (['ip', 'device', 'os'], 'app', 'X8')),
        (do_count, (['ip', 'day', 'hour'], 'ip_tcount')),
        (do_count, (['ip', 'app'], 'ip_app_count')),
        (do_count, (['ip', 'app', 'os'], 'ip_app_os_count')),
    ]
    for step, step_args in feature_steps:
        train_df = step(train_df, *step_args, show_max=True)
        gc.collect()

    print('doing nextClick')
    predictors = []

    new_feature = 'nextClick'
    filename = 'nextClick_%d_%d.csv' % (frm, to)

    if os.path.exists(filename):
        print('loading from save file')
        QQ = pd.read_csv(filename).values
    else:
        # Hash each (ip, app, device, os) combination into one of D buckets.
        D = 2**26
        train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str)
                                + "_" + train_df['device'].astype(str)
                                + "_" + train_df['os'].astype(str)).apply(hash) % D
        # Last click time seen per bucket while walking backwards; the large
        # initial value marks "no later click".
        click_buffer = np.full(D, 3000000000, dtype=np.uint32)

        train_df['epochtime'] = train_df['click_time'].astype(np.int64) // 10**9
        next_clicks = []
        for category, t in zip(reversed(train_df['category'].values),
                               reversed(train_df['epochtime'].values)):
            next_clicks.append(click_buffer[category] - t)
            click_buffer[category] = t
        del (click_buffer)
        QQ = list(reversed(next_clicks))

        if not debug:
            print('saving')
            pd.DataFrame(QQ).to_csv(filename, index=False)

    train_df[new_feature] = QQ
    predictors.append(new_feature)

    train_df[new_feature + '_shift'] = pd.DataFrame(QQ).shift(+1).values
    predictors.append(new_feature + '_shift')

    del QQ
    gc.collect()

    # Adding features with var and mean hour (inspired from nuhsikander's script)
    print('grouping by : ip_day_chl_var_hour')
    gp = train_df[['ip', 'day', 'hour', 'channel']].groupby(
        by=['ip', 'day', 'channel'])[['hour']].var().reset_index().rename(
            index=str, columns={'hour': 'ip_tchan_count'})
    train_df = train_df.merge(gp, on=['ip', 'day', 'channel'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_os_var_hour')
    gp = train_df[['ip', 'app', 'os', 'hour']].groupby(
        by=['ip', 'app', 'os'])[['hour']].var().reset_index().rename(
            index=str, columns={'hour': 'ip_app_os_var'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_channel_var_day')
    gp = train_df[['ip', 'app', 'channel', 'day']].groupby(
        by=['ip', 'app', 'channel'])[['day']].var().reset_index().rename(
            index=str, columns={'day': 'ip_app_channel_var_day'})
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()

    print('grouping by : ip_app_chl_mean_hour')
    gp = train_df[['ip', 'app', 'channel', 'hour']].groupby(
        by=['ip', 'app', 'channel'])[['hour']].mean().reset_index().rename(
            index=str, columns={'hour': 'ip_app_channel_mean_hour'})
    print("merging...")
    train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
    del gp
    gc.collect()

    print("vars and data type: ")
    train_df.info()
    train_df['ip_tcount'] = train_df['ip_tcount'].astype('uint16')
    train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
    train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')

    target = 'is_attributed'
    predictors.extend([
        'app', 'device', 'os', 'channel', 'hour', 'day', 'ip_tcount',
        'ip_tchan_count', 'ip_app_count', 'ip_app_os_count', 'ip_app_os_var',
        'ip_app_channel_var_day', 'ip_app_channel_mean_hour'
    ])
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
    predictors.extend('X' + str(i) for i in range(naddfeat))
    print('predictors', predictors)

    # Undo the stacking: test rows come last, and the final val_size training
    # rows become the validation set.
    test_df = train_df[len_train:]
    val_df = train_df[(len_train - val_size):len_train]
    train_df = train_df[:(len_train - val_size)]

    print("train size: ", len(train_df))
    print("valid size: ", len(val_df))
    print("test size : ", len(test_df))

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    gc.collect()

    print("Training...")
    start_time = time.time()

    params = {
        'learning_rate': 0.20,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight': 200  # because training data is extremely unbalanced
    }
    (bst, best_iteration) = lgb_modelfit_nocv(params,
                                              train_df,
                                              val_df,
                                              predictors,
                                              target,
                                              objective='binary',
                                              metrics='auc',
                                              early_stopping_rounds=30,
                                              verbose_eval=True,
                                              num_boost_round=1000,
                                              categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    print('Plot feature importances...')
    lgb.plot_importance(bst, max_num_features=100)
    plt.show()

    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors], num_iteration=best_iteration)
    if not debug:
        print("writing...")
        sub.to_csv('sub_it%d.csv.gz' % (fileno), index=False, compression='gzip')
    print("done...")
    return sub
model = xgb.train(params, xgb.DMatrix(x1, y1), 200, watchlist, feval=xgb_score, maximize=False, verbose_eval=100, early_stopping_rounds=10) model.save_model('xgb_model_v2_{}_limit_{}.model'.format(i, model.best_ntree_limit)) xgb_pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit) #xgb_pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=ntree_limit[i]) xgb_valid = model.predict(xgb.DMatrix(x2)) print('xgb valid log loss = {}'.format(log_loss(y2,xgb_valid))) '''
# NOTE(review): the text above presumably ends a triple-quoted string opened
# before this view (XGBoost code disabled by quoting) — confirm. Original
# indentation is lost; the code below uses ``i`` and ``break``, so it
# presumably sits inside a loop.
# lgbm
#print('lgb training')
d_train = lgb.Dataset(x1, label=y1)
d_valid = lgb.Dataset(x2, label=y2)
watchlist = [d_train, d_valid]
#model = lgb.train(lgb_params, train_set=d_train, num_boost_round=240, valid_sets=watchlist, early_stopping_rounds=50, verbose_eval=100)
# Load a previously saved model instead of training.
model = lgb.Booster(model_file='lgb_model_v2_{}.model'.format(i))
ax = lgb.plot_importance(model)
plt.tight_layout()
plt.savefig('feature_importance_{}.png'.format(i))
# Stops after saving the plot; nothing below runs in this iteration.
break
lgb_pred = model.predict(test[cols])
model.save_model('lgb_model_v2_{}.model'.format(i))
lgb_valid = model.predict(x2)
print('lgb valid log loss = {}'.format(log_loss(y2, lgb_valid)))
# Accumulate predictions: the first pass assigns, later passes add.
if (i == 0):
    xgb_preds = xgb_pred
    lgb_preds = lgb_pred
    cat_preds = cat_pred
else:
    xgb_preds += xgb_pred
    lgb_preds += lgb_pred
# ROC AUC computed from column 1 of predict_proba.
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(x_test)[:, 1], average='macro')
print('ROC AUC:{0:.4f}'.format(lgbm_roc_score))
print('lgbm_clf.predict_proba(x_test)', lgbm_clf.predict_proba(x_test))
print('lgbm_clf.predict_proba(x_test)[:, 1]', lgbm_clf.predict_proba(x_test)[:, 1])
print('y_test', y_test)
from sklearn.model_selection import GridSearchCV
# The triple-quoted block below is a bare string: a grid search that is
# disabled by quoting it.
''' # 하이퍼 파라미터 테스트의 수행속도를 향상시키기 위해 n_estimators를 200 감소 from sklearn.model_selection import GridSearchCV lgbm_clf = LGBMClassifier(n_estimators=200) params = {'num_leaves':[32, 64], 'max_depth':[128, 160], 'min_child_samples':[60, 100], 'subsample':[0.8, 1]} # cv=3 gridcv = GridSearchCV(lgbm_clf, param_grid=params, cv=3) gridcv.fit(x_train, y_train, early_stopping_rounds=30, eval_metric="auc", eval_set=[(x_train, y_train), (x_test, y_test)]) print('GridSearchCV 최적 파라미터:', gridcv.best_params_) lgbm_roc_score = roc_auc_score(y_test, gridcv.predict_proba(x_test)[:, 1], average='macro') print('ROC AUC :{0:.4f}'.format(lgbm_roc_score))'''
from lightgbm import plot_importance
import matplotlib.pyplot as plt
# Bar chart of up to 20 feature importances.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
plot_importance(lgbm_clf, ax=ax, max_num_features=20, height=.4)
plt.show()
# In[103]: gbm = lgb.train(params, lgb_train, num_boost_round=61, valid_sets=lgb_eval, feature_name=feature_name, early_stopping_rounds=10) gbm.save_model('model.txt') # ### lightGBM 特征信息图 # In[104]: lgb.plot_importance(gbm, importance_type='gain', ignore_zero=False, figsize=(10, 6)) # ### lightGBM 模型加载,输入测试集进行预测 # In[105]: #bst = lgb.Booster(model_file='model.txt') y_predict = gbm.predict(x_test, num_iteration=gbm.best_iteration) # ### 分析训练效果(将预测的第五天风速y_predict与真实的第五天风速y_test对比) # In[214]: -sum(y_test * np.log(y_predict) +
def estimate(model, data):
    """Plot the model's feature importance twice: by gain, then by split.

    Each chart is titled with its importance type. ``data`` is accepted but
    not used.
    """
    for importance_kind in ('gain', 'split'):
        axes = plot_importance(model, importance_type=importance_kind)
        axes.set_title(importance_kind)
# User-level training score: undo the log1p scale, sum revenue per visitor,
# then re-apply log1p before computing the RMSE.
train_pred = pd.DataFrame({"fullVisitorId": train_idx})
train_pred["PredictedLogRevenue"] = np.expm1(oof_preds)
train_pred = train_pred.groupby(
    "fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
train_pred.columns = ["fullVisitorId", "PredictedLogRevenue"]
train_pred["PredictedLogRevenue"] = np.log1p(train_pred["PredictedLogRevenue"])
train_rmse = np.sqrt(
    mean_squared_error(train_target, train_pred['PredictedLogRevenue']))
print('User-level score:', str(round(train_rmse, 4)))
print(' ')
end = time.time()
print('training time:', str(round((end - start) / 60)), 'mins')

#Predict and write to file for submission
# Same per-visitor aggregation applied to the test predictions.
test_pred = pd.DataFrame({"fullVisitorId": test_idx})
test_pred["PredictedLogRevenue"] = np.expm1(sub_preds)
test_pred = test_pred.groupby(
    "fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
test_pred.columns = ["fullVisitorId", "PredictedLogRevenue"]
test_pred["PredictedLogRevenue"] = np.log1p(test_pred["PredictedLogRevenue"])
test_pred.to_csv("lgb_new_2.csv", index=False)

#Print importances
# Gain-based importance; ignore_zero=False keeps zero-importance features.
lgb.plot_importance(lgb_model, height=0.5, max_num_features=90, ignore_zero=False, figsize=(12, 9), importance_type='gain')
plt.tight_layout()
plt.show()
# Train with early stopping on the evaluation set.
gbm = lgb.train(params, lgb_train, num_boost_round=283, valid_sets=lgb_eval, early_stopping_rounds=50)
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
error(y_test, y_pred)
# online
# predict = gbm.predict(test_X, num_iteration=gbm.best_iteration)
# data1 = pd.DataFrame(predict)
# save
# save(data1, 'lgb')
# gbm_online = lgb.train(params,
#                        train_all,
#                        num_boost_round=280)
# # predict
# predict = gbm_online.predict(test_X, num_iteration=gbm_online.best_iteration)
# data1 = pd.DataFrame(predict)
# # save
# save(data1, 'lgb')
plt_encoding_error()
lgb.plot_importance(gbm)
plt.show()
'min_sum_hessian_in_leaf': 0.001,
'n_jobs': -1,
'num_threads': 8,
}
# NOTE(review): original indentation lost; the code down to pred_cv.append
# uses the fold counter ``k`` and presumably sits inside the K-fold loop.
print('................Start training {} fold..........................'.
      format(k + 1))
# train
gbm = lgb.train(params, lgb_train, num_boost_round=2000, valid_sets=lgb_eval, early_stopping_rounds=100, verbose_eval=100, feature_name=features)
lgb.plot_importance(gbm, max_num_features=20)
plt.show()
print('................Start predict .........................')
# Predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Evaluate
tmp_auc = roc_auc_score(y_test, y_pred)
auc_cv.append(tmp_auc)
print("valid auc:", tmp_auc)
# test
pred = gbm.predict(test_data, num_iteration=gbm.best_iteration)
pred_cv.append(pred)
# Mean score of the K-fold cross-validation
print('the cv information:')
print(auc_cv)
color="r", size=6)
# NOTE(review): original indentation lost; this plt.show() may belong to an
# enclosing ``with code():`` block above this view.
plt.show()

with code():
    def plot_feature_importance(model):
        # Horizontal bar chart of model.feature_importances_, one bar per
        # column of the module-level frame X.
        n_features = X.shape[1]
        plt.barh(range(n_features), model.feature_importances_, align='center')
        plt.yticks(np.arange(n_features), X.columns)
        plt.xlabel('Feature importance')
        plt.ylabel('Feature')
    # Note: X is assumed to be the training data before train_test_split

with code():
    # Random forest
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(
        n_estimators=100, random_state=20181101)  # n_estimators is the number of decision trees to build
    forest.fit(X_train, y_train)
    # Display
    plot_feature_importance(forest)

with code():
    import lightgbm as lgb
    # Visualisation (model is a model trained with lightgbm)
    lgb.plot_importance(model, figsize=(12, 8))
    plt.show()
valid_sets=[lgtrain, lgvalid], valid_names=['train', 'valid'], #early_stopping_rounds=500, verbose_eval=20) else: lgb_clf = lgb.train(lgbm_params, lgtrain, num_boost_round=15000, valid_sets=[lgtrain, lgvalid], valid_names=['train', 'valid'], early_stopping_rounds=60, verbose_eval=20) # Feature Importance Plot f, ax = plt.subplots(figsize=[7, 10]) lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax) plt.title("Light GBM Feature Importance") plt.savefig(path + '../plots/feature_import_1006A.png') print("Model Evaluation Stage") print('RMSE:', np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))) lgpred = lgb_clf.predict(testing) lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex) lgsub['deal_probability'].clip(0.0, 1.0, inplace=True) # Between 0 and 1 #lgsub.to_csv(path + "../sub/lgsub_0206A.csv.gz",index=True,header=True, compression = 'gzip') print("Model Runtime: %0.2f Minutes" % ((time.time() - modelstart) / 60)) ''' [20] train's rmse: 0.240546 valid's rmse: 0.23821 [40] train's rmse: 0.230003 valid's rmse: 0.22792 [60] train's rmse: 0.22423 valid's rmse: 0.222459
# NOTE(review): 'sub_feature' looks like an alias of 'feature_fraction' in
# LightGBM, and the two are given different values here — confirm which one
# is intended.
lgb_params['sub_feature'] = 0.80
lgb_params['max_depth'] = 7
lgb_params['feature_fraction'] = 0.7
lgb_params['bagging_fraction'] = 0.7
lgb_params['bagging_freq'] = 10
lgb_params['learning_rate'] = 0.01

lgb_train = lgb.Dataset(X_train, y_train)
lightgbm = lgb.train(lgb_params, lgb_train, feature_name=list(feat_names))

# In[ ]:

# plot_importance opens its own figure when no ``ax`` is given, so the size is
# passed to it directly; a separate plt.figure(...) call only left an empty
# extra figure behind.
lgb.plot_importance(lightgbm, max_num_features=30, figsize=(12, 6))
plt.title("Feature importances by LightGBM")
plt.show()

# In[ ]:

# Render the tree at index 83, annotating nodes with their split gain.
ax = lgb.plot_tree(lightgbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

# # Acknowledgement:
# 1. Pedro Schoen
#
#
valid_sets=[dt, dv],
valid_names=["training", "valid"],
num_boost_round=MAX_ROUNDS,
verbose_eval=False,
early_stopping_rounds=EARLY_STOPPING_ROUNDS,
)
# Best validation score and the parameters held by the trained model.
score = model.best_score["valid"][METRIC]
best_params = model.params
print("Best params:", best_params)
print(f" {METRIC} = {score}")
print(" Params: ")
for key, value in best_params.items():
    print(f" {key}: {value}")

import lightgbm as lgb

# Retrain with those parameters, this time logging every REPORT_ROUNDS rounds.
model = lgb.train(
    best_params,
    dt,
    valid_sets=[dt, dv],
    valid_names=["training", "valid"],
    num_boost_round=MAX_ROUNDS,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose_eval=REPORT_ROUNDS,
)
lgb.plot_importance(model, importance_type="gain", grid=False)
plt.show()
reg_lambda=0, silent=False)
print(ctime() + '...training final model...')
# The evaluation set passed here is the same data the model is fitted on.
bst.fit(X=X_eval, y=Y_eval, eval_set=[(X_eval, Y_eval)], eval_names=['eval'], eval_metric=['rmse'], early_stopping_rounds=5000, feature_name=feature_names, categorical_feature=categorical_features)
# Persist the fitted model under fittedModelDir.
joblib.dump(bst, join(fittedModelDir, 'model5_nonCV_{}{}'.format(date, '.pkl')))
#===train the final model on all data===

#===make prediction for test set==
# NOTE(review): the model is reloaded from a hard-coded absolute path rather
# than from fittedModelDir — confirm both point at the same file.
fittedMdlPath='/home/arash/MEGA/MEGAsync/Machine Learning/'+\
              'Kaggle/Recruit/Fitted models/model5_nonCV_{}.pkl'.\
              format(date)
bst = joblib.load(fittedMdlPath)
gbm.plot_importance(bst)
y_test = bst.predict(X_test)
# Submission id is "<air_store_id>_<visit_date>"; expm1 is applied to the
# predictions to give visitor counts.
df=pd.DataFrame({'id':df_test.air_store_id+'_'+\
                 df_test.visit_date.dt.strftime('%Y-%m-%d'),
                 'visitors':np.expm1(y_test)})
df.sort_values(by='id', inplace=True)
df.to_csv(join(submissionsDir, 'model5_{}.csv'.format(date)), index=False)
#===make prediction for test set===
def DO(frm, to, fileno):
    """Train a LightGBM model on a slice of train.csv and write a submission.

    Loads ``to - frm`` rows of ``train.csv`` (skipping the rows before
    ``frm``) and the supplementary test set, derives click features through
    the module-level ``do_*`` helpers, fits a model with
    ``lgb_modelfit_nocv`` (days 7 and 8 for training, day 9 for validation),
    saves a feature-importance plot to ``<fileno>_importance.png`` and writes
    the predictions to ``sub_it<fileno>.csv``.

    Relies on module-level names: ``inputpath``, ``debug``, ``predictors``,
    the ``do_*`` feature helpers and ``lgb_modelfit_nocv``.

    Args:
        frm: Number of leading data rows of train.csv to skip (minus one).
        to: ``to - frm`` rows are read starting from ``frm``.
        fileno: Integer tag used in the output file names.

    Returns:
        The submission DataFrame with ``click_id`` and ``is_attributed``.
    """
    import os  # local import: only needed to build the input paths

    dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint8',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }
    base_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

    print('loading train data...', frm, to)
    train_df = pd.read_csv(os.path.join(inputpath, "train.csv"),
                           parse_dates=['click_time'],
                           skiprows=range(1, frm),
                           nrows=to - frm,
                           dtype=dtypes,
                           usecols=base_cols + ['is_attributed'])

    print('loading test data...')
    # Debug runs only read the first 100000 test rows.
    test_df = pd.read_csv(os.path.join(inputpath, "test_supplement.csv"),
                          nrows=100000 if debug else None,
                          parse_dates=['click_time'],
                          dtype=dtypes,
                          usecols=base_cols + ['click_id'])

    local_tz = pytz.timezone('Asia/Shanghai')

    def utc_to_local(utc_dt):
        """Interpret a naive timestamp as UTC and convert it to ``local_tz``."""
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        return local_tz.normalize(local_dt)

    train_df['click_time'] = train_df['click_time'].apply(utc_to_local)
    test_df['click_time'] = test_df['click_time'].apply(utc_to_local)
    train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
    test_df['hour'] = pd.to_datetime(test_df.click_time).dt.hour.astype('uint8')
    train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
    test_df['day'] = pd.to_datetime(test_df.click_time).dt.day.astype('uint8')
    gc.collect()

    def process_data(data):
        """Add the click-derived feature columns to ``data`` and return it."""
        data = do_next_Click(data, agg_suffix='nextClick', agg_type='float32')
        gc.collect()
        data = do_prev_Click(data, agg_suffix='prevClick', agg_type='float32')
        gc.collect()  ## Removed temporarily due RAM sortage.
        data = do_countuniq(data, ['day', 'ip'], 'channel')
        gc.collect()
        print('data columns', data.columns)
        data = do_countuniq(data, ['day', 'ip', 'device', 'os'], 'app')
        gc.collect()
        # NOTE(review): 'day' is listed twice in this group (and in the
        # do_count call further down); kept as-is because the helpers may
        # derive the feature name from the group -- confirm it is intended.
        data = do_countuniq(data, ['day', 'ip', 'day'], 'hour')
        gc.collect()
        data = do_countuniq(data, ['day', 'ip'], 'app')
        gc.collect()
        data = do_countuniq(data, ['day', 'ip', 'app'], 'os')
        gc.collect()
        data = do_countuniq(data, ['day', 'ip'], 'device')
        gc.collect()
        data = do_countuniq(data, ['day', 'app'], 'channel')
        gc.collect()
        data = do_cumcount(data, ['day', 'ip'], 'os')
        gc.collect()
        data = do_cumcount(data, ['day', 'ip', 'device', 'os'], 'app')
        gc.collect()
        data = do_count(data, ['day', 'ip', 'day', 'hour'])
        gc.collect()
        data = do_count(data, ['day', 'ip', 'app'])
        gc.collect()
        data = do_count(data, ['day', 'ip', 'app', 'os'])
        gc.collect()
        data = do_var(data, ['day', 'ip', 'app', 'os'], 'hour')
        gc.collect()
        # 'day' is deliberately kept: the train/validation split below
        # filters on it, so dropping it here would break that split.
        return data

    train_df = process_data(train_df)
    print('train_df cols after process', train_df.columns)
    test_df = process_data(test_df)
    print('test_df cols after process', test_df.columns)

    print('\n\nBefore appending predictors...\n\n', sorted(predictors))
    target = 'is_attributed'
    categorical = ['app', 'device', 'os', 'channel', 'hour']
    # Make sure the raw categorical columns are part of the feature list.
    for feature in categorical:
        if feature not in predictors:
            predictors.append(feature)
    print('\n\nAfter appending predictors...\n\n', sorted(predictors))

    if not debug:
        # Reorder/select the supplementary test rows through the mapping file.
        relation = pd.read_csv(os.path.join(inputpath, 'mapping.csv'),
                               usecols=['old_click_id'])
        test_df = test_df.iloc[relation.old_click_id]
        del relation

    # Day 9 is held out for validation; days 7 and 8 are used for training.
    val_df = train_df[train_df.day == 9]
    train_df = train_df[(train_df.day == 7) | (train_df.day == 8)]

    print("\ntrain size: ", len(train_df))
    print("\nvalid size: ", len(val_df))
    print("\ntest size : ", len(test_df))

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')
    gc.collect()

    print("Training...")
    start_time = time.time()
    print('predictors', predictors)
    print('train cols', train_df.columns)
    print('test cols', test_df.columns)

    params = {
        'learning_rate': 0.02,
        # 'is_unbalance': 'true',  # replaced with scale_pos_weight argument
        'num_leaves': 31,  # 2^max_depth - 1
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 128,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight': 200,  # because training data is extremely unbalanced
    }
    (bst, best_iteration) = lgb_modelfit_nocv(params,
                                              train_df,
                                              val_df,
                                              predictors,
                                              target,
                                              objective='xentropy',
                                              metrics='auc',
                                              early_stopping_rounds=30,
                                              verbose_eval=True,
                                              num_boost_round=2000,
                                              categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    print('Plot feature importances...')
    # plot_importance() creates its own figure (sized by `figsize`), which
    # becomes the current figure that savefig() writes out.
    lgb.plot_importance(bst, max_num_features=100, figsize=(20, 15))
    plt.savefig(str(fileno) + '_importance.png')

    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors], num_iteration=best_iteration)
    # The submitted click_id is the row index of the selected test rows.
    sub.click_id = sub.index
    sub.to_csv('sub_it%d.csv' % (fileno), index=False, float_format='%.9f')
    print("done...")
    return sub
["booking_date", "checkin_date", "resort_id"],
]:
    # NOTE(review): the `for ... in [` header of this loop is outside the
    # fragment; the loop variable is presumably `col` -- confirm.
    # Normalise a single column name to a one-element list.
    if not isinstance(col, list):
        col = [col]
    col_name = "_".join(col)
    # Count reservations per key over train and test together, then attach
    # the count to the training frame as "<cols>_count".
    all_df = pd.concat([
        train_df[["reservation_id"] + col],
        test_df[["reservation_id"] + col]
    ])
    gdf = all_df.groupby(col)["reservation_id"].count().reset_index()
    gdf.columns = col + [col_name + "_count"]
    train_df = pd.merge(train_df, gdf, on=col, how="left")

from catboost import CatBoostRegressor

# Fit a CatBoost regressor, evaluating on the held-out split while training.
model = CatBoostRegressor(iterations=10000, depth=3, learning_rate=0.1, loss_function='RMSE')
model.fit(X_tr1, y_tr1, eval_set=(X_tst1, y_tst1), plot=True)
# Report R^2 and RMSE on the held-out split.
print(r2_score(y_tst1, model.predict(X_tst1)))
print(np.sqrt(mean_squared_error(y_tst1, model.predict(X_tst1))))

import matplotlib.pyplot as plt
import lightgbm as lgb

# Draw the LightGBM importances on a tall axis created up front.
# NOTE(review): `lgbm` is not defined in this fragment; it is presumably a
# trained LightGBM model from earlier code -- confirm.
fig, ax = plt.subplots(figsize=(12, 30))
lgb.plot_importance(lgbm, max_num_features=100, height=0.8, ax=ax)
ax.grid(False)
plt.title("LightGBM - Feature Importance", fontsize=15)
plt.show()
def feature_importance(self):
    """Plot the model's top ten features and return its importance values.

    Draws ``self.model`` with ``lgb.plot_importance`` (limited to ten
    features), shows the figure, and returns whatever
    ``self.model.feature_importance()`` yields.
    """
    booster = self.model
    lgb.plot_importance(booster, max_num_features=10)
    plt.show()
    importances = booster.feature_importance()
    return importances
def DO(train_frm, train_to, test_nrows, groups, rategroup, fileno,
       initial_cols=['ip', 'app', 'device', 'os', 'channel', 'hour']):
    """Build features, train a LightGBM model and write a gzipped submission.

    Steps: load a slice of ``train.csv`` and the first ``test_nrows`` rows of
    ``test.csv``; add confidence-weighted attribution rates for every column
    group in ``rategroup``; stack train and test and add the grouped features
    described by ``groups``; train with ``lgb_modelfit_nocv`` on days 7-8,
    validating on day 9; save ``<fileno>_importance.png`` and
    ``sub_<fileno>.csv.gz``.

    Computed features are cached as CSV files in the working directory and
    reused when the file already exists.

    Relies on module-level names: ``inputpath``, ``debug`` and
    ``lgb_modelfit_nocv``.

    Args:
        train_frm: Rows of train.csv before this offset are skipped.
        train_to: ``train_to - train_frm`` rows of train.csv are read.
        test_nrows: Number of test.csv rows to read (``None`` reads all).
        groups: Iterable of items whose first element is a list of columns
            (group keys followed by the aggregated column) and whose last
            element is the feature type: 0 count, 1/7 mean, 2/6 var, 3 skew,
            4 nunique, 5 cumcount, ``'NC'`` seconds until the next click
            within the group.
        rategroup: Iterable of column lists for the confidence-rate features.
        fileno: Tag used in the output file names (any type ``str`` accepts).
        initial_cols: Raw columns appended to the predictors. The default
            list is only read, never mutated.

    Returns:
        The submission DataFrame with ``click_id`` and ``is_attributed``.

    Raises:
        ValueError: If a group carries an unknown feature type and no cached
            file exists for it.
    """
    predictors = []
    dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }
    base_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

    print('loading train data...', train_frm, train_to)
    train_df = pd.read_csv(inputpath + "train.csv", parse_dates=['click_time'],
                           skiprows=range(1, train_frm),
                           nrows=train_to - train_frm,
                           dtype=dtypes,
                           usecols=base_cols + ['is_attributed'])

    print('loading test data...')
    test_df = pd.read_csv(inputpath + "test.csv", nrows=test_nrows,
                          parse_dates=['click_time'], dtype=dtypes,
                          usecols=base_cols + ['click_id'])

    print('Extracting new features...')
    local_tz = pytz.timezone('Asia/Shanghai')  # use your local timezone name here

    def utc_to_local(utc_dt):
        """Interpret a naive timestamp as UTC and convert it to ``local_tz``."""
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        return local_tz.normalize(local_dt)

    train_df['click_time'] = train_df['click_time'].apply(utc_to_local)
    test_df['click_time'] = test_df['click_time'].apply(utc_to_local)
    train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
    test_df['hour'] = pd.to_datetime(test_df.click_time).dt.hour.astype('uint8')
    train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
    test_df['day'] = pd.to_datetime(test_df.click_time).dt.day.astype('uint8')

    # ---- confidence-weighted attribution rate per column group ----
    log_group = np.log(100000)  # 1000 views -> 60% confidence, 100 views -> 40% confidence

    def rate_calculation(x):
        """Calculate the attributed rate. Scale by confidence"""
        rate = x.sum() / float(x.count())
        conf = np.min([1, np.log(x.count()) / log_group])
        return rate * conf

    for cols in rategroup:
        new_feature = '_'.join(cols) + '_confRate'
        predictors.append(new_feature)
        filename = new_feature + '.csv'
        if os.path.exists(filename):
            gp = pd.read_csv(filename)
        else:
            group_object = train_df.groupby(cols)
            group_sizes = group_object.size()
            print(">> Calculating confidence-weighted rate for: {}.\n Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
                cols, new_feature,
                group_sizes.max(),
                np.round(group_sizes.mean(), 2),
                np.round(group_sizes.median(), 2),
                group_sizes.min()
            ))
            gp = group_object['is_attributed'].apply(rate_calculation).reset_index().rename(
                index=str,
                columns={'is_attributed': new_feature})[cols + [new_feature]]
            gp.to_csv(filename, index=False)
        # Rates are learned on the training rows only and joined onto both sets.
        train_df = train_df.merge(gp, on=cols, how='left')
        test_df = test_df.merge(gp, on=cols, how='left')
        del gp
        print(train_df.shape)
        gc.collect()

    print('shape of train: ', train_df.shape)
    print('shape of test: ', test_df.shape)

    # Stack train and test so the grouped features below see both;
    # the first len_train rows are the labelled training rows.
    len_train = len(train_df)
    train_df = pd.concat([train_df, test_df])
    del test_df
    gc.collect()

    # ---- grouped features ----
    # Reductions of the last column of a group, merged back on the key
    # columns. Every type stores its result under `colname`, which is the
    # name recorded in `predictors`.
    merged_aggs = {0: 'count', 1: 'mean', 2: 'var', 3: 'skew',
                   4: 'nunique', 6: 'var', 7: 'mean'}
    click_time_in_seconds = False

    for item in groups:
        selcols = item[0]
        QQ = item[-1]
        print('selcols', selcols, 'QQ', QQ)
        colname = '_'.join(selcols) + '_' + str(QQ)
        predictors.append(colname)
        filename = colname + '.csv'
        keys = selcols[0:-1]

        if os.path.exists(filename):
            if QQ == 5:
                # The cumcount cache is a bare column of values, one per row.
                gp = pd.read_csv(filename, header=None)
                train_df[colname] = gp.iloc[:, 0].values
            else:
                gp = pd.read_csv(filename)
                train_df = train_df.merge(gp, on=keys, how='left')
        else:
            if QQ == 'NC':
                # Convert click_time to epoch seconds exactly once; repeating
                # the integer division would zero out the column.
                if not click_time_in_seconds:
                    train_df['click_time'] = (train_df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
                    click_time_in_seconds = True
                # Seconds until the next click of the same group.
                gp = (train_df.groupby(selcols).click_time.shift(-1) - train_df.click_time).astype(np.float32)
                train_df[colname] = gp
            elif QQ == 5:
                gp = train_df[selcols].groupby(by=keys)[selcols[-1]].cumcount()
                train_df[colname] = gp.values
            elif QQ in merged_aggs:
                grouped = train_df[selcols].groupby(by=keys)[selcols[-1]]
                gp = getattr(grouped, merged_aggs[QQ])().reset_index().rename(
                    index=str, columns={selcols[-1]: colname})
                train_df = train_df.merge(gp, on=keys, how='left')
            else:
                raise ValueError('unknown feature type: {!r}'.format(QQ))

            # 'NC' features are never cached.
            if QQ != 'NC':
                cache_name = 'test' + filename if debug else filename
                if QQ == 5:
                    # No header, so the file matches the header=None read above.
                    gp.to_csv(cache_name, index=False, header=False)
                else:
                    gp.to_csv(cache_name, index=False)
        del gp
        gc.collect()

    print("variables and data type: ")
    train_df.info()

    target = 'is_attributed'
    # FIXME (translated from the original note): there is a known,
    # unspecified problem here that still needs to be solved.
    predictors.extend(initial_cols)
    categorical = ['app', 'device', 'os', 'channel', 'hour']
    print('predictors', predictors)

    # Split the stacked frame back; only labelled rows may be used for
    # training/validation (the appended test rows carry no target).
    test_df = train_df[len_train:]
    labeled_df = train_df[:len_train]
    val_df = labeled_df[labeled_df.day == 9]
    train_df = labeled_df[(labeled_df.day == 7) | (labeled_df.day == 8)]
    del labeled_df

    print("train size: ", len(train_df))
    print("valid size: ", len(val_df))
    print("test size : ", len(test_df))

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')
    gc.collect()

    print("Training...")
    start_time = time.time()

    params = {
        'learning_rate': 0.08,  # 0.2,
        # 'is_unbalance': 'true',  # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 100,  # Number of bucketed bin for feature values
        'subsample': 0.7,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'scale_pos_weight': 200,  # because training data is extremely unbalanced
    }
    (bst, best_iteration) = lgb_modelfit_nocv(params,
                                              train_df,
                                              val_df,
                                              predictors,
                                              target,
                                              objective='binary',
                                              metrics='auc',
                                              early_stopping_rounds=30,
                                              verbose_eval=True,
                                              num_boost_round=1000,
                                              categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    del train_df
    del val_df
    gc.collect()

    print('Plot feature importances...')
    # plot_importance() creates its own figure (sized by `figsize`), which
    # becomes the current figure that savefig() writes out.
    lgb.plot_importance(bst, max_num_features=100, figsize=(20, 15))
    # str() keeps this working for integer tags, as in the CSV name below.
    plt.savefig(str(fileno) + '_importance.png')

    print("Predicting...")
    sub['is_attributed'] = bst.predict(test_df[predictors], num_iteration=best_iteration)
    print("writing...")
    sub.to_csv('sub_{}.csv.gz'.format(str(fileno)), index=False, compression='gzip')
    print("done...")
    return sub
def Submission(valid_hour=11):
    """Train the final LightGBM classifier and write the test predictions.

    A first model is fitted on the training part with early stopping on the
    validation part to find the number of boosting rounds; a second model is
    then fitted on all training data with that many rounds and used to score
    the test set. The scores are written to ``results.txt`` in ``wd[0]`` and
    the feature importances of the second model are plotted.

    Relies on the module-level ``Merge`` helper to build the data frames.

    Args:
        valid_hour: If positive, rows with ``hour >= valid_hour`` form the
            validation set; otherwise a random 80/20 split is used.

    Returns:
        The fitted second-stage ``LGBMClassifier``.
    """
    wd = ['/Users/ewenwang/Documents/practice_data/conversion_rate/',
          '/Users/ewenwang/Documents/GitHub/Kaggle/conversion_rate/round2/']
    test_file = ['round2_ijcai_18_test_b_20180510.txt']

    train = Merge(which_data='train')

    if valid_hour > 0:
        # Time-based hold-out: the later hours validate the earlier ones.
        filter_ = (train.hour >= valid_hour)
        train_ = train[~filter_]
        valid_ = train[filter_]
    else:
        train_, valid_ = train_test_split(train, test_size=0.2, random_state=0)

    target = 'is_trade'
    features = [
        'user_gender_id', 'user_age_level', 'user_star_level',
        'item_brand_id', 'item_city_id', 'item_price_level',
        'item_sales_level', 'item_collected_level', 'shop_star_level',
        'shop_review_positive_rate', 'shop_score_service',
        'shop_score_description', 'item_id_ratio',
        'item_city_id_user_age_level_prob', 'item_collected_level_ratio',
        'item_price_level_ratio', 'context_page_id_user_gender_id_prob',
        'context_page_id_user_star_level_prob',
        'shop_score_service_bin_ratio', 'shop_star_level_ratio',
        'shop_review_positive_rate_bin_ratio', 'shop_id_ratio',
        'shop_review_num_level_ratio', 'user_pagerank', 'hour_ratio',
        'wt_item_id', 'wt_item_category_list', 'match_prop_ct_shop_id_wt',
        'item_city_id_shop_id_wt', 'item_city_id_context_page_id_wt',
        'list_wt_item_property_list', 'wt_item_city_id',
        'match_cat_ct_shop_id_wt', 'context_page_id_item_category_list_wt',
        'item_brand_id_match_prop_ct_wt',
        'context_page_id_shop_star_level_wt'
    ]

    X = train_[features]
    y = train_[target].values
    X_tes = valid_[features]
    y_tes = valid_[target].values

    print('Training LGBM model...')
    t0 = time.time()
    # `max_depth` is the LightGBM name for the tree-depth limit; a `depth`
    # keyword is not recognised and would leave the depth unlimited.
    lgb_1 = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        num_leaves=16,
        max_depth=4,
        learning_rate=0.01,
        seed=2018,
        colsample_bytree=0.6,
        subsample=0.8,
        n_estimators=20000,
        silent=True)
    lgb_model_1 = lgb_1.fit(X, y, eval_set=[(X_tes, y_tes)],
                            early_stopping_rounds=200, verbose=False,
                            callbacks=[lgb.print_evaluation(100)])
    print('\ttime spend: ', time.time() - t0)

    best_iter = lgb_model_1.best_iteration_
    best_score = lgb_model_1.best_score_
    print('best_iter: ', best_iter, '\nbest_score: ', best_score)

    # Second stage: refit on all training rows for `best_iter` rounds.
    X_2 = train[features]
    y_2 = train[target].values

    print('Training LGBM model...')
    t0 = time.time()
    lgb_2 = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        num_leaves=32,
        max_depth=4,
        learning_rate=0.01,
        seed=2018,
        colsample_bytree=0.6,
        subsample=0.9,
        n_estimators=best_iter,
        silent=True)
    lgb_model_2 = lgb_2.fit(X_2, y_2)
    print('\ttime spend: ', time.time() - t0)

    # Release the training frame before the test frame is built.
    del train
    test = Merge(which_data='test')

    print('predicting...')
    t0 = time.time()
    pred = lgb_model_2.predict_proba(test[features])[:, 1]
    print('\ttime spend: ', time.time() - t0)

    test['predicted_score'] = pred
    result = test[['instance_id', 'predicted_score']]
    # Align with the instance ids of the raw test file; ids without a
    # prediction get a score of 0.
    result = pd.DataFrame(
        pd.read_csv(wd[0] + test_file[0], sep=' ')['instance_id']
    ).merge(result, on='instance_id', how='left').fillna(0)

    print('\nsaving...')
    t0 = time.time()
    result.to_csv(wd[0] + 'results.txt', sep=' ', index=False)
    print('\ttime spend: ', time.time() - t0)

    print('plotting...')
    lgb.plot_importance(lgb_model_2, figsize=(12, 25))
    plt.show()

    return lgb_model_2
early_stopping_rounds=20, # early_stoppingの判定基準 verbose_eval=10) y_pred = model.predict(x_valid, num_iteration=model.best_iteration) auc = roc_auc_score(y_valid, y_pred) print(auc) models.append(model) aucs.append(auc) # 平均AUCを計算する print(mean(aucs)) # 特徴量重要度の表示 for model in models: lgb.plot_importance(model, importance_type='gain', max_num_features=15) """ 予測精度: 0.9316555393578665 """ ''' テストデータの予測 ''' # テストデータの説明変数を指定 X_test = test.drop(['y', 'id'], axis=1) # テストデータにおける予測 preds = [] for model in models:
print('Starting training...')
# Train for 100 rounds, recording the metrics of both data sets in
# `evals_result`; features are named f1..fN and column 21 is categorical.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_test],
    feature_name=[f'f{col}' for col in range(1, X_train.shape[-1] + 1)],
    categorical_feature=[21],
    evals_result=evals_result,
    verbose_eval=10,
)

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()

print('Plotting 54th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(
    gbm,
    tree_index=53,
    figsize=(15, 15),
    show_info=['split_gain'],
)
plt.show()

print('Plotting 54th tree with graphviz...')
# Same tree (zero-based index 53), rendered through graphviz and opened.
graph = lgb.create_tree_digraph(gbm, tree_index=53, name='Tree54')
graph.render(view=True)
# LGBM Dataset Formatting
lgtrain = lgb.Dataset(X, y, feature_name=tfvocab, categorical_feature=categorical)
# The raw matrix is no longer needed once the Dataset has been built.
del X
gc.collect()

# Go Go Go
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    num_boost_round=1500,
    verbose_eval=100
)

# Feature Importance Plot
f, ax = plt.subplots(figsize=[7, 10])
lgb.plot_importance(lgb_clf, max_num_features=50, ax=ax)
plt.title("Light GBM Feature Importance")
plt.savefig('feature_import.png')

print("Model Evaluation Stage")
lgpred = lgb_clf.predict(testing)

# Mixing lightgbm with ridge. I haven't really tested if this improves the score or not
# blend = 0.95*lgpred + 0.05*ridge_oof_test[:,0]
lgsub = pd.DataFrame(lgpred, columns=["deal_probability"], index=testdex)
# Assign the clipped column back: an in-place clip on a column selection is a
# chained operation that is not guaranteed to modify `lgsub` itself.
lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0)  # Between 0 and 1
lgsub.to_csv("lgsub.csv", index=True, header=True)
# print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
print("Notebook Runtime: %0.2f Minutes" % ((time.time() - notebookstart) / 60))