def get_train_test_features0():
    """Load the cached 'f0' feature files and join three extra feature tables.

    Sets the active feature name to 'f0', reads the training and test feature
    CSVs named by ``config``, then left-joins three additional CSV tables onto
    both frames using the ``sid`` column.

    Returns:
        tuple: ``(trn, y, tst, sub)`` where ``y`` holds the values of the
        training ``click_mode`` column, ``sub`` is a copy of the test ``sid``
        column, and ``trn`` / ``tst`` are the feature frames with ``sid`` and
        ``click_mode`` removed.

    Raises:
        FileNotFoundError: if either cached feature file is missing. This
            function has no way to regenerate them, so it fails with an
            explicit message rather than with an unbound local or an
            un-unpackable ``None`` further down.
    """
    config.set_feature_name('f0')

    missing = [path
               for path in (config.train_feature_file, config.test_feature_file)
               if not os.path.exists(path)]
    if missing:
        raise FileNotFoundError(
            'feature files for f0 not found: {}'.format(', '.join(missing)))

    logger.info('loading the training and test features from files.')
    trn = pd.read_csv(config.train_feature_file)
    tst = pd.read_csv(config.test_feature_file)

    # Capture the labels and the submission ids before the columns are dropped.
    y = trn['click_mode'].values
    sub = tst[['sid']].copy()

    # NOTE(review): these are absolute, machine-specific paths; they are kept
    # verbatim so the set of joined features does not change.
    extra_feature_files = (
        '/home/ubuntu/projects/kddcup2019track1/build/feature/od_coord_feature.csv',
        '/home/ubuntu/projects/kddcup2019track1/input/data_set_phase1/var_dist_time.csv',
        '/home/ubuntu/projects/kddcup2019track1/input/data_set_phase1/var_dist_min.csv',
    )
    for path in extra_feature_files:
        feat = pd.read_csv(path)
        # Left joins keep every row of the base frames.
        trn = trn.merge(feat, how='left', on='sid')
        tst = tst.merge(feat, how='left', on='sid')

    trn.drop(['sid', 'click_mode'], axis=1, inplace=True)
    tst.drop(['sid', 'click_mode'], axis=1, inplace=True)
    return trn, y, tst, sub
def get_train_test_features4():
    """Load the cached 'f4' training and test feature files.

    Sets the active feature name to 'f4' and reads the training and test
    feature CSVs named by ``config``.

    Returns:
        tuple: ``(trn, y, tst, sub)`` where ``y`` holds the values of the
        training ``click_mode`` column, ``sub`` is a copy of the test ``sid``
        column, and ``trn`` / ``tst`` are the feature frames with ``sid``,
        ``pid`` and ``click_mode`` removed.

    Raises:
        FileNotFoundError: if either cached feature file is missing. This
            function has no way to regenerate them, so it fails with an
            explicit message rather than with an unbound local or an
            un-unpackable ``None`` further down.
    """
    config.set_feature_name('f4')

    missing = [path
               for path in (config.train_feature_file, config.test_feature_file)
               if not os.path.exists(path)]
    if missing:
        raise FileNotFoundError(
            'feature files for f4 not found: {}'.format(', '.join(missing)))

    logger.info('loading the training and test features from files.')
    trn = pd.read_csv(config.train_feature_file)
    tst = pd.read_csv(config.test_feature_file)

    # Capture the labels and the submission ids before the columns are dropped.
    y = trn['click_mode'].values
    sub = tst[['sid']].copy()

    trn.drop(['sid', 'pid', 'click_mode'], axis=1, inplace=True)
    tst.drop(['sid', 'pid', 'click_mode'], axis=1, inplace=True)
    return trn, y, tst, sub
def get_train_test_features3():
    """Return the 'f3' features, building and caching them when not on disk.

    Sets the active feature name to 'f3'. When both feature CSVs named by
    ``config`` exist they are read back; otherwise the features are generated
    from the merged raw data and written to those two paths.

    Returns:
        tuple: ``(trn, y, tst, sub)`` where ``y`` holds the values of the
        training ``click_mode`` column, ``sub`` is a copy of the test ``sid``
        column, and ``trn`` / ``tst`` are the feature frames with ``sid``,
        ``pid`` and ``click_mode`` removed.
    """
    config.set_feature_name('f3')

    have_cache = (os.path.exists(config.train_feature_file)
                  and os.path.exists(config.test_feature_file))
    if not have_cache:
        raw = merge_raw_data()
        logger.info('generating feature f3.')
        trn, tst = generate_f3(raw)

        logger.info('saving the training and test f3 features.')
        trn.to_csv(config.train_feature_file, index=False)
        tst.to_csv(config.test_feature_file, index=False)
    else:
        logger.info('loading the training and test features from files.')
        trn = pd.read_csv(config.train_feature_file)
        tst = pd.read_csv(config.test_feature_file)

    # Labels and submission ids must be taken before the columns are dropped.
    y = trn['click_mode'].values
    sub = tst[['sid']].copy()

    for frame in (trn, tst):
        frame.drop(['sid', 'pid', 'click_mode'], axis=1, inplace=True)

    return trn, y, tst, sub
prob_trn_tst = 0 for seed in [0, 17, 23, 29]: params['seed'] = 2019 + seed print(params) clf = lgb.train(params, lgb_trn, valid_sets=[lgb_trn], num_boost_round=best_iteration, verbose_eval=50, feval=eval_f) prob_trn_tst += clf.predict(tst) prob_trn_tst /= 4.0 np.savetxt(config.predict_trn_tst_bag_file, prob_trn_tst, delimiter=',') trn_tst = np.argmax(prob_trn_tst, axis=1) return trn_tst if __name__ == '__main__': trn, y, tst, sub = get_train_test_features2a() config.set_algo_name('lgb3') config.set_feature_name('f2a') p_tst = train_lgb(trn, y, tst) submit_result(sub, p_tst)
prob_trn_tst = 0 for seed in [0, 17, 23, 29]: params['seed'] = 2019 + seed print(params) clf = lgb.train(params, lgb_trn, valid_sets=[lgb_trn], num_boost_round=best_iteration, verbose_eval=50, feval=eval_f) prob_trn_tst += clf.predict(tst) prob_trn_tst /= 4.0 np.savetxt(config.predict_trn_tst_bag_file, prob_trn_tst, delimiter=',') trn_tst = np.argmax(prob_trn_tst, axis=1) return trn_tst if __name__ == '__main__': trn, y, tst, sub = get_train_test_features0() config.set_algo_name('lgb5') config.set_feature_name('f0') p_tst = train_lgb(trn, y, tst) submit_result(sub, p_tst)
X_trn, y_trn, X_val, y_val = trn.iloc[:-63388,:], y[:-63388], trn.iloc[-63388:,], y[-63388:] eval_set = [(X_trn, y_trn), (X_val, y_val)] clf.fit(X_trn, y_trn, eval_set=eval_set, eval_metric=f1_weighted, categorical_feature=cat_cols, verbose=10, early_stopping_rounds=100) #clf.fit(X_trn, y_trn, eval_set=eval_set, eval_metric=f1_adj_weighted, categorical_feature=cat_cols, verbose=10, early_stopping_rounds=100) feature_importances = list(clf.feature_importances_) feature_names = trn.columns.values.tolist() imp = pd.DataFrame({'feature_importances': feature_importances, 'feature_names':feature_names}) imp = imp.sort_values('feature_importances', ascending=False).drop_duplicates() print("[+] All feature importances", list(imp.values)) pred = clf.predict(X_val, num_iteration=clf.best_iteration_) print('Val F1: %f', f1_score(y_val, pred, average='weighted')) print(classification_report(y_val, pred)) if __name__ == '__main__': trn, y, tst, sub = get_train_test_features2() #df = pd.read_csv(config.train_feature_file) #df = df[~pd.isnull(df['click_mode'])] #trn = df.drop(['sid','req_time', 'click_mode'], axis=1) #y = df['click_mode'].values config.set_algo_name('lgb4') config.set_feature_name('f2') # f2 = train_lgb(trn, y)