def xgboost_cv():
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-02-05'
    sub_end_date = '2016-03-05'
    sub_test_start_date = '2016-03-05'
    sub_test_end_date = '2016-03-10'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data, label, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {'max_depth': 10, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 4000
    param['nthread'] = 4
    param['eval_metric'] = "auc"
    plst = list(param.items())  # materialize a list so the extra metric can be appended (dict_items does not support +=)
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_trainning_date, sub_label = make_train_set(sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)
    test = xgb.DMatrix(sub_trainning_date)
    y = bst.predict(test)  # was commented out, but y is used below
    pred = sub_user_index.copy()
    y_true = sub_user_index.copy()
    pred['label'] = y
    y_true['label'] = sub_label  # labels of the validation window, not the training labels
    report(pred, y_true)
def xgboost_cv():
    # one month of data for training
    train_start_date = '2016-02-15'
    train_end_date = '2016-03-15'
    # labels: predict the following five days from the month above
    test_start_date = '2016-03-16'
    test_end_date = '2016-03-20'
    # validate the model's effectiveness on another window
    # input (feature window)
    sub_start_date = '2016-03-21'
    sub_end_date = '2016-04-02'
    # output (label window)
    sub_test_start_date = '2016-04-03'
    sub_test_end_date = '2016-04-08'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    # simple split into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(training_data, label, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train.values, label=y_train)
    dtest = xgb.DMatrix(X_test.values, label=y_test)
    param = {
        'max_depth': 10,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 166
    param['nthread'] = 5
    param['eval_metric'] = "auc"
    plst = list(param.items())
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_trainning_data, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    y_mean = stats.describe(y).mean
    # plt.hist(y)
    # plt.show()
    pred = sub_user_index.copy()
    y_true = sub_user_index.copy()
    pred['label'] = y
    y_true['label'] = sub_label  # labels of the validation window, not the training labels
    pred = pred[pred['label'] >= 0.04]
    y_true = y_true[y_true['label'] == 1]
    report(pred, y_true)
def xgboost_cv():
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-02-05'
    sub_end_date = '2016-03-05'
    sub_test_start_date = '2016-03-05'
    sub_test_end_date = '2016-03-10'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    # simple split into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(training_data, label, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'max_depth': 3,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 250
    param['nthread'] = 5
    param['eval_metric'] = "auc"
    plst = list(param.items())  # dict_items cannot be extended, so materialize a list first
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_trainning_date, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)
    test = xgb.DMatrix(sub_trainning_date)
    y = bst.predict(test)
    pred = sub_user_index.copy()
    y_true = sub_user_index.copy()
    pred['label'] = y
    y_true['label'] = sub_label  # true labels of the validation window
    report(pred, y_true)
def xgboost_cv():
    train_start_date = '2016-03-06'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-06'
    test_end_date = '2016-04-11'
    sub_start_date = '2016-03-11'
    sub_end_date = '2016-04-11'
    sub_test_start_date = '2016-04-11'
    sub_test_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data, label, test_size=0.2, random_state=0)
    del user_index, training_data, label
    dtrain = xgb.DMatrix(X_train.values, label=y_train)  # todo: missing=-999.0
    dtest = xgb.DMatrix(X_test.values, label=y_test)
    del X_train, X_test, y_train, y_test
    # param = {'max_depth': 10, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3, 'min_child_weight': 5,
             'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 1,
             'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 4000
    param['nthread'] = 6
    param['eval_metric'] = "auc"
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    del dtrain
    print('saving model...')
    # flag = 'basic'
    # bst.save_model('./cache/' + flag + '_model')
    sub_user_index, sub_training_data, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date, test=True)
    test = xgb.DMatrix(sub_training_data.values)
    y = bst.predict(test)
    pred = sub_user_index.copy()
    pred['label'] = y
    # keep the highest-probability sku per user
    pred = pred.sort_values(by=['label'], ascending=False).groupby(['user_id'], as_index=False).first()
    # sweep the decision threshold and record the score at each value
    limits = np.linspace(0, 0.5, 100)
    scores = np.zeros((1, 100))
    count = 0
    for i in limits:
        print('--------------------------------------------------------------------')
        print('limit=%s' % str(i))
        p = pred[pred['label'] > i]
        scores[:, count] = report(p, sub_label)
        count += 1
    print('max score : %s\nmax limit : %s' % (np.max(scores), limits[np.argmax(scores)]))
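# Note: the CV functions above all depend on a report(pred, y_true) scorer that is defined
# elsewhere in the repo and is not shown in this section. The sketch below is only an
# illustration of what such a scorer could look like, assuming both arguments carry
# 'user_id'/'sku_id' columns and assuming a JData-style weighted F1 (0.4 * user-level F1
# favouring recall + 0.6 * pair-level F1); the repo's actual formula and weights may differ.
def report_sketch(pred, y_true):
    """pred / y_true: DataFrames of predicted / actual buyers with 'user_id' and 'sku_id' columns."""
    pred_pairs = set(zip(pred['user_id'], pred['sku_id']))
    true_pairs = set(zip(y_true['user_id'], y_true['sku_id']))
    pred_users = set(pred['user_id'])
    true_users = set(y_true['user_id'])
    if not pred_pairs or not true_pairs:
        return 0.0
    # user-level precision / recall
    hit_u = len(pred_users & true_users)
    p_u, r_u = hit_u / len(pred_users), hit_u / len(true_users)
    # pair-level precision / recall
    hit_p = len(pred_pairs & true_pairs)
    p_p, r_p = hit_p / len(pred_pairs), hit_p / len(true_pairs)
    f11 = 6 * r_u * p_u / (5 * r_u + p_u) if (r_u + p_u) > 0 else 0.0
    f12 = 5 * r_p * p_p / (2 * r_p + 3 * p_p) if (r_p + p_p) > 0 else 0.0
    return 0.4 * f11 + 0.6 * f12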
def logistic_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    y_train = list(map(int, y_train))
    # print(np.any(np.isnan(X_train)))
    # print(np.all(np.isfinite(X_train)))
    clf = lg()  # use the classifier with all default parameters
    clf.fit(X_train, y_train)
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    y_hat = clf.predict(sub_trainning_data.values)
    sub_user_index['label'] = y_hat
    pred = sub_user_index[sub_user_index['label'] == 1]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('../sub/submissionLOG508.csv', index=False, index_label=False)
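# Side note / assumption: the hard predict() above uses a fixed 0.5 cutoff, which tends to keep
# very few positives on a class-imbalanced set like this one. A common alternative, sketched here,
# is to rank candidates by predict_proba and tune the cutoff offline the way the xgboost variants
# below do. The threshold value is only a placeholder, not a tuned value from the repo.
def logistic_predict_with_threshold(clf, features, user_index, threshold=0.05):
    """Score user-sku candidates with a fitted LogisticRegression and keep one sku per user."""
    proba = clf.predict_proba(features)[:, 1]      # probability of the positive class
    scored = user_index.copy()
    scored['label'] = proba
    picked = scored[scored['label'] >= threshold][['user_id', 'sku_id']]
    return picked.groupby('user_id').first().reset_index()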
def xgboost_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)  # X_train is already a numpy array, so no .values here
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3, 'min_child_weight': 5,
             'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 1,
             'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 283
    param['nthread'] = 4
    # param['eval_metric'] = "auc"
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
def xgboost_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    # submission features: use this window to predict the labels of the following five days
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3, 'min_child_weight': 5,
             'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 1,
             'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 283
    param['nthread'] = 4
    # param['eval_metric'] = "auc"
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    # predict a label (purchase probability) for every user-sku pair
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    # keep user-sku pairs whose predicted probability is at least 0.03
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
def xgboost_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.values, label.values, test_size=0.2, random_state=0)
    # select some features
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # don't use these
    param = {
        'learning_rate': 0.05,
        'n_estimators': 1000,
        'max_depth': 5,
        'min_child_weight': 1,
        'gamma': 0,
        'subsample': 1,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 20
    param['nthread'] = 5
    # param['eval_metric'] = "auc"
    plst = list(param.items())
    plst += [('eval_metric', 'auc')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    # make test data
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)  # predict on this sub data; the DMatrix is built from the array
    y_hat = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y_hat
    pred = sub_user_index[sub_user_index['label'] >= 0.05]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('../sub/submission424.csv', index=False, index_label=False)
def gdbt_train():
    from sklearn.ensemble import RandomForestClassifier
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    # np.savetxt('train.txt', X_train, fmt='%.2f', delimiter=' ')
    # clf = GradientBoostingClassifier(n_estimators=220)
    clf = RandomForestClassifier(n_estimators=220, criterion="gini", max_depth=10,
                                 min_samples_split=2, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0., max_features="auto",
                                 max_leaf_nodes=None, min_impurity_split=1e-7,
                                 bootstrap=True, oob_score=False, n_jobs=1,
                                 random_state=None, verbose=0, warm_start=False,
                                 class_weight=None)
    print(type(X_train))
    clf.fit(X_train, y_train)
    pre_y_test = clf.predict_proba(X_test)   # class probabilities, written to the result file below
    pre_y_label = clf.predict(X_test)        # hard labels; precision_recall_fscore_support needs labels, not probabilities
    print(pre_y_test)
    print("GBDT Metrics : {0}".format(precision_recall_fscore_support(y_test, pre_y_label)))
    print('saving results...')
    with open('result.txt', 'w') as f_result:
        for i in range(0, len(pre_y_test)):
            if i == 0:
                print(str(pre_y_test[i][0]))
            if i == len(pre_y_test) - 1:
                print(str(pre_y_test[i][0]))
            f_result.write(str(pre_y_test[i][0]) + '\n')
def gbdt_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    training_data = training_data.fillna(0)
    print(training_data.info())
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    # X_train = X_train.astype(int)
    y_train = list(map(int, y_train))
    param = {
        'n_estimators': 1200,
        'max_depth': 3,
        'subsample': 1.0,
        'learning_rate': 0.01,
        'min_samples_leaf': 1,
        'random_state': 3,
        'max_features': 0.8
    }
    clf = gbdt(**param)  # unpack the dict as keyword arguments; gbdt(param) would pass it as the first positional argument
    clf.fit(X_train, y_train)
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_trainning_data = sub_trainning_data.fillna(0)
    y_hat = clf.predict(sub_trainning_data.values)
    sub_user_index['label'] = y_hat
    pred = sub_user_index[sub_user_index['label'] == 1]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('../sub/submissionGBDT508.csv', index=False, index_label=False)
def xgboost_train():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'max_depth': 3,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 283
    param['nthread'] = 4
    # param['eval_metric'] = "auc"
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    bst.save_model('./cache/bstmodel.bin')
    return bst
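# Usage sketch: the booster saved by xgboost_train() can be reloaded later without retraining.
# The path below is the one the function writes to; everything else is standard xgboost API.
import xgboost as xgb

def load_trained_booster(path='./cache/bstmodel.bin'):
    """Reload a previously saved booster for prediction."""
    bst = xgb.Booster()
    bst.load_model(path)
    return bst

# bst = load_trained_booster()
# y = bst.predict(xgb.DMatrix(features.values))  # `features` is a hypothetical DataFrame of test features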
def xgboost_cv2():
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-06'
    test_end_date = '2016-04-11'
    sub_start_date = '2016-03-10'
    sub_end_date = '2016-04-11'
    sub_test_start_date = '2016-04-11'
    sub_test_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data, label, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train.values, label=y_train)
    dtest = xgb.DMatrix(X_test.values, label=y_test)
    param = {
        'learning_rate': 0.05,
        'n_estimators': 1000,
        'max_depth': 10,
        'min_child_weight': 1,
        'gamma': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'auc'
    }
    num_round = 300
    param['nthread'] = 5
    # param['eval_metric'] = "auc"
    # plst = param.items()
    # plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evallist)
    sub_user_index, sub_trainning_date, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)  # use this data to see the offline score
    test = xgb.DMatrix(sub_trainning_date.values)
    y = bst.predict(test)
    pred = sub_user_index.copy()
    y_true = get_labels_8(sub_test_start_date, sub_test_end_date)  # during the test date, real label for cate 8
    # y_true = sub_user_index.copy()
    pred['label'] = y  # add the new column which is the predicted label for the test date
    # print(pred[(pred.label >= 0.12)].shape)
    # print("y_true:")
    # print(y_true)
    # pred = pred[(pred.label >= 0.35)]
    # print(len(pred))
    # print(pred)
    ans = []
    for i in range(0, 30):
        pred = sub_user_index.copy()
        pred['label'] = y
        pred = pred[pred.label >= i / 100]
        # print(pred)
        rep = report(pred, y_true)
        print('%s : score:%s' % (i / 100, rep))
        ans.append([i / 100, rep])
    print('ans:%s' % ans)
    threshold = sorted(ans, key=getKey, reverse=True)[0][0]
    bestscore = sorted(ans, key=getKey, reverse=True)[0][1]
    print('best threshold:%s' % threshold)
    print('best score:%s' % bestscore)
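# Assumption / sketch: getKey is defined elsewhere in the repo and is not shown here. Given how it
# is used above (sorting the [threshold, score] pairs in `ans` so the best score comes first), it
# is presumably a one-line sort key along these lines:
def getKey(item):
    """Sort key for the [threshold, score] pairs collected in `ans`: sort by the score."""
    return item[1]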
def gbdt_cv():
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-06'
    test_end_date = '2016-04-11'
    sub_start_date = '2016-03-10'
    sub_end_date = '2016-04-11'
    sub_test_start_date = '2016-04-11'
    sub_test_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data, label, test_size=0.2, random_state=0)
    param = {
        'n_estimators': 1200,
        'max_depth': 3,
        'subsample': 1.0,
        'learning_rate': 0.01,
        'min_samples_leaf': 1,
        'random_state': 3,
        'max_features': 0.8
    }
    clf = gbdt(**param)  # unpack the dict as keyword arguments; gbdt(param) would pass it as the first positional argument
    clf.fit(X_train, y_train)
    sub_user_index, sub_trainning_date, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)  # use this data to see the offline score
    test = sub_trainning_date.values
    # use probabilities of the positive class so the threshold sweep below is meaningful
    y = clf.predict_proba(test)[:, 1]
    pred = sub_user_index.copy()
    y_true = get_labels_8(sub_test_start_date, sub_test_end_date)  # during the test date, real label for cate 8
    # y_true = sub_user_index.copy()
    pred['label'] = y  # add the new column which is the predicted label for the test date
    ans = []
    for i in range(0, 30):
        pred = sub_user_index.copy()
        pred['label'] = y
        pred = pred[pred.label >= i / 100]
        # print(pred)
        rep = report(pred, y_true)
        print('%s : score:%s' % (i / 100, rep))
        ans.append([i / 100, rep])
    print('ans:%s' % ans)
    threshold = sorted(ans, key=getKey, reverse=True)[0][0]
    bestscore = sorted(ans, key=getKey, reverse=True)[0][1]
    print('best threshold:%s' % threshold)
    print('best score:%s' % bestscore)
def xgboost_report_submission():
    train_start_date = '2016-03-08'
    train_end_date = '2016-04-09'
    result_start_date = '2016-04-09'
    result_end_date = '2016-04-14'
    valid_start_date = '2016-03-01'
    valid_end_date = '2016-04-02'
    valid_result_start_date = '2016-04-02'
    valid_result_end_date = '2016-04-07'
    test_start_date = '2016-03-15'
    test_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, result_start_date, result_end_date)
    x_train, x_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3, 'min_child_weight': 5,
             'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 1,
             'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 283
    param['nthread'] = 4
    # param['eval_metric'] = "auc"
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    # Report with validation set
    valid_user_index, valid_trainning_date = make_test_set(valid_start_date, valid_end_date)
    valid_trainning_date = xgb.DMatrix(valid_trainning_date.values)
    pred_y = bst.predict(valid_trainning_date)
    valid_pred = valid_user_index.copy()
    valid_pred['label'] = pred_y
    valid_pred = valid_pred[valid_pred['label'] >= 0.014]
    valid_pred = valid_pred.sort_values('label', ascending=False).groupby('user_id').first().reset_index()
    valid_true = get_true(valid_result_start_date, valid_result_end_date)
    report(valid_pred, valid_true)
    # tighten the threshold step by step and report the score at each cutoff
    for threshold in (0.016, 0.018, 0.02, 0.022, 0.024, 0.026, 0.028, 0.03):
        valid_pred = valid_pred[valid_pred['label'] >= threshold]
        print(threshold)
        report(valid_pred, valid_true)
def xgboost_make_submission():
    train_start_date = '2016-03-31'
    train_end_date = '2016-04-10'
    test_start_date = '2016-04-10'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-04-06'
    sub_end_date = '2016-04-16'
    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    list_of_train = list(training_data.columns)
    print(len(list_of_train))
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'max_depth': 3,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    # num_round = 345
    num_round = 511
    # param['nthread'] = 8
    param['eval_metric'] = "auc"
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10)
    # map xgboost feature ids (f0, f1, ...) back to column names and save the importances
    importance = bst.get_fscore()
    print(importance)
    feat_importances = []
    for ft, score in importance.items():
        ft = ft.split('f')[1]
        feat_importances.append({'Feature': ft, 'Importance': score})
    feat_importances = pd.DataFrame(feat_importances)
    feat_importances = feat_importances.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    new_columns = []
    for index in list(feat_importances['Feature']):
        index = int(index)
        new_columns.append(list_of_train[index])
    name_of = pd.DataFrame({'new': new_columns})
    feat_importances = pd.concat([feat_importances, name_of], axis=1)
    feat_importances.to_csv('./sub/fecure.csv')
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y_label = bst.predict(xgb.DMatrix(X_test))
    fpr, tpr, threasholds = roc_curve(y_test, y_label, pos_label=1)  # labels are 0/1, so the positive class is 1
    print(fpr, tpr, threasholds)
    # print(auc(fpr, tpr))
    # plt.plot(threasholds, fpr)
    # plt.show()
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    # print(np.median(y))
    # print(sub_user_index)
    # pred = sub_user_index.groupby('user_id').max().reset_index()
    # print(pred)
    pred = sub_user_index[sub_user_index['label'] >= 0.04]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').max().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
    # additionally filter out the user ids listed in user_id_unique.npy
    buy_cate_8 = np.load('./unique_8/user_id_unique.npy')
    pred = pred[~pred['user_id'].isin(buy_cate_8)]
    pred.to_csv('./sub/submission_unique.csv', index=False, index_label=False)
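# Side note / assumption: get_fscore() above counts how many times each feature is used to split
# ("weight"). A gain-based view is often more informative; xgboost's Booster.get_score supports it,
# and mapping the f-ids back to readable column names works the same way as in the function above.
import pandas as pd

def feature_importance_by_gain(bst, column_names):
    """Return a DataFrame of gain-based importances with readable feature names."""
    gain = bst.get_score(importance_type='gain')  # e.g. {'f0': 12.3, 'f7': 4.1, ...}
    rows = [{'Feature': column_names[int(f[1:])], 'Importance': g} for f, g in gain.items()]
    return pd.DataFrame(rows).sort_values('Importance', ascending=False).reset_index(drop=True)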