Esempio n. 1
0
def xgboost_cv():
    """Train an xgboost binary classifier and report on a second window.

    A training set is built with ``make_train_set`` and split 80/20; the
    20% part is only used as an evaluation set logged during boosting.
    The trained booster then scores a second ("sub") data set produced by
    ``make_train_set`` and the predictions plus that set's labels are handed
    to ``report``.
    """
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-02-05'
    sub_end_date = '2016-03-05'
    sub_test_start_date = '2016-03-05'
    sub_test_end_date = '2016-03-10'

    user_index, training_data, label = make_train_set(
        train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(
        training_data, label, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    param = {'max_depth': 10, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 4000
    param['nthread'] = 4
    param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+="; build a
    # real list so a second eval_metric entry can be appended.
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    sub_user_index, sub_trainning_date, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)
    test = xgb.DMatrix(sub_trainning_date)
    # The prediction was commented out originally, leaving ``y`` undefined.
    y = bst.predict(test)

    pred = sub_user_index.copy()
    pred['label'] = y
    y_true = sub_user_index.copy()
    # NOTE(review): the original assigned the *training* ``label`` here, which
    # belongs to ``user_index`` rather than ``sub_user_index``; ``sub_label``
    # is the label returned together with ``sub_user_index``.
    y_true['label'] = sub_label
    report(pred, y_true)
Esempio n. 2
0
def xgboost_cv():
    """Train an xgboost binary classifier and report on a later window.

    The model is fitted on an 80/20 split of the set returned by
    ``make_train_set`` (the 20% part is only logged as an evaluation set),
    then applied to a second set. Predictions with a score of at least 0.04
    and the rows of that second set labelled 1 are passed to ``report``.
    """
    # One month of data is used for training.
    train_start_date = '2016-02-15'
    train_end_date = '2016-03-15'
    # The following five days are predicted from the month above.
    test_start_date = '2016-03-16'
    test_end_date = '2016-03-20'

    # A separate window is used to validate the model.
    # Input window.
    sub_start_date = '2016-03-21'
    sub_end_date = '2016-04-02'
    # Output window.
    sub_test_start_date = '2016-04-03'
    sub_test_end_date = '2016-04-08'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    # Simple split into a training part and a validation part.
    X_train, X_test, y_train, y_test = train_test_split(training_data,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=0)
    dtrain = xgb.DMatrix(X_train.values, label=y_train)
    dtest = xgb.DMatrix(X_test.values, label=y_test)
    param = {
        'max_depth': 10,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 166
    param['nthread'] = 5
    param['eval_metric'] = "auc"
    # Materialise the items: on Python 3 dict.items() is a view, not a list.
    plst = list(param.items())
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    sub_user_index, sub_trainning_data, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)

    pred = sub_user_index.copy()
    pred['label'] = y
    y_true = sub_user_index.copy()
    # NOTE(review): the original assigned the *training* ``label`` here, which
    # belongs to ``user_index`` rather than ``sub_user_index``; ``sub_label``
    # is the label returned together with ``sub_user_index``.
    y_true['label'] = sub_label

    # Keep predictions scoring at least 0.04 and the truly positive rows.
    pred = pred[pred['label'] >= 0.04]
    y_true = y_true[y_true['label'] == 1]

    report(pred, y_true)
Esempio n. 3
0
def xgboost_cv():
    """Train an xgboost binary classifier and report on a second window.

    Fits a booster on an 80/20 split of the set returned by
    ``make_train_set`` (the 20% part is only logged as an evaluation set),
    predicts on a second set and passes predictions and that set's labels
    to ``report``.
    """
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-02-05'
    sub_end_date = '2016-03-05'
    sub_test_start_date = '2016-03-05'
    sub_test_end_date = '2016-03-10'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    # Simple split into a training part and a validation part.
    X_train, X_test, y_train, y_test = train_test_split(training_data,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # NOTE(review): xgboost documents 'learning_rate' as an alias of 'eta'
    # (both are set, with different values) and 'n_estimators' belongs to the
    # sklearn wrapper; with xgb.train the round count is ``num_round``.
    # Confirm which values are intended.
    param = {
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'max_depth': 3,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 250
    param['nthread'] = 5
    param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+="; build a
    # real list so a second eval_metric entry can be appended.
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    sub_user_index, sub_trainning_date, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date)
    test = xgb.DMatrix(sub_trainning_date)
    y = bst.predict(test)

    pred = sub_user_index.copy()
    pred['label'] = y
    y_true = sub_user_index.copy()
    # NOTE(review): the original assigned the *training* ``label`` here, which
    # belongs to ``user_index`` rather than ``sub_user_index``; ``sub_label``
    # is the label returned together with ``sub_user_index``.
    y_true['label'] = sub_label
    report(pred, y_true)
Esempio n. 4
0
def xgboost_cv():
    """Train an xgboost classifier and sweep the decision threshold.

    After training, the booster scores a second set obtained from
    ``make_train_set(..., test=True)``. For every user only the
    highest-scoring row is kept, then 100 thresholds between 0 and 0.5 are
    tried; the value returned by ``report`` for each threshold is stored and
    the best score and its threshold are printed.
    """
    train_start_date = '2016-03-06'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-06'
    test_end_date = '2016-04-11'

    sub_start_date = '2016-03-11'
    sub_end_date = '2016-04-11'
    sub_test_start_date = '2016-04-11'
    sub_test_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(
        train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(
        training_data, label, test_size=0.2, random_state=0)
    # Drop references that are no longer needed before the next allocation.
    del user_index, training_data, label
    dtrain = xgb.DMatrix(X_train.values, label=y_train)  # todo: missing=-999.0
    dtest = xgb.DMatrix(X_test.values, label=y_test)
    del X_train, X_test, y_train, y_test

    # NOTE(review): xgboost documents 'learning_rate' as an alias of 'eta'
    # (both are set, with different values) and 'n_estimators' belongs to the
    # sklearn wrapper; with xgb.train the round count is ``num_round``.
    # Confirm which values are intended.
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3,
             'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
             'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 4000
    param['nthread'] = 6
    param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+="; build a
    # real list so a second eval_metric entry can be appended.
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    del dtrain
    # NOTE: the message is kept, but the actual save below is disabled.
    print('saving model...')
    # flag = 'basic'
    # bst.save_model('./cache/' + flag + '_model')
    sub_user_index, sub_training_data, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date, sub_test_end_date, test=True)
    test = xgb.DMatrix(sub_training_data.values)
    y = bst.predict(test)

    pred = sub_user_index.copy()
    pred['label'] = y
    # Keep the single highest-scoring row per user.
    pred = pred.sort_values(by=['label'], ascending=False).groupby(['user_id'], as_index=False).first()

    limits = np.linspace(0, 0.5, 100)
    scores = np.zeros(len(limits))
    for idx, limit in enumerate(limits):
        print('--------------------------------------------------------------------')
        print('limit=%s' % str(limit))
        scores[idx] = report(pred[pred['label'] > limit], sub_label)

    print('max score : %s\nmax limit : %s' % (np.max(scores), limits[np.argmax(scores)]))
Esempio n. 5
0
def logistic_make_submission():
    """Fit the ``lg`` classifier and write a submission CSV.

    The model is fitted on the 80% part of a split of the set returned by
    ``make_train_set``. The set returned by ``make_test_set`` is then
    classified; rows predicted as 1 are reduced to the first ``sku_id`` per
    ``user_id`` and written to ``../sub/submissionLOG508.csv``.
    """
    train_start_date, train_end_date = '2016-03-10', '2016-04-11'
    test_start_date, test_end_date = '2016-04-11', '2016-04-16'
    sub_start_date, sub_end_date = '2016-03-15', '2016-04-16'

    user_index, training_data, label = make_train_set(
        train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.values, label.values, test_size=0.2, random_state=0)

    # The classifier is given plain Python ints as targets.
    y_train = [int(value) for value in y_train]
    # print(np.any(np.isnan(X_train)))
    # print(np.all(np.isfinite(X_train)))
    clf = lg()  # instantiate the class with all-default parameters
    clf.fit(X_train, y_train)

    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)

    sub_user_index['label'] = clf.predict(sub_trainning_data.values)
    positives = sub_user_index[sub_user_index['label'] == 1]
    pred = positives[['user_id', 'sku_id']].groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('../sub/submissionLOG508.csv', index=False, index_label=False)
Esempio n. 6
0
def xgboost_make_submission():
    """Train an xgboost classifier and write ``./sub/submission.csv``.

    The booster is trained on the 80% part of a split of the set returned by
    ``make_train_set`` (the 20% part is only logged as an evaluation set).
    The set returned by ``make_test_set`` is scored, rows with a score of at
    least 0.03 are kept, and the first ``sku_id`` per ``user_id`` is written
    out.
    """
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(
        train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.values, label.values, test_size=0.2, random_state=0)
    # The split above was fed ``.values``, so these are already numpy arrays;
    # calling ``.values`` on them again (as the original did) raises
    # AttributeError.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # NOTE(review): xgboost documents 'learning_rate' as an alias of 'eta'
    # (both are set, with different values) and 'n_estimators' belongs to the
    # sklearn wrapper; with xgb.train the round count is ``num_round``.
    # Confirm which values are intended.
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3,
             'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
             'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 283
    param['nthread'] = 4
    #param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+=".
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    # Keep rows scoring at least 0.03, then one sku per user.
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
Esempio n. 7
0
def xgboost_make_submission():
    """Train an xgboost classifier and write ``./sub/submission.csv``.

    The booster is trained on the 80% part of a split of the set returned by
    ``make_train_set`` (the 20% part is only logged as an evaluation set).
    The set returned by ``make_test_set`` is scored, rows with a score of at
    least 0.03 are kept, and the first ``sku_id`` per ``user_id`` is written
    out.
    """
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'
    # Test set construction: labels for the following five days are predicted
    # from the test feature set.
    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(
        train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.values, label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # NOTE(review): xgboost documents 'learning_rate' as an alias of 'eta'
    # (both are set, with different values) and 'n_estimators' belongs to the
    # sklearn wrapper; with xgb.train the round count is ``num_round``.
    # Confirm which values are intended.
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3,
             'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
             'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 283
    param['nthread'] = 4
    #param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+=".
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
    # Predict a score for every user-item pair.
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    # Keep the user-item pairs whose predicted score is at least 0.03.
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
Esempio n. 8
0
def xgboost_make_submission():
    """Train an xgboost classifier and write ``../sub/submission424.csv``.

    Trains for 20 rounds on the 80% part of a split of the set returned by
    ``make_train_set``, logging AUC on both parts. The set returned by
    ``make_test_set`` is scored; rows with a score of at least 0.05 are
    reduced to the first ``sku_id`` per ``user_id`` and written out.
    """
    train_start_date, train_end_date = '2016-03-10', '2016-04-11'
    test_start_date, test_end_date = '2016-04-11', '2016-04-16'
    sub_start_date, sub_end_date = '2016-03-15', '2016-04-16'

    user_index, training_data, label = make_train_set(
        train_start_date, train_end_date, test_start_date, test_end_date)

    X_train, X_test, y_train, y_test = train_test_split(
        training_data.values, label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    param = {
        'learning_rate': 0.05,
        'n_estimators': 1000,
        'max_depth': 5,
        'min_child_weight': 1,
        'gamma': 0,
        'subsample': 1,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic',
        'nthread': 5,
    }
    num_round = 20
    # Booster parameters as (name, value) pairs, with the metric appended last.
    plst = list(param.items()) + [('eval_metric', 'auc')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    # Build the data to be scored for the submission.
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date)
    sub_matrix = xgb.DMatrix(sub_trainning_data.values)

    sub_user_index['label'] = bst.predict(sub_matrix)
    selected = sub_user_index[sub_user_index['label'] >= 0.05]
    pred = selected[['user_id', 'sku_id']].groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('../sub/submission424.csv', index=False, index_label=False)
Esempio n. 9
0
def gdbt_train():
    """Fit a random forest, print hold-out metrics and dump probabilities.

    A ``RandomForestClassifier`` is fitted on the 80% part of a split of the
    set returned by ``make_train_set``. The class probabilities for the 20%
    part are printed, precision/recall/F-score/support are printed, and the
    first probability column is written line by line to ``result.txt``.
    """
    from sklearn.ensemble import RandomForestClassifier
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values,
                                                        test_size=0.2,
                                                        random_state=0)
    # np.savetxt('train.txt',X_train,fmt='%.2f',delimiter=' ')
    # clf = GradientBoostingClassifier(n_estimators=220)
    clf = RandomForestClassifier(n_estimators=220,
                                 criterion="gini",
                                 max_depth=10,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.,
                                 max_features="auto",
                                 max_leaf_nodes=None,
                                 min_impurity_split=1e-7,
                                 bootstrap=True,
                                 oob_score=False,
                                 n_jobs=1,
                                 random_state=None,
                                 verbose=0,
                                 warm_start=False,
                                 class_weight=None)
    print(type(X_train))
    clf.fit(X_train, y_train)
    pre_y_test = clf.predict_proba(X_test)
    print(pre_y_test)
    # precision_recall_fscore_support expects predicted class labels; the
    # original passed the 2-D probability matrix, which the metric rejects.
    print("GBDT Metrics : {0}".format(
        precision_recall_fscore_support(y_test, clf.predict(X_test))))

    print(u'保存结果.....')
    last = len(pre_y_test) - 1
    # The context manager guarantees the file is flushed and closed.
    with open('result.txt', 'w') as f_result:
        for i, row in enumerate(pre_y_test):
            # Echo the first and the last written value as a quick check.
            if i == 0:
                print(str(row[0]))
            if i == last:
                print(str(row[0]))
            f_result.write(str(row[0]) + '\n')
Esempio n. 10
0
def gbdt_make_submission():
    """Fit the ``gbdt`` model and write ``../sub/submissionGBDT508.csv``.

    Missing values are replaced by 0 in both the training set (from
    ``make_train_set``) and the set to be scored (from ``make_test_set``).
    Rows predicted as 1 are reduced to the first ``sku_id`` per ``user_id``
    and written out.
    """
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    training_data = training_data.fillna(0)
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() only added a stray "None" line.
    training_data.info()
    X_train, X_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values,
                                                        test_size=0.2,
                                                        random_state=0)
    # X_train = X_train.astype(int)
    y_train = list(map(int, y_train))

    param = {
        'n_estimators': 1200,
        'max_depth': 3,
        'subsample': 1.0,
        'learning_rate': 0.01,
        'min_samples_leaf': 1,
        'random_state': 3,
        'max_features': 0.8
    }
    # NOTE(review): the dict is passed as a single positional argument. If
    # ``gbdt`` is sklearn's GradientBoostingClassifier this should presumably
    # be ``gbdt(**param)`` -- confirm against the import of ``gbdt``.
    clf = gbdt(param)

    clf.fit(X_train, y_train)

    sub_user_index, sub_trainning_data = make_test_set(sub_start_date,
                                                       sub_end_date)

    sub_trainning_data = sub_trainning_data.fillna(0)

    y_hat = clf.predict(sub_trainning_data.values)
    sub_user_index['label'] = y_hat
    pred = sub_user_index[sub_user_index['label'] == 1]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('../sub/submissionGBDT508.csv', index=False, index_label=False)
Esempio n. 11
0
def xgboost_train():
    """Train an xgboost binary classifier, save it and return it.

    The booster is trained for 283 rounds on the 80% part of a split of the
    set returned by ``make_train_set``; logloss is logged on both parts.

    Returns:
        The trained booster, which is also saved to ``./cache/bstmodel.bin``.
    """
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values,
                                                        test_size=0.2,
                                                        random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # NOTE(review): xgboost documents 'learning_rate' as an alias of 'eta'
    # (both are set, with different values) and 'n_estimators' belongs to the
    # sklearn wrapper; with xgb.train the round count is ``num_round``.
    # Confirm which values are intended.
    param = {
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'max_depth': 3,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    num_round = 283
    param['nthread'] = 4
    # param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+=".
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    bst.save_model('./cache/bstmodel.bin')
    return bst
Esempio n. 12
0
def xgboost_cv2():
    """Train an xgboost classifier and search the best score threshold.

    The booster scores a second set returned by ``make_train_set``; the
    reference labels come from ``get_labels_8``. Thresholds 0.00, 0.01, ...,
    0.29 are applied to the scores, ``report`` is evaluated for each one, and
    the threshold ranked first by ``getKey`` is printed with its score.
    """
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-06'
    test_end_date = '2016-04-11'

    sub_start_date = '2016-03-10'
    sub_end_date = '2016-04-11'
    sub_test_start_date = '2016-04-11'
    sub_test_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=0)

    dtrain = xgb.DMatrix(X_train.values, label=y_train)
    dtest = xgb.DMatrix(X_test.values, label=y_test)

    param = {
        'learning_rate': 0.05,
        'n_estimators': 1000,
        'max_depth': 10,
        'min_child_weight': 1,
        'gamma': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic',
        'eval_metric': 'auc'
    }
    num_round = 300

    param['nthread'] = 5
    evallist = [(dtest, 'eval'), (dtrain, 'train')]

    bst = xgb.train(param, dtrain, num_round, evallist)
    # This second data set is used to compute the offline score.
    sub_user_index, sub_trainning_date, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date,
        sub_test_end_date)
    test = xgb.DMatrix(sub_trainning_date.values)
    y = bst.predict(test)

    # Reference labels for the evaluation window (per the original comment:
    # real labels for category 8).
    y_true = get_labels_8(sub_test_start_date, sub_test_end_date)

    # Attach the scores once; each threshold only filters this frame.
    scored = sub_user_index.copy()
    scored['label'] = y

    ans = []
    for i in range(0, 30):
        # Divide by a float: under Python 2 ``i / 100`` is integer division
        # and would make every threshold 0.
        cutoff = i / 100.0
        pred = scored[scored.label >= cutoff]
        rep = report(pred, y_true)
        print('%s : score:%s' % (cutoff, rep))
        ans.append([cutoff, rep])

    print('ans:%s' % ans)

    # Sort once and read both values from the top entry.
    best = sorted(ans, key=getKey, reverse=True)[0]
    threshold = best[0]
    bestscore = best[1]
    print('best threshold:%s' % threshold)
    print('best score:%s' % bestscore)
Esempio n. 13
0
def gbdt_cv():
    """Fit the ``gbdt`` model and search the best prediction threshold.

    The fitted model predicts on a second set returned by ``make_train_set``;
    the reference labels come from ``get_labels_8``. Thresholds 0.00, 0.01,
    ..., 0.29 are applied to the predictions, ``report`` is evaluated for
    each one, and the threshold ranked first by ``getKey`` is printed with
    its score.
    """
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-06'
    test_end_date = '2016-04-11'

    sub_start_date = '2016-03-10'
    sub_end_date = '2016-04-11'
    sub_test_start_date = '2016-04-11'
    sub_test_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=0)
    param = {
        'n_estimators': 1200,
        'max_depth': 3,
        'subsample': 1.0,
        'learning_rate': 0.01,
        'min_samples_leaf': 1,
        'random_state': 3,
        'max_features': 0.8
    }
    # NOTE(review): the dict is passed as a single positional argument. If
    # ``gbdt`` is sklearn's GradientBoostingClassifier this should presumably
    # be ``gbdt(**param)`` -- confirm against the import of ``gbdt``.
    clf = gbdt(param)
    clf.fit(X_train, y_train)

    # This second data set is used to compute the offline score.
    sub_user_index, sub_trainning_date, sub_label = make_train_set(
        sub_start_date, sub_end_date, sub_test_start_date,
        sub_test_end_date)

    test = sub_trainning_date.values
    # NOTE(review): if ``predict`` returns hard class labels, the threshold
    # sweep below cannot distinguish thresholds in (0, 1); probabilities
    # (e.g. ``predict_proba``) may have been intended -- confirm.
    y = clf.predict(test)

    # Reference labels for the evaluation window (per the original comment:
    # real labels for category 8).
    y_true = get_labels_8(sub_test_start_date, sub_test_end_date)

    # Attach the predictions once; each threshold only filters this frame.
    scored = sub_user_index.copy()
    scored['label'] = y

    ans = []
    for i in range(0, 30):
        # Divide by a float: under Python 2 ``i / 100`` is integer division
        # and would make every threshold 0.
        cutoff = i / 100.0
        pred = scored[scored.label >= cutoff]
        rep = report(pred, y_true)
        print('%s : score:%s' % (cutoff, rep))
        ans.append([cutoff, rep])

    print('ans:%s' % ans)

    # Sort once and read both values from the top entry.
    best = sorted(ans, key=getKey, reverse=True)[0]
    threshold = best[0]
    bestscore = best[1]
    print('best threshold:%s' % threshold)
    print('best score:%s' % bestscore)
Esempio n. 14
0
File: train.py Progetto: DPC11/jd
def xgboost_report_submission():
    """Train an xgboost classifier and report at several score thresholds.

    The booster scores the validation set returned by ``make_test_set``.
    Rows scoring at least 0.014 are reduced to the highest-scoring row per
    user and passed to ``report`` together with the frame from ``get_true``.
    The same frame is then filtered at 0.016, 0.018, ..., 0.03, printing each
    threshold and calling ``report`` again.
    """
    train_start_date = '2016-03-08'
    train_end_date = '2016-04-09'
    result_start_date = '2016-04-09'
    result_end_date = '2016-04-14'

    valid_start_date = '2016-03-01'
    valid_end_date = '2016-04-02'
    valid_result_start_date = '2016-04-02'
    valid_result_end_date = '2016-04-07'

    user_index, training_data, label = make_train_set(train_start_date, train_end_date, result_start_date,
                                                      result_end_date)
    x_train, x_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values, test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    # NOTE(review): xgboost documents 'learning_rate' as an alias of 'eta'
    # (both are set, with different values) and 'n_estimators' belongs to the
    # sklearn wrapper; with xgb.train the round count is ``num_round``.
    # Confirm which values are intended.
    param = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3,
             'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
             'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 283
    param['nthread'] = 4
    #param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+=".
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    # Report with validation set
    valid_user_index, valid_trainning_date = make_test_set(valid_start_date, valid_end_date)
    valid_trainning_date = xgb.DMatrix(valid_trainning_date.values)
    pred_y = bst.predict(valid_trainning_date)

    valid_pred = valid_user_index.copy()
    valid_pred['label'] = pred_y
    valid_pred = valid_pred[valid_pred['label'] >= 0.014]
    # Keep the single highest-scoring row per user.
    valid_pred = valid_pred.sort_values('label', ascending=False).groupby('user_id').first().reset_index()
    valid_true = get_true(valid_result_start_date, valid_result_end_date)
    report(valid_pred, valid_true)

    # The thresholds are increasing, so filtering the already-filtered frame
    # is the same as filtering the 0.014 frame directly.
    for threshold in (0.016, 0.018, 0.02, 0.022, 0.024, 0.026, 0.028, 0.03):
        valid_pred = valid_pred[valid_pred['label'] >= threshold]
        print(threshold)
        report(valid_pred, valid_true)
Esempio n. 15
0
def xgboost_make_submission():
    """Train an xgboost classifier, dump feature importances and submit.

    Steps:
      * train on the 80% part of a split of the set from ``make_train_set``,
        with early stopping after 10 rounds without improvement;
      * write the feature scores, with their column names, to
        ``./sub/fecure.csv``;
      * print the ROC curve points for the 20% part;
      * score the set from ``make_test_set``, keep rows scoring at least
        0.04, and write ``./sub/submission.csv`` and, after removing the
        user ids stored in ``./unique_8/user_id_unique.npy``,
        ``./sub/submission_unique.csv``.
    """
    train_start_date = '2016-03-31'
    train_end_date = '2016-04-10'
    test_start_date = '2016-04-10'
    test_end_date = '2016-04-16'
    sub_start_date = '2016-04-06'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date,
                                                      train_end_date,
                                                      test_start_date,
                                                      test_end_date)
    # Column names, used later to translate xgboost's positional feature ids.
    list_of_train = list(training_data.columns)
    print(len(list_of_train))

    X_train, X_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values,
                                                        test_size=0.2,
                                                        random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # NOTE(review): xgboost documents 'learning_rate' as an alias of 'eta'
    # (both are set, with different values) and 'n_estimators' belongs to the
    # sklearn wrapper; with xgb.train the round count is ``num_round``.
    # Confirm which values are intended.
    param = {
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'max_depth': 3,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'eta': 0.05,
        'silent': 1,
        'objective': 'binary:logistic'
    }
    # num_round = 345
    num_round = 511
    # param['nthread'] = 8
    param['eval_metric'] = "auc"
    # dict.items() is a view on Python 3 and does not support "+=".
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    # xgb.train uses the LAST entry of ``evals`` for early stopping, so the
    # held-out set goes last; the original order monitored the training set.
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    bst = xgb.train(plst,
                    dtrain,
                    num_round,
                    evallist,
                    early_stopping_rounds=10)

    importance = bst.get_fscore()
    print(importance)
    # Feature ids look like 'f<index>'; keep only the index part.
    feat_importances = pd.DataFrame(
        [{'Feature': ft.split('f')[1], 'Importance': score}
         for ft, score in importance.items()])
    feat_importances = feat_importances.sort_values(
        by='Importance', ascending=False).reset_index(drop=True)
    # Map each positional index back to the training column name.
    feat_importances['new'] = [
        list_of_train[int(index)] for index in feat_importances['Feature']
    ]
    feat_importances.to_csv('./sub/fecure.csv')

    sub_user_index, sub_trainning_data = make_test_set(sub_start_date,
                                                       sub_end_date)
    sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)

    y_label = bst.predict(xgb.DMatrix(X_test))
    # The objective is binary:logistic, so the positive class is 1; the
    # original pos_label=2 matches no sample.
    fpr, tpr, threasholds = roc_curve(y_test, y_label, pos_label=1)
    print(fpr, tpr, threasholds)
    # print auc(fpr,tpr)
    # plt.plot(threasholds,fpr)
    # plt.show()
    y = bst.predict(sub_trainning_data)
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.04]

    pred = pred[['user_id', 'sku_id']]
    # NOTE(review): this keeps the largest sku_id per user, not the
    # highest-scoring one -- confirm that this is intended.
    pred = pred.groupby('user_id').max().reset_index()

    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)
    # Second submission without the user ids listed in the .npy file.
    buy_cate_8 = np.load('./unique_8/user_id_unique.npy')
    pred = pred[~pred['user_id'].isin(buy_cate_8)]
    pred.to_csv('./sub/submission_unique.csv', index=False, index_label=False)