コード例 #1
0
ファイル: main.py プロジェクト: zhilangtaosha/Mobike-1
def test(**kwargs):
	
	# ---------------------- 更新参数 ----------------------
	opt = DefaultConfig()
	opt.update(**kwargs)
	opt.printf()

    # ---------------------- 数据处理 ----------------------

    # 获取数据
	train, test = get_test_data(opt)
	gc.collect()
 #    # 获取样本
	# test_sample = get_sample(train, test, load=True)
	# gc.collect()
 #    # 获取特征
	# test_feat = get_feat(train, test_sample)
	# gc.collect()

	# 保存特征至文件
	# test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
	test_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(test.shape[0]))
	test_feat = get_feat(train, test_feat)
	gc.collect()
	test_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}_filter.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)

    # ---------------------- 载入模型 ----------------------

	# opt['model_name'] = 'lgb_1_90_all.pkl'
	# gbm0, use_feat0 = load_model(opt)
	opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl'
	gbm1, use_feat1 = load_model(opt)
	# opt['model_name'] = 'lgb_2_300_top15.pkl'
	# gbm2, use_feat2 = load_model(opt)
	# opt['model_name'] = 'lgb_3_300_top10.pkl'
	# gbm3, use_feat3 = load_model(opt)
	# opt['model_name'] = 'lgb_4_300_top5.pkl'
	# gbm4, use_feat4 = load_model(opt)
    
	# ---------------------- 保存预测结果 -------------------

	# test_feat.loc[:, 'pred'] = gbm0.predict(test_feat[use_feat0])
	# gc.collect()
	# res = test_feat[['orderid', 'geohashed_end_loc', 'pred']].sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(25)
	# res[['orderid', 'geohashed_end_loc']].to_hdf('/home/xuwenchao/dyj-storage/sample_25_{}_filter_leak_sample.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
	# gc.collect()

	# test_feat.loc[:, 'pred'] = gbm1.predict(test_feat[use_feat1])
	# test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58820.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)

	res = predict(test_feat, use_feat1, gbm1)
	test_feat[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/pred/pred_{}_0.58893.hdf'.format(test.shape[0]), 'w', complib='blosc', complevel=5)
	gc.collect()
	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
	res_path = '{}/day{}_{}_wc_sample_0.58893.csv'.format(opt['result_dir'], opt['test_startday'], cur_time)
	res.to_csv(res_path, index=False)
	print('保存测试结果至:', res_path)
コード例 #2
0
ファイル: main.py プロジェクト: milkboylyf/Mobike
def val(**kwargs):

    # ---------------------- 更新参数 ----------------------
    opt = DefaultConfig()
    opt.update(**kwargs)
    opt.printf()

    # ---------------------- 数据处理 ----------------------

    # 获取数据
    # train1, train2, train_test = get_train_data(opt)
    # 获取样本
    # train_sample = get_sample(train1, train2, load=True)
    # 获取特征
    # train_feat = get_feat(train_test, train_sample)
    # gc.collect()

    # train_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(opt['startday']), 'w', complib='blosc', complevel=5)
    train_feat = pd.read_hdf(
        '/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')

    # ---------------------- 载入模型 ----------------------

    # opt['model_name'] = 'lgb_1_90_all.pkl'
    # gbm0, use_feat0 = load_model(opt)
    opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
    gbm, use_feat = load_model(opt)
    opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl'
    gbm1, use_feat1 = load_model(opt)
    # gbm2, use_feat2 = load_model(opt)
    # opt['model_name'] = 'lgb_2017-09-03#23:24:26_0.57836.pkl'
    # gbm3, use_feat3 = load_model(opt)
    # opt['model_name'] = ''
    # gbm4, use_feat4 = load_model(opt)

    # ---------------------- 评估 -------------------------

    train_feat.loc[:, 'pred'] = gbm.predict(train_feat[use_feat])
    gc.collect()
    train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv(
        '/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58820.csv', index=None)
    train_feat.loc[:, 'pred'] = gbm1.predict(train_feat[use_feat1])
    gc.collect()
    train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv(
        '/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58893.csv', index=None)
コード例 #3
0
ファイル: main.py プロジェクト: lsp140510/Mobike
def val(**kwargs):

	# ---------------------- 更新参数 ----------------------
	opt = DefaultConfig()
	opt.update(**kwargs)
	opt.printf()

	# ---------------------- 数据处理 ----------------------

	# 获取数据
	# train1, train2, train_test = get_train_data(opt)
	# 获取样本
	# train_sample = get_sample(train1, train2, load=True)
	# 获取特征
	# train_feat = get_feat(train_test, train_sample)
	# gc.collect()
    
	# train_feat.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_{}.hdf'.format(opt['startday']), 'w', complib='blosc', complevel=5)
	train_feat = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')

    # ---------------------- 载入模型 ----------------------
	
	# opt['model_name'] = 'lgb_1_90_all.pkl'
	# gbm0, use_feat0 = load_model(opt)
	opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
	gbm, use_feat = load_model(opt)
	opt['model_name'] = 'lgb_2017-09-23#20:14:52_0.58893.pkl'
	gbm1, use_feat1 = load_model(opt)
	# gbm2, use_feat2 = load_model(opt)
	# opt['model_name'] = 'lgb_2017-09-03#23:24:26_0.57836.pkl'
	# gbm3, use_feat3 = load_model(opt)
	# opt['model_name'] = ''
	# gbm4, use_feat4 = load_model(opt)

	# ---------------------- 评估 -------------------------

	train_feat.loc[:, 'pred'] = gbm.predict(train_feat[use_feat])
	gc.collect()
	train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58820.csv', index=None)
	train_feat.loc[:, 'pred'] = gbm1.predict(train_feat[use_feat1])
	gc.collect()
	train_feat[['orderid', 'geohashed_end_loc', 'pred']].to_csv('/home/xuwenchao/dyj-storage/pred/pred_23_24_0.58893.csv', index=None)
コード例 #4
0
ファイル: main.py プロジェクト: milkboylyf/Mobike
def train(**kwargs):

    # ---------------------- 更新参数 ----------------------
    opt = DefaultConfig()
    opt.update(**kwargs)
    opt.printf()

    # ---------------------- 数据处理 ----------------------

    # 获取数据
    train1, train2 = get_train_data(opt)
    # 获取样本
    # train_sample = get_sample(train1, train2, load=True)
    # 获取特征
    # train_feat = get_feat(train1, train_sample)
    # 获取标签
    # train_all = get_label(train_feat, opt)
    # gc.collect()

    # train_all.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf', 'w', complib='blosc', complevel=5)
    train_all = pd.read_hdf(
        '/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')
    print(train_all.shape)

    # 取出需要用的特征
    # opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
    # gbm, use_feat = load_model(opt)
    # predictors_100 = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
    # predictors_100 = predictors_100.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
    # use_feat = list(predictors_100) + ['orderid', 'geohashed_end_loc', 'label'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
    # train_all = train_all[use_feat]
    # gc.collect()

    # -------------------- 训练第一层 ------------------------

    # ********* 准备数据 **********
    # 划分验证集
    train, val = train_test_split(train_all, test_size=0.1)
    # 定义使用哪些特征
    # opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
    # gbm, use_feat = load_model(opt)
    filters = set([
        'orderid', 'userid', 'biketype', 'geohashed_start_loc', 'bikeid',
        'starttime', 'geohashed_end_loc', 'label'
    ])
    predictors = list(
        filter(lambda x: x not in filters, train_all.columns.tolist()))
    # predictors = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
    # predictors = predictors.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
    # use_feat = list(predictors) + ['orderid', 'geohashed_end_loc'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
    # predictors = list(predictors_100) + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
    print('使用的特征:{}维\n'.format(len(predictors)), predictors)
    # 定义数据集
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()

    # ********* LightGBM *********
    # 数据集
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
    # 配置
    params = {
        'objective': 'binary',
        'metric': {'auc', 'binary_logloss'},
        'is_unbalance': True,
        'num_leaves': opt['lgb_leaves'],
        'learning_rate': opt['lgb_lr'],
        'feature_fraction': 0.886,
        'bagging_fraction': 0.886,
        'bagging_freq': 5
    }
    gc.collect()
    # ********** 开始训练 *********
    gbm1 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5)
    gc.collect()

    # #  ********* 保存模型 *********

    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    # save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb', cur_time, score[0])
    save_path = '{}/{}_{}.pkl'.format(opt['model_dir'], 'lgb', cur_time)
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm1, fout)
    print('保存模型:', save_path)
    gc.collect()

    # #  ********* 评估  *********

    # # 在训练集上看效果
    del X_train, y_train, X_val, y_val
    gc.collect()
    score = get_score(train_all, predictors, gbm1, opt)
    print('训练集分数:{}'.format(score))

    import sys
    sys.exit(0)

    # save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_1_300_top25')
    # with open(save_path, 'wb') as fout:
    #     pickle.dump(gbm1, fout)
    # print('保存模型(第一层):', save_path)

    # ********* save predict *****

    # train_all[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/train2324_80_pred_res.hdf', 'w', complib='blosc', complevel=5)
    # print('Save train_pred_res.hdf successful!!!')

    # import sys
    # sys.exit(0)

    # -------------------- 训练第二层 ------------------------

    # opt['model_name'] = 'lgb_1_300_top25.pkl'
    # gbm1, use_feat1 = load_model(opt)
    # train_all.loc[:, 'pred'] = gbm1.predict(train_all[use_feat1])

    # 去掉重要性较低的特征,筛选出排名前十的候选样本,重新训练模型(后期可以载入模型finetune,
    # 尤其是对于样本量较少的情况,甚至可以选前5,
    # 但15可以覆盖99.5%的原始label,10可以覆盖98%的原始label,
    # 这两者可能会好一些,备选方案:5(+finetune),10(+finetune),15(+finetune))
    predictors = pd.DataFrame(
        data={
            'feature_name': gbm1.feature_name(),
            'feature_importance': gbm1.feature_importance()
        })
    predictors = predictors[
        predictors['feature_importance'] > 0]['feature_name'].values
    print('第二层使用的特征:{}维\n'.format(len(predictors)), predictors)
    train_all = train_all.sort_values(
        by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15)
    # train_all = rank(train_all, 'orderid', 'pred', ascending=False)
    del train_all['pred']
    print('第二层数据:', train_all.shape)

    # ********* 准备数据 **********
    # 划分验证集
    train, val = train_test_split(train_all, test_size=0.1)

    # 定义数据集
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()

    # 数据集
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # ********** 开始训练 *********
    gbm2 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5
                     # init_model=gbm1 # finetune
                     )

    #  ********* 评估  *********

    # 在训练集上看效果
    score = get_score(train_all, predictors, gbm2, opt)
    print('训练集分数(第二层):{}'.format(score))

    #  ********* 保存模型 *********

    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_2',
                                             cur_time, score[0])
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm2, fout)
    print('保存模型(第二层):', save_path)
    # save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_2_300_top15')
    # with open(save_path, 'wb') as fout:
    #     pickle.dump(gbm2, fout)
    # print('保存模型(第二层):', save_path)

    import sys
    sys.exit(0)

    # -------------------- 训练第三层 ------------------------

    # 筛选出排名前五的候选样本
    predictors = pd.DataFrame(
        data={
            'feature_name': gbm2.feature_name(),
            'feature_importance': gbm2.feature_importance()
        })
    predictors = predictors[
        predictors['feature_importance'] > 0]['feature_name'].values
    print('第三层使用的特征:{}维\n'.format(len(predictors)), predictors)
    train_all = train_all.sort_values(
        by=['orderid', 'pred'], ascending=False).groupby('orderid').head(10)
    # train_all = rank(train_all, 'orderid', 'pred', ascending=False)
    del train_all['pred']
    print('第三层数据:', train_all.shape)

    # ********* 准备数据 **********
    # 划分验证集
    train, val = train_test_split(train_all, test_size=0.1)

    # 定义数据集
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()

    # 数据集
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # ********** 开始训练 *********
    gbm3 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5
                     # init_model=gbm2 # finetune
                     )

    #  ********* 评估  *********

    # 在训练集上看效果
    score = get_score(train_all, predictors, gbm3, opt)
    print('训练集分数(第三层):{}'.format(score))

    #  ********* 保存模型 *********

    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_3',
                                             cur_time, score[0])
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm3, fout)
    print('保存模型(第三层):', save_path)
    save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_3_300_top10')
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm3, fout)
    print('保存模型(第三层):', save_path)

    # -------------------- 训练第四层 ------------------------

    # 筛选出排名前三的候选样本
    predictors = pd.DataFrame(
        data={
            'feature_name': gbm3.feature_name(),
            'feature_importance': gbm3.feature_importance()
        })
    predictors = predictors[
        predictors['feature_importance'] > 0]['feature_name'].values
    print('第四层使用的特征:{}维\n'.format(len(predictors)), predictors)
    train_all = train_all.sort_values(
        by=['orderid', 'pred'], ascending=False).groupby('orderid').head(5)
    # train_all = rank(train_all, 'orderid', 'pred', ascending=False)
    del train_all['pred']
    print('第四层数据:', train_all.shape)

    # ********* 准备数据 **********
    # 划分验证集
    train, val = train_test_split(train_all, test_size=0.1)

    # 定义数据集
    X_train = train[predictors]
    y_train = train['label']
    X_val = val[predictors]
    y_val = val['label']
    del train, val
    gc.collect()

    # 数据集
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    # ********** 开始训练 *********
    gbm4 = lgb.train(params,
                     lgb_train,
                     num_boost_round=1200,
                     valid_sets=[lgb_train, lgb_val],
                     early_stopping_rounds=5
                     # init_model=gbm3 # finetune
                     )

    #  ********* 评估  *********

    # 在训练集上看效果
    score = get_score(train_all, predictors, gbm4, opt)
    print('训练集分数(第四层):{}'.format(score))

    #  ********* 保存模型 *********

    cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
    save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_4',
                                             cur_time, score[0])
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm4, fout)
    print('保存模型(第四层):', save_path)
    save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_4_300_top5')
    with open(save_path, 'wb') as fout:
        pickle.dump(gbm4, fout)
    print('保存模型(第四层):', save_path)
コード例 #5
0
ファイル: main.py プロジェクト: lsp140510/Mobike
def train(**kwargs):

	# ---------------------- 更新参数 ----------------------
	opt = DefaultConfig()
	opt.update(**kwargs)
	opt.printf()

	# ---------------------- 数据处理 ----------------------

	# 获取数据
	# train1, train2 = get_train_data(opt)
	# 获取样本
	# train_sample = get_sample(train1, train2, load=True)
	# 获取特征
	# train_feat = get_feat(train1, train_sample)
	# 获取标签
	# train_all = get_label(train_feat, opt)
	# gc.collect()

	# train_all.to_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf', 'w', complib='blosc', complevel=5)
	train_all = pd.read_hdf('/home/xuwenchao/dyj-storage/all-feat/feat_23_24_label.hdf')
	print(train_all.shape)
    
	# 取出需要用的特征
	# opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
	# gbm, use_feat = load_model(opt)
	# predictors_100 = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
	# predictors_100 = predictors_100.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
	# use_feat = list(predictors_100) + ['orderid', 'geohashed_end_loc', 'label'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
	# train_all = train_all[use_feat]
	# gc.collect()
    
	# -------------------- 训练第一层 ------------------------

	# ********* 准备数据 **********
	# 划分验证集
	train, val = train_test_split(train_all, test_size=0.1)
	# 定义使用哪些特征
	# opt['model_name'] = 'lgb_1_2017-09-15#19:50:48_0.58820.pkl'
	# gbm, use_feat = load_model(opt)
	filters = set(['orderid', 'userid', 'biketype', 'geohashed_start_loc', 'bikeid', 'starttime', 'geohashed_end_loc', 'label'])
	predictors = list(filter(lambda x: x not in filters, train_all.columns.tolist()))
	# predictors = pd.DataFrame(data={'feature_name': gbm.feature_name(), 'feature_importance': gbm.feature_importance()})
	# predictors = predictors.sort_values(by=['feature_importance'], ascending=False)['feature_name'].values[:100]
	# use_feat = list(predictors) + ['orderid', 'geohashed_end_loc'] + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
	# predictors = list(predictors_100) + ['sloc_eloc_common_eloc_count', 'sloc_eloc_common_sloc_count', 'sloc_eloc_common_conn1_count', 'sloc_eloc_common_conn2_count', 'sloc_eloc_common_eloc_rate', 'sloc_eloc_common_sloc_rate', 'sloc_eloc_common_conn1_rate', 'sloc_eloc_common_conn2_rate', 'user_sloc_eloc_common_eloc_count', 'user_sloc_eloc_common_sloc_count', 'user_sloc_eloc_common_conn1_count', 'user_sloc_eloc_common_conn2_count', 'user_sloc_eloc_common_eloc_rate', 'user_sloc_eloc_common_sloc_rate', 'user_sloc_eloc_common_conn1_rate', 'user_sloc_eloc_common_conn2_rate']
	print('使用的特征:{}维\n'.format(len(predictors)), predictors)
	# 定义数据集
	X_train = train[predictors]
	y_train = train['label']
	X_val = val[predictors]
	y_val = val['label']
	del train, val
	gc.collect()

	# ********* LightGBM *********
	# 数据集
	lgb_train = lgb.Dataset(X_train, y_train)
	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
	# 配置
	params = {
	    'objective': 'binary',
	    'metric': {'auc', 'binary_logloss'},
	    'is_unbalance': True,
	    'num_leaves': opt['lgb_leaves'],
	    'learning_rate': opt['lgb_lr'],
	    'feature_fraction': 0.886,
	    'bagging_fraction': 0.886,
	    'bagging_freq': 5
	}
	gc.collect()    
	# ********** 开始训练 *********
	gbm1 = lgb.train(
				params,
                lgb_train,
                num_boost_round=1200,
                valid_sets=[lgb_train, lgb_val],
                early_stopping_rounds=5
	)
	gc.collect()
    
	# #  ********* 保存模型 *********

	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
	# save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb', cur_time, score[0])
	save_path = '{}/{}_{}.pkl'.format(opt['model_dir'], 'lgb', cur_time)
	with open(save_path, 'wb') as fout:
	    pickle.dump(gbm1, fout)
	print('保存模型:', save_path)
	gc.collect()    

	# #  ********* 评估  *********

	# # 在训练集上看效果
	del X_train, y_train, X_val, y_val
	gc.collect()    
	score = get_score(train_all, predictors, gbm1, opt)
	print('训练集分数:{}'.format(score))
    
	import sys
	sys.exit(0)
    
	# save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_1_300_top25')
	# with open(save_path, 'wb') as fout:
	#     pickle.dump(gbm1, fout)
	# print('保存模型(第一层):', save_path)

	# ********* save predict *****

	# train_all[['orderid', 'geohashed_end_loc', 'pred']].to_hdf('/home/xuwenchao/dyj-storage/train2324_80_pred_res.hdf', 'w', complib='blosc', complevel=5)
	# print('Save train_pred_res.hdf successful!!!')

	# import sys
	# sys.exit(0)
    
	# -------------------- 训练第二层 ------------------------
    
	# opt['model_name'] = 'lgb_1_300_top25.pkl'
	# gbm1, use_feat1 = load_model(opt)
	# train_all.loc[:, 'pred'] = gbm1.predict(train_all[use_feat1])

	# 去掉重要性较低的特征,筛选出排名前十的候选样本,重新训练模型(后期可以载入模型finetune,尤其是对于样本量较少的情况,甚至可以选前5,但15可以覆盖99.5%的原始label,10可以覆盖98%的原始label,这两者可能会好一些,备选方案:5(+finetune),10(+finetune),15(+finetune))
	predictors = pd.DataFrame(data={'feature_name': gbm1.feature_name(), 'feature_importance': gbm1.feature_importance()})
	predictors = predictors[predictors['feature_importance']>0]['feature_name'].values
	print('第二层使用的特征:{}维\n'.format(len(predictors)), predictors)
	train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(15)
	# train_all = rank(train_all, 'orderid', 'pred', ascending=False)
	del train_all['pred']
	print('第二层数据:', train_all.shape)
    
	# ********* 准备数据 **********
	# 划分验证集
	train, val = train_test_split(train_all, test_size=0.1)
    
	# 定义数据集
	X_train = train[predictors]
	y_train = train['label']
	X_val = val[predictors]
	y_val = val['label']
	del train, val
	gc.collect()

	# 数据集
	lgb_train = lgb.Dataset(X_train, y_train)
	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

	# ********** 开始训练 *********
	gbm2 = lgb.train(
				params,
                lgb_train,
                num_boost_round=1200,
                valid_sets=[lgb_train, lgb_val],
                early_stopping_rounds=5
                # init_model=gbm1 # finetune
	        )

	#  ********* 评估  *********

	# 在训练集上看效果    
	score = get_score(train_all, predictors, gbm2, opt)
	print('训练集分数(第二层):{}'.format(score))

	#  ********* 保存模型 *********

	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
	save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_2', cur_time, score[0])
	with open(save_path, 'wb') as fout:
	    pickle.dump(gbm2, fout)
	print('保存模型(第二层):', save_path)
	# save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_2_300_top15')
	# with open(save_path, 'wb') as fout:
	#     pickle.dump(gbm2, fout)
	# print('保存模型(第二层):', save_path)
    
	import sys
	sys.exit(0)
    
	# -------------------- 训练第三层 ------------------------
    
	# 筛选出排名前五的候选样本
	predictors = pd.DataFrame(data={'feature_name': gbm2.feature_name(), 'feature_importance': gbm2.feature_importance()})
	predictors = predictors[predictors['feature_importance']>0]['feature_name'].values
	print('第三层使用的特征:{}维\n'.format(len(predictors)), predictors)
	train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(10)
	# train_all = rank(train_all, 'orderid', 'pred', ascending=False)
	del train_all['pred']
	print('第三层数据:', train_all.shape)
    
	# ********* 准备数据 **********
	# 划分验证集
	train, val = train_test_split(train_all, test_size=0.1)
    
	# 定义数据集
	X_train = train[predictors]
	y_train = train['label']
	X_val = val[predictors]
	y_val = val['label']
	del train, val
	gc.collect()

	# 数据集
	lgb_train = lgb.Dataset(X_train, y_train)
	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

	# ********** 开始训练 *********
	gbm3 = lgb.train(
				params,
                lgb_train,
                num_boost_round=1200,
                valid_sets=[lgb_train, lgb_val],
                early_stopping_rounds=5
                # init_model=gbm2 # finetune
	        )

	#  ********* 评估  *********

	# 在训练集上看效果    
	score = get_score(train_all, predictors, gbm3, opt)
	print('训练集分数(第三层):{}'.format(score))

	#  ********* 保存模型 *********

	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
	save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_3', cur_time, score[0])
	with open(save_path, 'wb') as fout:
	    pickle.dump(gbm3, fout)
	print('保存模型(第三层):', save_path)
	save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_3_300_top10')
	with open(save_path, 'wb') as fout:
	    pickle.dump(gbm3, fout) 
	print('保存模型(第三层):', save_path)

    
	# -------------------- 训练第四层 ------------------------
    
	# 筛选出排名前三的候选样本
	predictors = pd.DataFrame(data={'feature_name': gbm3.feature_name(), 'feature_importance': gbm3.feature_importance()})
	predictors = predictors[predictors['feature_importance']>0]['feature_name'].values
	print('第四层使用的特征:{}维\n'.format(len(predictors)), predictors)
	train_all = train_all.sort_values(by=['orderid', 'pred'], ascending=False).groupby('orderid').head(5)
	# train_all = rank(train_all, 'orderid', 'pred', ascending=False)
	del train_all['pred']
	print('第四层数据:', train_all.shape)
    
	# ********* 准备数据 **********
	# 划分验证集
	train, val = train_test_split(train_all, test_size=0.1)
    
	# 定义数据集
	X_train = train[predictors]
	y_train = train['label']
	X_val = val[predictors]
	y_val = val['label']
	del train, val
	gc.collect()

	# 数据集
	lgb_train = lgb.Dataset(X_train, y_train)
	lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

	# ********** 开始训练 *********
	gbm4 = lgb.train(
				params,
                lgb_train,
                num_boost_round=1200,
                valid_sets=[lgb_train, lgb_val],
                early_stopping_rounds=5
                # init_model=gbm3 # finetune
	        )

	#  ********* 评估  *********

	# 在训练集上看效果    
	score = get_score(train_all, predictors, gbm4, opt)
	print('训练集分数(第四层):{}'.format(score))

	#  ********* 保存模型 *********

	cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')
	save_path = '{}/{}_{}_{:.5f}.pkl'.format(opt['model_dir'], 'lgb_4', cur_time, score[0])
	with open(save_path, 'wb') as fout:
	    pickle.dump(gbm4, fout)
	print('保存模型(第四层):', save_path)
	save_path = '{}/{}.pkl'.format(opt['model_dir'], 'lgb_4_300_top5')
	with open(save_path, 'wb') as fout:
	    pickle.dump(gbm4, fout) 
	print('保存模型(第四层):', save_path)
コード例 #6
0
def test_stack(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)

    logger = Logger()

    result_dir = '/home/dyj/'
    resmat = [result_dir+'TextCNN1_2017-07-27#12:30:16_test_res.pt',\
              result_dir+'TextCNN2_2017-07-27#12:22:42_test_res.pt', \
              result_dir+'RNN1_2017-07-27#12:35:51_test_res.pt',\
              result_dir+'RNN2_2017-07-27#11:33:24_test_res.pt',\
              result_dir+'RCNN1_2017-07-27#11:30:42_test_res.pt',\
              result_dir+'RCNNcha_2017-07-27#16:00:33_test_res.pt',\
              result_dir+'FastText4_2017-07-28#17:20:21_test_res.pt',\
              result_dir+'FastText1_2017-07-29#10:47:46_test_res.pt']
    opt['stack_num'] = len(resmat)

    test_dataset = Stack_Dataset(resmat=resmat, test=True)
    test_loader = data.DataLoader(test_dataset,
                                  shuffle=False,
                                  batch_size=opt['batch_size'])

    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(opt)
    print model

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model,
                               model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'], \
                              name=opt['load_name'])

    if opt['device'] != None:
        torch.cuda.set_device(opt['device'])

    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')

    model.eval()
    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        resmat = batch
        resmat = [Variable(ii) for ii in resmat]
        if opt['cuda']:
            resmat = [ii.cuda() for ii in resmat]
        logit = model(resmat)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] +
                batch_size] = logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]

    if opt.get('save_resmat', False):
        torch.save(
            res, '{}/{}_{}_test_res.pt'.format(
                opt['result_dir'], opt['model'],
                datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))

    if opt.get('load_name', None) is None:
        write_result(lines,
                     model_dir=opt['model_dir'],
                     model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'], \
                          name=opt['load_name'], result_dir=opt['result_dir'])
コード例 #7
0
def train_stack(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)

    vis = Visualizer(opt['model'])
    logger = Logger()

    result_dir = '/home/dyj/'
    resmat = [(result_dir + 'RNN10_cal_res.pt', 10),\
              (result_dir + 'TextCNN10_char.pt', 10),\
              (result_dir + 'TextCNN10_top1.pt', 10),\
              (result_dir + 'TextCNN10_top1_char.pt', 10),\
              (result_dir + 'FastText10_res.pt', 10),\
              ('/mnt/result/results/TextCNN5_12h.pt', 5),\
              ('/mnt/result/results/RNN1_char.pt', 1)
              ]
    label = result_dir + 'label.pt'
    opt['stack_num'] = len(resmat)

    train_dataset = Stack_Dataset(resmat=resmat, label=label)
    train_loader = data.DataLoader(train_dataset,
                                   shuffle=True,
                                   batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(opt)
    print model

    if opt['use_self_loss']:
        Loss = getattr(models, opt['loss_function'])
    else:
        Loss = getattr(nn, opt['loss_function'])

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model,
                               model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'], \
                              name=opt['load_name'])

    if opt['device'] != None:
        torch.cuda.set_device(opt['device'])

    if opt['cuda']:
        model.cuda()

    loss_function = Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt['lr'])

    logger.info('Start running...')

    steps = 0
    model.train()
    for epoch in range(opt['base_epoch'] + 1, opt['epochs'] + 1):
        for i, batch in enumerate(train_loader, 1):
            resmat, label = batch[0:-1], batch[-1]
            resmat, label = [Variable(ii) for ii in resmat], Variable(label)
            if opt['cuda']:
                resmat, label = [ii.cuda() for ii in resmat], label.cuda()

            optimizer.zero_grad()
            logit = model(resmat)

            loss = loss_function(logit, label)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % opt['log_interval'] == 0:
                corrects = ((logit.data >
                             opt['threshold']) == (label.data).byte()).sum()
                accuracy = 100.0 * corrects / (opt['batch_size'] *
                                               opt['class_num'])
                log_info = 'Steps[{:>8}] (epoch[{:>2}] / batch[{:>5}]) - loss: {:.6f}, acc: {:.4f} % ({} / {})'.format( \
                                steps, epoch, i, loss.data[0], accuracy, \
                                corrects, opt['batch_size'] * opt['class_num'])
                logger.info(log_info)
                vis.plot('loss', loss.data[0])
                precision, recall, score = get_score(logit.data.cpu(),
                                                     label.data.cpu())
                logger.info('Precision {}, Recall {}, Score {}'.format(
                    precision, recall, score))
                vis.plot('score', score)
        logger.info('Training epoch {} finished!'.format(epoch))
        #save_model(model, model_dir=opt['model_dir'], model_name=opt['model'], epoch=epoch)
        if epoch == 3:
            for param_group in optimizer.param_groups:
                param_group['lr'] = opt['lr'] * opt['lr_decay']
    save_model(model,
               model_dir=opt['model_dir'],
               model_name=opt['model'],
               epoch=epoch)
コード例 #8
0
def test(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)

    logger = Logger()

    prefix = ''
    if opt['use_double_length']: prefix += '_2'
    print prefix
    if opt['use_char']:
        logger.info('Load char data starting...')
        opt['embed_num'] = opt['char_embed_num']
        embed_mat = np.load(opt['char_embed'])
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load char data finished!')
    elif opt['use_word']:
        logger.info('Load word data starting...')
        opt['embed_num'] = opt['word_embed_num']
        embed_mat = np.load(opt['word_embed'])
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load word data finished!')
    elif opt['use_char_word']:
        logger.info('Load char-word data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load char-word data finished!')
    elif opt['use_word_char']:
        logger.info('Load word-char data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load word-char data finished!')

    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])

    test_dataset = Dataset(test=True, title=test_title, desc=test_desc)
    test_loader = data.DataLoader(test_dataset,
                                  shuffle=False,
                                  batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(embed_mat, opt)

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model,
                               model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'], \
                              name=opt['load_name'])

    if opt['device'] != None:
        torch.cuda.set_device(opt['device'])

    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')

    model.eval()
    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        title, desc = batch
        title, desc = Variable(title), Variable(desc)
        if opt['cuda']:
            title, desc = title.cuda(), desc.cuda()
        logit = model(title, desc)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] +
                batch_size] = logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]

    if opt.get('save_resmat', False):
        torch.save(res, '{}/{}_test_res.pt'.format(opt['result_dir'],
                                                   opt['model']))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))

    if opt.get('load_name', None) is None:
        write_result(lines,
                     model_dir=opt['model_dir'],
                     model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'], \
                          name=opt['load_name'], result_dir=opt['result_dir'])
コード例 #9
0
def train(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)

    vis = Visualizer(opt['model'])
    logger = Logger()

    prefix = ''
    if opt['use_double_length']: prefix += '_2'
    print prefix
    if opt['use_char']:
        logger.info('Load char data starting...')
        opt['embed_num'] = opt['char_embed_num']
        embed_mat = np.load(opt['char_embed'])
        train_title = np.load(opt['train_title_char' + prefix])
        train_desc = np.load(opt['train_desc_char' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_char' + prefix])
        val_desc = np.load(opt['val_desc_char' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load char data finished!')
    elif opt['use_word']:
        logger.info('Load word data starting...')
        opt['embed_num'] = opt['word_embed_num']
        embed_mat = np.load(opt['word_embed'])
        train_title = np.load(opt['train_title_word' + prefix])
        train_desc = np.load(opt['train_desc_word' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_word' + prefix])
        val_desc = np.load(opt['val_desc_word' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load word data finished!')
    elif opt['use_char_word']:
        logger.info('Load char-word data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        train_title = np.load(opt['train_title_char' + prefix])
        train_desc = np.load(opt['train_desc_word' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_char' + prefix])
        val_desc = np.load(opt['val_desc_word' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load char-word data finished!')
    elif opt['use_word_char']:
        logger.info('Load word-char data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        train_title = np.load(opt['train_title_word' + prefix])
        train_desc = np.load(opt['train_desc_char' + prefix])
        train_label = np.load(opt['train_label'])
        val_title = np.load(opt['val_title_word' + prefix])
        val_desc = np.load(opt['val_desc_char' + prefix])
        val_label = np.load(opt['val_label'])
        logger.info('Load word-char data finished!')

    train_dataset = Dataset(title=train_title,
                            desc=train_desc,
                            label=train_label,
                            class_num=opt['class_num'])
    train_loader = data.DataLoader(train_dataset,
                                   shuffle=True,
                                   batch_size=opt['batch_size'])
    val_dataset = Dataset(title=val_title,
                          desc=val_desc,
                          label=val_label,
                          class_num=opt['class_num'])
    val_loader = data.DataLoader(val_dataset,
                                 shuffle=False,
                                 batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(embed_mat, opt)
    print model

    loss_weight = torch.ones(opt['class_num'])
    if opt['boost']:
        if opt['base_layer'] != 0:
            cal_res = torch.load('{}/{}/layer_{}_cal_res_3.pt'.format(
                opt['model_dir'], opt['model'], opt['base_layer']),
                                 map_location=lambda storage, loc: storage)
            logger.info('Load cal_res successful!')
            loss_weight = torch.load('{}/{}/layer_{}_loss_weight_3.pt'.format(
                opt['model_dir'], opt['model'], opt['base_layer'] + 1),
                                     map_location=lambda storage, loc: storage)
        else:
            cal_res = torch.zeros(opt['val_num'], opt['class_num'])
        print 'cur_layer:', opt['base_layer'] + 1, \
              'loss_weight:', loss_weight.mean(), loss_weight.max(), loss_weight.min(), loss_weight.std()

    if opt['use_self_loss']:
        Loss = getattr(models, opt['loss_function'])
    else:
        Loss = getattr(nn, opt['loss_function'])

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model,
                               model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'], \
                              name=opt['load_name'])

    if opt['cuda'] and opt['device'] != None:
        torch.cuda.set_device(opt['device'])

    if opt['cuda']:
        model.cuda()
        loss_weight = loss_weight.cuda()

    # import sys
    # precision, recall, score = eval(val_loader, model, opt, save_res=True)
    # print precision, recall, score
    # sys.exit()

    loss_function = Loss(weight=loss_weight + 1 - loss_weight.mean())
    optimizer = torch.optim.Adam(model.parameters(), lr=opt['lr'])

    logger.info('Start running...')

    steps = 0
    model.train()
    base_epoch = opt['base_epoch']
    for epoch in range(1, opt['epochs'] + 1):
        for i, batch in enumerate(train_loader, 0):
            title, desc, label = batch
            title, desc, label = Variable(title), Variable(desc), Variable(
                label).float()
            if opt['cuda']:
                title, desc, label = title.cuda(), desc.cuda(), label.cuda()

            optimizer.zero_grad()
            logit = model(title, desc)

            loss = loss_function(logit, label)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % opt['log_interval'] == 0:
                corrects = ((logit.data >
                             opt['threshold']) == (label.data).byte()).sum()
                accuracy = 100.0 * corrects / (opt['batch_size'] *
                                               opt['class_num'])
                log_info = 'Steps[{:>8}] (epoch[{:>2}] / batch[{:>5}]) - loss: {:.6f}, acc: {:.4f} % ({} / {})'.format( \
                                steps, epoch + base_epoch, (i+1), loss.data[0], accuracy, \
                                corrects, opt['batch_size'] * opt['class_num'])
                logger.info(log_info)
                vis.plot('loss', loss.data[0])
                precision, recall, score = eval(batch,
                                                model,
                                                opt,
                                                isBatch=True)
                vis.plot('score', score)
        logger.info('Training epoch {} finished!'.format(epoch + base_epoch))
        precision, recall, score = eval(val_loader, model, opt)
        log_info = 'Epoch[{}] - score: {:.6f} (precision: {:.4f}, recall: {:.4f})'.format( \
                            epoch + base_epoch, score, precision, recall)
        vis.log(log_info)
        save_model(model, model_dir=opt['model_dir'], model_name=opt['model'], \
                    epoch=epoch+base_epoch, score=score)
        if epoch + base_epoch == 2:
            model.opt['static'] = False
        elif epoch + base_epoch == 4:
            for param_group in optimizer.param_groups:
                param_group['lr'] = opt['lr'] * opt['lr_decay']
        elif epoch + base_epoch >= 5:
            if opt['boost']:
                res, truth = eval(val_loader, model, opt, return_res=True)
                ori_score = get_score(cal_res, truth)
                cal_res += res
                cur_score = get_score(cal_res, truth)
                logger.info('Layer {}: {}, Layer {}: {}'.format(
                    opt['base_layer'], ori_score, opt['base_layer'] + 1,
                    cur_score))
                loss_weight = get_loss_weight(cal_res, truth)
                torch.save(
                    cal_res, '{}/{}/layer_{}_cal_res_3.pt'.format(
                        opt['model_dir'], opt['model'], opt['base_layer'] + 1))
                logger.info('Save cal_res successful!')
                torch.save(
                    loss_weight, '{}/{}/layer_{}_loss_weight_3.pt'.format(
                        opt['model_dir'], opt['model'], opt['base_layer'] + 2))
            break