def testF1(iter):
    """Sweep a LightGBM setting over 100 multiples of ``iter`` and score each run.

    Loads the feature matrix from '../content_vec_withoutD.csv' and the labels
    from '../data/train.csv', re-weights the features with
    ``Bdc.cal_bdc_with_vec``, then treats the first 9447 rows as training data
    and the remaining rows (de-duplicated by ``content_id``) as the evaluation
    set. For every ``i`` in 1..100 it calls ``Lgb.cal_subject`` with
    ``i * iter`` and passes the result to ``GetResult.cal_F1``.

    Args:
        iter: Step size of the sweep; the value handed to ``Lgb.cal_subject``
            is ``(i + 1) * iter``. Presumably a boosting-round count -- confirm
            against ``Lgb.cal_subject``. The name shadows the ``iter`` builtin
            but is kept because it is part of the public signature.
    """
    # Row offset separating the training rows from the held-out rows.
    split = 9447

    train_vec = np.array(pd.read_csv('../content_vec_withoutD.csv', header=None))
    data = pd.read_csv('../data/train.csv')

    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
    subject_index = {name: idx for idx, name in enumerate(subject_vocab)}

    # Map every subject string to its vocabulary index. Labels that are not in
    # the vocabulary are skipped, exactly as the original linear scan did.
    # NOTE(review): a skipped label makes subject_list shorter than the feature
    # matrix and shifts every later row's label -- confirm the data never
    # contains out-of-vocabulary subjects.
    subject_list = []
    for subject in data['subject']:
        idx = subject_index.get(subject)
        if idx is not None:
            subject_list.append(idx)

    train_vec = Bdc.cal_bdc_with_vec(train_vec, subject_list, len(subject_vocab))

    test_id = list(data['content_id'])[split:]
    test_vec = train_vec[split:]

    # Keep only the first row seen for each content id. The companion set makes
    # the membership test O(1) instead of rescanning the result list.
    seen_ids = set()
    test_id_single = []
    test_vec_single = []
    for pos, content_id in enumerate(test_id):
        if content_id in seen_ids:
            continue
        seen_ids.add(content_id)
        test_id_single.append(content_id)
        test_vec_single.append(test_vec[pos])

    for step in range(1, 101):
        setting = step * iter
        print(setting)
        res_id, res_subject = Lgb.cal_subject(
            train_vec[:split],
            subject_list[:split],
            test_id_single,
            test_vec_single,
            setting,
        )
        # The original passed the same literal 9447 here; presumably it is the
        # split offset again -- confirm against GetResult.cal_F1.
        GetResult.cal_F1(res_id, res_subject, split)
def cvtest():
    """Run 5-fold LightGBM cross-validation on the subject labels and log RMSE.

    Loads the feature matrix from '../content_vec_withoutD.csv' and the labels
    from '../data/train.csv', re-weights the features with
    ``Bdc.cal_bdc_with_vec``, and then:

    * for each of the 10 subjects, cross-validates a one-vs-rest (0/1) label
      vector and writes ``"<rounds> <final rmse-mean>\\n"`` to '../res.txt1';
    * cross-validates once more on the raw subject indices and writes only the
      number of rounds (no trailing newline).

    The output file is opened before any data is loaded, as in the original,
    and is closed when the function exits, including on error.
    """
    params = {
        'boosting_type': 'gbdt',
        'num_leaves': 55,
        'reg_alpha': 0.1,
        'reg_lambda': 0,
        'max_depth': 15,
        'objective': 'binary',
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'learning_rate': 0.06,
        'min_child_weight': 1,
        'random_state': 20,
        'n_jobs': 4,
    }
    # Keyword arguments shared by every lgb.cv call below.
    cv_options = {
        'num_boost_round': 10000,
        'nfold': 5,
        'stratified': False,
        'shuffle': True,
        'metrics': 'rmse',
        'early_stopping_rounds': 50,
        'verbose_eval': 50,
        'show_stdv': True,
        'seed': 0,
    }

    # The context manager guarantees the results file is flushed and closed.
    with open('../res.txt1', 'w') as res:
        train_vec = np.array(pd.read_csv('../content_vec_withoutD.csv', header=None))
        data = pd.read_csv('../data/train.csv')

        subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观']
        subject_index = {name: idx for idx, name in enumerate(subject_vocab)}

        # Map every subject string to its vocabulary index; labels outside the
        # vocabulary are skipped, matching the original linear scan.
        # NOTE(review): a skipped label would misalign subject_list with the
        # rows of train_vec -- confirm the data has no such labels.
        subject_list = []
        for subject in data['subject']:
            idx = subject_index.get(subject)
            if idx is not None:
                subject_list.append(idx)

        print(train_vec)
        train_vec = Bdc.cal_bdc_with_vec(train_vec, subject_list, len(subject_vocab))
        print(train_vec)

        # One-vs-rest pass: one cross-validation run per subject index.
        for target in range(len(subject_vocab)):
            one_vs_rest = [1 if label == target else 0 for label in subject_list]
            data_train = lgb.Dataset(train_vec, one_vs_rest)
            clf = lgb.cv(params, data_train, **cv_options)
            rmse_mean = clf['rmse-mean']
            res.write(str(len(rmse_mean)) + ' ' + str(rmse_mean[-1]) + '\n')

        # Final pass on the raw subject indices.
        # NOTE(review): params still uses the 'binary' objective while these
        # labels range over 0..9 -- confirm this is intended.
        data_train = lgb.Dataset(train_vec, subject_list)
        clf = lgb.cv(params, data_train, **cv_options)
        res.write(str(len(clf['rmse-mean'])))