def add_group_feature(x_train, x_valid, x_test, x_sub):
    """Add group/word-count features to the train split and its companions.

    Each `fe` transform is fitted on ``x_train`` and applied consistently to
    the validation, test, and submission frames carried in ``extras``.

    Returns the transformed ``(x_train, x_valid, x_test, x_sub)``.
    """
    extras = [x_valid, x_test, x_sub]
    # Apply each feature transform in order; every call returns the updated
    # train frame plus the updated companion frames.
    for transform in (
        fe.goal_min_group,
        fe.goal_max_group,
        fe.duration_group,
        fe.text_to_word_count,
    ):
        x_train, extras = transform(x_train, extras)
    return x_train, extras[0], extras[1], extras[2]
def add_group_feature(x_train, x_valid, x_test, x_sub, y_train):
    """Add target-encoding and word-count features to all data splits.

    Fits single-column target encodings (country, category1, category2) on
    ``x_train``/``y_train``, then a combined multi-column target encoding
    over the same three columns, and finally a text word-count feature.
    Every transform is applied consistently to the validation, test, and
    submission frames.

    Returns the transformed ``(x_train, x_valid, x_test, x_sub)``.
    """
    others = [x_valid, x_test, x_sub]
    # Single source of truth for the encoded columns — the same list feeds
    # both the per-column encodings and the multi-column encoding below.
    encoding_cols = [
        'country_encoding',
        'category1_encoding',
        'category2_encoding',
    ]
    for col in encoding_cols:
        x_train, others = fe.target_encoding(x_train, y_train, others, col)
    x_train, others = fe.multi_target_encoding(
        x_train, y_train, others, encoding_cols)
    x_train, others = fe.text_to_word_count(x_train, others)
    return x_train, others[0], others[1], others[2]
# Cross-validated LightGBM baseline: word-count feature only.
# For each of the `cv` folds: build fold splits, derive the word-count
# feature, optionally tune hyper-parameters, train, evaluate, and persist
# params/model artifacts per fold.
target = 'state'
cv = 10
train_dfs, valid_dfs, test_dfs = training.cv(train_df, cv)
scores = list()
params = None
tune = True
name = 'lgb_baseline'
for cv_idx in range(cv):
    # -- prepare fold data --
    x_train, y_train = train_dfs[cv_idx][features], train_dfs[cv_idx][target]
    x_valid, y_valid = valid_dfs[cv_idx][features], valid_dfs[cv_idx][target]
    x_test, y_test = test_dfs[cv_idx][features], test_dfs[cv_idx][target]
    # Fit the word-count transform on the fold's train split and apply it
    # to valid/test and the held-out submission frame in one pass.
    x_train, others = fe.text_to_word_count(
        x_train, [x_valid, x_test, test_df[features]])
    x_valid, x_test, x_sub = others[0], others[1], others[2]
    # Columns 3-5 are treated as categorical by LightGBM; keep raw data so
    # the Dataset can be re-used by the tuner.
    lgb_train = lgb.Dataset(x_train, y_train,
                            categorical_feature=[3, 4, 5],
                            free_raw_data=False)
    lgb_valid = lgb.Dataset(x_valid, y_valid,
                            categorical_feature=[3, 4, 5],
                            free_raw_data=False)
    # -- train --
    if tune:
        params = training.tuning(lgb_train, lgb_valid, 100)
        pd.to_pickle(params, 'params/{0}_cv{1}.pkl'.format(name, cv_idx))
    model = training.train(lgb_train, lgb_valid, params)
    score = training.evaluation(model, x_test, y_test)
    scores.append(score)
    model.save_model('model/{0}_cv{1}.txt'.format(name, cv_idx),
                     num_iteration=model.best_iteration)
    # -- predict --
    # NOTE(review): `pred` is overwritten every fold, so only the last
    # fold's submission prediction survives this loop — confirm whether
    # fold predictions should be accumulated/averaged instead.
    pred = model.predict(x_sub)
# Cross-validated LightGBM run: word-count feature (HTML content kept via
# del_html_content=False) plus an h1 word-count feature.
# Per fold: build splits, derive both text features, optionally tune,
# train, evaluate, and persist params/model artifacts.
target = 'state'
cv = 10
train_dfs, valid_dfs, test_dfs = training.cv(train_df, cv)
scores = list()
params = None
tune = True
name = 'word_count_plus_h1_word_count'
for cv_idx in range(cv):
    # -- prepare fold data --
    x_train, y_train = train_dfs[cv_idx][features], train_dfs[cv_idx][target]
    x_valid, y_valid = valid_dfs[cv_idx][features], valid_dfs[cv_idx][target]
    x_test, y_test = test_dfs[cv_idx][features], test_dfs[cv_idx][target]
    # Word-count feature; HTML content is deliberately retained here so the
    # subsequent h1 extraction still has markup to work with.
    x_train, others = fe.text_to_word_count(
        x_train, [x_valid, x_test, test_df[features]],
        del_html_content=False)
    x_valid, x_test, x_sub = others[0], others[1], others[2]
    # h1-tag word-count feature, applied to all splits consistently.
    x_train, others = fe.text_to_h1_word_count(
        x_train, [x_valid, x_test, x_sub])
    x_valid, x_test, x_sub = others[0], others[1], others[2]
    # Columns 3-5 are treated as categorical by LightGBM; keep raw data so
    # the Dataset can be re-used by the tuner.
    lgb_train = lgb.Dataset(x_train, y_train,
                            categorical_feature=[3, 4, 5],
                            free_raw_data=False)
    lgb_valid = lgb.Dataset(x_valid, y_valid,
                            categorical_feature=[3, 4, 5],
                            free_raw_data=False)
    # -- train --
    if tune:
        params = training.tuning(lgb_train, lgb_valid, 100)
        pd.to_pickle(params, 'params/{0}_cv{1}.pkl'.format(name, cv_idx))
    model = training.train(lgb_train, lgb_valid, params)
    score = training.evaluation(model, x_test, y_test)
    scores.append(score)
    model.save_model('model/{0}_cv{1}.txt'.format(name, cv_idx),
                     num_iteration=model.best_iteration)
    # NOTE(review): unlike the baseline cell, no submission prediction is
    # produced here even though `x_sub` is built — confirm intent.