def run(train_file, test_file, predict_res_file, index_file=''):
    """Train an xgboost model on ``train_file`` and write predictions for ``test_file``.

    The two CSV files are loaded, passed through ``compute_nan_feat``,
    concatenated so the category-combination features are built over both
    sets together, split apart again on whether ``target`` is null, and then
    fed to xgboost. Predictions are written to ``predict_res_file`` as a CSV
    indexed by ``ID`` with a single ``PredictedProb`` column.

    NOTE(review): this file appears to define ``run`` a second time further
    down with the same logic; if so, the later definition is the one bound
    to the name after the module is loaded.

    Args:
        train_file: Path of the training CSV. It must provide the ``ID`` and
            ``target`` columns that are dropped/read below.
        test_file: Path of the CSV to score. When the path contains the
            substring ``'train'`` the run is treated as a stacking run: the
            ``target`` column is dropped from the loaded frame and
            ``add_cate_comb_reindex`` is used instead of ``add_cate_comb``.
        predict_res_file: Path the prediction CSV is written to.
        index_file: Forwarded to ``add_cate_comb_reindex``; only used in the
            stacking case.
    """
    # Columns excluded from the feature matrix for both train and test.
    # (Named "high_corr"; presumably highly correlated with other columns --
    # the selection itself is not derived in this function.)
    high_corr_columns = [
        'v8', 'v23', 'v25', 'v36', 'v37', 'v46', 'v51', 'v53', 'v54',
        'v63', 'v73', 'v81', 'v82', 'v89', 'v92', 'v95', 'v105', 'v107',
        'v108', 'v109', 'v116', 'v117', 'v118', 'v119', 'v123', 'v124',
        'v128',
    ]
    train_columns_to_drop = ['ID', 'target'] + high_corr_columns
    print(train_columns_to_drop)
    test_columns_to_drop = ['ID'] + high_corr_columns

    # Number of boosting rounds (a tiny value such as 2 is handy for a
    # quick smoke run).
    xgb_num_rounds = 1450

    # Stacking runs score a file whose name contains 'train'.
    is_stacking = 'train' in test_file

    print('load data')
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    if is_stacking:
        # The split below relies on test rows having a null target, so a
        # target column present in the file to be scored must go first.
        test = test.drop(['target'], axis=1)

    train = compute_nan_feat(train)
    test = compute_nan_feat(test)

    ################################################
    # add category combination feat
    train_test = pd.concat([train, test])
    if is_stacking:
        train_test = add_cate_comb_reindex(train_test, index_file)
    else:
        train_test = add_cate_comb(train_test)

    # Rows with a target are training rows; rows whose target is null after
    # the concat are the rows to score.
    train = train_test[train_test.target.notnull()]
    test = train_test[train_test.target.isnull()]
    test = test.drop(['target'], axis=1)

    train_feat = train.drop(train_columns_to_drop, axis=1)
    test_feat = test.drop(test_columns_to_drop, axis=1)

    # Return value is ignored, so this is expected to encode both frames in
    # place -- TODO confirm against factorize_category_both.
    factorize_category_both(train_feat, test_feat)
    train_feat.fillna(-1, inplace=True)
    test_feat.fillna(-1, inplace=True)

    xgtrain = xgb.DMatrix(train_feat, train['target'].values)
    xgtest = xgb.DMatrix(test_feat)

    # get the parameters for xgboost
    plst = get_params()
    print(plst)

    # train model
    model = xgb.train(plst, xgtrain, xgb_num_rounds)
    # NOTE(review): ntree_limit is fed best_iteration although no early
    # stopping is configured here; confirm whether best_ntree_limit (or no
    # limit at all) was intended.
    test_preds = model.predict(xgtest, ntree_limit=model.best_iteration)

    preds_out = pd.DataFrame({
        "ID": test['ID'].values,
        "PredictedProb": test_preds,
    })
    preds_out = preds_out.set_index('ID')
    preds_out.to_csv(predict_res_file)
    print('finish')
# Script fragment: load the fixed ./data CSV files, build the NaN and
# category-combination features over train+test together, split the rows
# back apart, apply the v22 handling and prepare the training feature frame.
# NOTE(review): `train_columns_to_drop` is not defined in this fragment and
# must already be in scope where it runs.
xgb_num_rounds = 1350
num_classes = 2

print('load data')
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/sample_submission.csv')

train = compute_nan_feat(train)
test = compute_nan_feat(test)

################################################
# add category combination feat
train_test = pd.concat([train, test])
train_test = add_cate_comb(train_test)
# Rows with a target are training rows; rows whose target is null after the
# concat are treated as the test rows.
train = train_test[train_test.target.notnull()]
test = train_test[train_test.target.isnull()]
test = test.drop(['target'], axis=1)

################################################
# v22 feat(base64)
train = handle_v22(train)
test = handle_v22(test)

train_id = train['ID'].values
train_target = train['target'].values
train_feat = train.drop(train_columns_to_drop, axis=1)
def run(train_file, test_file, predict_res_file, index_file=''):
    """Train an xgboost model on ``train_file`` and write predictions for ``test_file``.

    The two CSV files are loaded, passed through ``compute_nan_feat``,
    concatenated so the category-combination features are built over both
    sets together, split apart again on whether ``target`` is null, and then
    fed to xgboost. Predictions are written to ``predict_res_file`` as a CSV
    indexed by ``ID`` with a single ``PredictedProb`` column.

    NOTE(review): an earlier definition of ``run`` with the same logic
    appears above in this file; being the later one, this definition is the
    one bound to the name after the module is loaded.

    Args:
        train_file: Path of the training CSV. It must provide the ``ID`` and
            ``target`` columns that are dropped/read below.
        test_file: Path of the CSV to score. When the path contains the
            substring ``'train'`` the run is treated as a stacking run: the
            ``target`` column is dropped from the loaded frame and
            ``add_cate_comb_reindex`` is used instead of ``add_cate_comb``.
        predict_res_file: Path the prediction CSV is written to.
        index_file: Forwarded to ``add_cate_comb_reindex``; only used in the
            stacking case.
    """
    # Columns excluded from the feature matrix for both train and test.
    # (Named "high_corr"; presumably highly correlated with other columns --
    # the selection itself is not derived in this function.)
    high_corr_columns = [
        'v8', 'v23', 'v25', 'v36', 'v37', 'v46', 'v51', 'v53', 'v54',
        'v63', 'v73', 'v81', 'v82', 'v89', 'v92', 'v95', 'v105', 'v107',
        'v108', 'v109', 'v116', 'v117', 'v118', 'v119', 'v123', 'v124',
        'v128',
    ]
    train_columns_to_drop = ['ID', 'target'] + high_corr_columns
    print(train_columns_to_drop)
    test_columns_to_drop = ['ID'] + high_corr_columns

    # Number of boosting rounds (a tiny value such as 2 is handy for a
    # quick smoke run).
    xgb_num_rounds = 1450

    # Stacking runs score a file whose name contains 'train'.
    is_stacking = 'train' in test_file

    print('load data')
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    if is_stacking:
        # The split below relies on test rows having a null target, so a
        # target column present in the file to be scored must go first.
        test = test.drop(['target'], axis=1)

    train = compute_nan_feat(train)
    test = compute_nan_feat(test)

    ################################################
    # add category combination feat
    train_test = pd.concat([train, test])
    if is_stacking:
        train_test = add_cate_comb_reindex(train_test, index_file)
    else:
        train_test = add_cate_comb(train_test)

    # Rows with a target are training rows; rows whose target is null after
    # the concat are the rows to score.
    train = train_test[train_test.target.notnull()]
    test = train_test[train_test.target.isnull()]
    test = test.drop(['target'], axis=1)

    train_feat = train.drop(train_columns_to_drop, axis=1)
    test_feat = test.drop(test_columns_to_drop, axis=1)

    # Return value is ignored, so this is expected to encode both frames in
    # place -- TODO confirm against factorize_category_both.
    factorize_category_both(train_feat, test_feat)
    train_feat.fillna(-1, inplace=True)
    test_feat.fillna(-1, inplace=True)

    xgtrain = xgb.DMatrix(train_feat, train['target'].values)
    xgtest = xgb.DMatrix(test_feat)

    # get the parameters for xgboost
    plst = get_params()
    print(plst)

    # train model
    model = xgb.train(plst, xgtrain, xgb_num_rounds)
    # NOTE(review): ntree_limit is fed best_iteration although no early
    # stopping is configured here; confirm whether best_ntree_limit (or no
    # limit at all) was intended.
    test_preds = model.predict(xgtest, ntree_limit=model.best_iteration)

    preds_out = pd.DataFrame({
        "ID": test['ID'].values,
        "PredictedProb": test_preds,
    })
    preds_out = preds_out.set_index('ID')
    preds_out.to_csv(predict_res_file)
    print('finish')