def test_boosting_type(boosting_type):
    """Fit on the small adult pool with ``boosting_type`` and canonize test predictions.

    The model is trained through ``fit_catboost_gpu`` with Logloss, then applied
    to the test pool; the resulting eval file is returned as canonical output.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    learn_pool = data_file('adult', 'train_small')
    test_pool = data_file('adult', 'test_small')
    column_description = data_file('adult', 'train.cd')

    fit_args = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def test_queryrmse(boosting_type, qwise_loss):
    """Fit a querywise loss on the querywise pool and canonize logs and predictions.

    Returns canonical files for the learn/test error logs and for the
    predictions produced by applying the model to the learn and test pools.
    """
    model_path = yatest.common.test_output_path('model.bin')
    test_err_log = yatest.common.test_output_path('test_error.tsv')
    learn_err_log = yatest.common.test_output_path('learn_error.tsv')
    learn_predictions = yatest.common.test_output_path('predictions_learn.tsv')
    test_predictions = yatest.common.test_output_path('predictions_test.tsv')

    learn_pool = data_file('querywise', 'train')
    column_description = data_file('querywise', 'train.cd')
    test_pool = data_file('querywise', 'test')

    fit_args = {
        "--loss-function": qwise_loss,
        "-f": learn_pool,
        "-t": test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '100',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        '--learn-err-log': learn_err_log,
        '--test-err-log': test_err_log,
        '--use-best-model': 'false',
    }
    fit_catboost_gpu(fit_args)

    # Apply to the learn pool first, then to the test pool.
    for pool, predictions in ((learn_pool, learn_predictions), (test_pool, test_predictions)):
        apply_catboost(model_path, pool, column_description, predictions)

    canonized = (learn_err_log, test_err_log, learn_predictions, test_predictions)
    return [local_canonical_file(path) for path in canonized]
def test_overfit_detector_inc_to_dec(boosting_type):
    """Fit with the ``IncToDec`` overfitting detector options and canonize test predictions.

    Training is requested for 2000 iterations with ``--od-type IncToDec``,
    ``--od-pval 0.5`` and ``--od-wait 2``; the trained model is then applied
    to the small adult test pool.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    column_description = data_file('adult', 'train.cd')
    test_pool = data_file('adult', 'test_small')
    learn_pool = data_file('adult', 'train_small')

    fit_args = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '2000',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        '-x': '1',
        '-n': '8',
        '-w': '0.5',
        '--od-pval': '0.5',
        '--od-type': 'IncToDec',
        '--od-wait': '2',
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def test_all_targets(loss_function, boosting_type):
    """Fit the small adult pool with ``loss_function`` and canonize test predictions."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    test_pool = data_file('adult', 'test_small')
    column_description = data_file('adult', 'train.cd')
    learn_pool = data_file('adult', 'train_small')

    # Positional command-line form: flag followed by its value.
    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', loss_function,
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
    )
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def run_catboost(eval_path, model_path, train, is_additional_query_weights):
    """Fit QueryRMSE on a quantized querywise pool, then apply the model to the test pool.

    ``train`` is the name of the quantized train file inside the ``querywise``
    data directory. When ``is_additional_query_weights`` is truthy, the
    learn/test group-weight files are passed to the fit command as well.
    Predictions are written to ``eval_path``.
    """
    quantized_learn_pool = 'quantized://' + data_file('querywise', train)
    cmd_args = [
        '--use-best-model', 'false',
        '--loss-function', 'QueryRMSE',
        '-f', quantized_learn_pool,
        '-i', '5',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
    ]
    if is_additional_query_weights:
        cmd_args.extend([
            '--learn-group-weights', data_file('querywise', 'train.group_weights'),
            '--test-group-weights', data_file('querywise', 'test.group_weights'),
        ])
    fit_catboost_gpu(cmd_args)

    apply_catboost(
        model_path,
        data_file('querywise', 'test'),
        data_file('querywise', 'train.cd.group_weight'),
        eval_path,
    )
def test_custom_priors(boosting_type):
    """Fit with explicit ``--ctr`` and ``--per-feature-ctr`` priors and canonize test predictions."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    test_pool = data_file('adult', 'test_small')
    column_description = data_file('adult', 'train.cd')
    learn_pool = data_file('adult', 'train_small')

    # Global CTR description: comma-separated CTR types, each with its priors.
    ctr_spec = (
        'Borders:Prior=-2:Prior=0:Prior=8/3:Prior=1:Prior=-1:Prior=3,'
        'FeatureFreq:Prior=0'
    )
    # Per-feature CTR description: ';'-separated "<feature index>:<ctr list>" entries.
    per_feature_ctr_spec = (
        '4:Borders:Prior=0.444,FeatureFreq:Prior=0.444;'
        '6:Borders:Prior=0.666,FeatureFreq:Prior=0.666;'
        '8:Borders:Prior=-0.888:Prior=2/3,FeatureFreq:Prior=-0.888:Prior=0.888'
    )

    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', 'Logloss',
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--ctr', ctr_spec,
        '--per-feature-ctr', per_feature_ctr_spec,
    )
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def run_catboost(eval_path, model_path, cd_file, is_additional_query_weights):
    """Fit QueryRMSE on the querywise train pool, then apply the model to the test pool.

    ``cd_file`` is the name of the column-description file inside the
    ``querywise`` data directory; it is used for both fitting and applying.
    When ``is_additional_query_weights`` is truthy, the learn/test
    group-weight files are passed to the fit command as well.
    Predictions are written to ``eval_path``.
    """
    column_description = data_file('querywise', cd_file)
    learn_pool = data_file('querywise', 'train')

    cmd_args = [
        '--use-best-model', 'false',
        '--loss-function', 'QueryRMSE',
        '-f', learn_pool,
        '--column-description', column_description,
        '-i', '5',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
    ]
    if is_additional_query_weights:
        cmd_args.extend([
            '--learn-group-weights', data_file('querywise', 'train.group_weights'),
            '--test-group-weights', data_file('querywise', 'test.group_weights'),
        ])
    fit_catboost_gpu(cmd_args)

    test_pool = data_file('querywise', 'test')
    apply_catboost(model_path, test_pool, column_description, eval_path)
def test_pairs_generation_with_max_pairs():
    """Fit ``PairLogit:max_pairs=30`` on the querywise pool and canonize logs and predictions.

    Returns canonical files (each compared with ``diff_tool()``) for the
    learn/test error logs and the learn/test predictions.
    """
    model_path = yatest.common.test_output_path('model.bin')
    test_err_log = yatest.common.test_output_path('test_error.tsv')
    learn_err_log = yatest.common.test_output_path('learn_error.tsv')
    learn_predictions = yatest.common.test_output_path('predictions_learn.tsv')
    test_predictions = yatest.common.test_output_path('predictions_test.tsv')

    column_description = data_file('querywise', 'train.cd')
    learn_pool = data_file('querywise', 'train')
    test_pool = data_file('querywise', 'test')

    fit_args = [
        '--loss-function', 'PairLogit:max_pairs=30',
        '--eval-metric', 'PairAccuracy',
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--l2-leaf-reg', '0',
        '-i', '20',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--learn-err-log', learn_err_log,
        '--test-err-log', test_err_log,
        '--use-best-model', 'false',
    ]
    fit_catboost_gpu(fit_args)

    # Apply to the learn pool first, then to the test pool.
    for pool, predictions in ((learn_pool, learn_predictions), (test_pool, test_predictions)):
        apply_catboost(model_path, pool, column_description, predictions)

    canonized = (learn_err_log, test_err_log, learn_predictions, test_predictions)
    return [local_canonical_file(path, diff_tool=diff_tool()) for path in canonized]
def test_allow_writing_files_and_used_ram_limit(boosting_type, used_ram_limit):
    """Fit airlines_5K with file writing disabled and a RAM limit, then canonize predictions.

    The fit command receives ``--allow-writing-files false`` and
    ``--used-ram-limit used_ram_limit``. The pools are read with
    ``--has-header``, and the model is applied with ``has_header=True``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    column_description = data_file('airlines_5K', 'cd')
    learn_pool = data_file('airlines_5K', 'train')
    fit_test_pool = data_file('airlines_5K', 'test')

    fit_args = (
        '--use-best-model', 'false',
        '--allow-writing-files', 'false',
        '--used-ram-limit', used_ram_limit,
        '--loss-function', 'Logloss',
        '--max-ctr-complexity', '8',
        '--depth', '10',
        '-f', learn_pool,
        '-t', fit_test_pool,
        '--column-description', column_description,
        '--has-header',
        '--boosting-type', boosting_type,
        '-i', '20',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
    )
    fit_catboost_gpu(fit_args)

    # The apply step writes its predictions to the same eval path given to fit.
    apply_test_pool = data_file('airlines_5K', 'test')
    apply_catboost(model_path, apply_test_pool, column_description, eval_path, has_header=True)
    return [local_canonical_file(eval_path)]
def test_bootstrap(boosting_type):
    """Check that three bootstrap configurations yield identical test predictions.

    One model is fit per bootstrap configuration (``No``, ``Bayesian`` with
    ``--bagging-temperature 0.0``, ``Bernoulli`` with ``--subsample 1.0``) and
    applied to the small adult test pool. The ``bayes`` and ``bernoulli`` eval
    files must match the ``no`` eval file, which is returned as the canonical
    output.
    """
    bootstrap_option = {
        'no': {
            '--bootstrap-type': 'No',
        },
        'bayes': {
            '--bootstrap-type': 'Bayesian',
            '--bagging-temperature': '0.0',
        },
        'bernoulli': {
            '--bootstrap-type': 'Bernoulli',
            '--subsample': '1.0',
        },
    }

    test_file = data_file('adult', 'test_small')
    cd_file = data_file('adult', 'train.cd')

    params = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '--random-strength': '0',
        '-f': data_file('adult', 'train_small'),
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
    }

    for bootstrap in bootstrap_option:
        model_path = yatest.common.test_output_path('model_' + bootstrap + '.bin')
        eval_path = yatest.common.test_output_path('test_' + bootstrap + '.eval')
        model_option = {'-m': model_path}

        run_params = combine_dicts(params, bootstrap_option[bootstrap], model_option)

        fit_catboost_gpu(run_params)
        apply_catboost(model_path, test_file, cd_file, eval_path)

    ref_eval_path = yatest.common.test_output_path('test_no.eval')
    # shallow=False forces a content comparison. With the default shallow=True,
    # filecmp.cmp may report equality from os.stat() signatures alone (same size
    # and mtime), which could hide a real difference between eval files.
    assert filecmp.cmp(ref_eval_path, yatest.common.test_output_path('test_bayes.eval'), shallow=False)
    assert filecmp.cmp(ref_eval_path, yatest.common.test_output_path('test_bernoulli.eval'), shallow=False)

    return [local_canonical_file(ref_eval_path)]
def test_ctr_type(ctr_type, boosting_type):
    """Fit RMSE on the adult_crossentropy pool with ``--ctr ctr_type`` and canonize predictions."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    column_description = data_file('adult_crossentropy', 'train.cd')
    test_pool = data_file('adult_crossentropy', 'test_proba')
    learn_pool = data_file('adult_crossentropy', 'train_proba')

    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', 'RMSE',
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--boosting-type', boosting_type,
        '-i', '3',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--ctr', ctr_type,
    )
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def test_train_on_binarized_equal_train_on_float(boosting_type, qwise_loss):
    """Check that refitting with the borders from a first fit reproduces its predictions.

    The first fit writes its borders to ``borders.tsv.out``. The second fit
    reads that file through ``--input-borders-file`` and writes its own borders
    to ``borders.tsv``. Learn and test predictions of both models must be
    identical. Error logs, the first model's predictions and the second fit's
    borders file are returned as canonical output.
    """
    output_model_path = yatest.common.test_output_path('model.bin')
    output_model_path_binarized = yatest.common.test_output_path('model_binarized.bin')

    test_error_path = yatest.common.test_output_path('test_error.tsv')
    learn_error_path = yatest.common.test_output_path('learn_error.tsv')

    borders_file = yatest.common.test_output_path('borders.tsv')
    borders_file_output = borders_file + '.out'
    predictions_path_learn = yatest.common.test_output_path('predictions_learn.tsv')
    predictions_path_learn_binarized = yatest.common.test_output_path('predictions_learn_binarized.tsv')
    predictions_path_test = yatest.common.test_output_path('predictions_test.tsv')
    predictions_path_test_binarized = yatest.common.test_output_path('predictions_test_binarized.tsv')

    learn_file = data_file('querywise', 'train')
    cd_file = data_file('querywise', 'train.cd')
    test_file = data_file('querywise', 'test')

    params = {
        "--loss-function": qwise_loss,
        "-f": learn_file,
        "-t": test_file,
        '--column-description': cd_file,
        '--boosting-type': boosting_type,
        '-i': '100',
        '-T': '4',
        '-r': '0',
        '-m': output_model_path,
        '--learn-err-log': learn_error_path,
        '--test-err-log': test_error_path,
        '--use-best-model': 'false',
        '--output-borders-file': borders_file_output,
    }

    # Second run: same options, but borders are read from the first run's output.
    params_binarized = dict(params)
    params_binarized['--input-borders-file'] = borders_file_output
    params_binarized['--output-borders-file'] = borders_file
    params_binarized['-m'] = output_model_path_binarized

    fit_catboost_gpu(params)
    apply_catboost(output_model_path, learn_file, cd_file, predictions_path_learn)
    apply_catboost(output_model_path, test_file, cd_file, predictions_path_test)

    # learn_error_path and test_error_path already exist after first fit_catboost_gpu() call
    # and would be automatically marked as input_data for YT operation,
    # which will lead to error, because input files are available only for reading.
    # That's why we explicitly drop files from input_data and implicitly add them to output_data.
    fit_catboost_gpu(params_binarized, input_data={learn_error_path: None, test_error_path: None})

    apply_catboost(output_model_path_binarized, learn_file, cd_file, predictions_path_learn_binarized)
    apply_catboost(output_model_path_binarized, test_file, cd_file, predictions_path_test_binarized)

    # shallow=False forces a content comparison. With the default shallow=True,
    # filecmp.cmp may report equality from os.stat() signatures alone (same size
    # and mtime), which could hide a real difference between prediction files.
    assert filecmp.cmp(predictions_path_learn, predictions_path_learn_binarized, shallow=False)
    assert filecmp.cmp(predictions_path_test, predictions_path_test_binarized, shallow=False)

    return [
        local_canonical_file(learn_error_path),
        local_canonical_file(test_error_path),
        local_canonical_file(predictions_path_test),
        local_canonical_file(predictions_path_learn),
        local_canonical_file(borders_file),
    ]
def test_weighted_pool_leaf_estimation_method(boosting_type, leaf_estimation):
    """Fit the adult_weight pool with ``--leaf-estimation-method`` and canonize predictions."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    column_description = data_file('adult_weight', 'train.cd')
    test_pool = data_file('adult_weight', 'test_weight')
    learn_pool = data_file('adult_weight', 'train_weight')

    fit_args = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-T': '4',
        '-r': '0',
        '--leaf-estimation-method': leaf_estimation,
        '-m': model_path,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def test_crossentropy(boosting_type):
    """Fit ``CrossEntropy`` on the adult_crossentropy pool and canonize test predictions."""
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    column_description = data_file('adult_crossentropy', 'train.cd')
    test_pool = data_file('adult_crossentropy', 'test_proba')
    learn_pool = data_file('adult_crossentropy', 'train_proba')

    fit_args = {
        '--loss-function': 'CrossEntropy',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def test_nan_mode(nan_mode, boosting_type):
    """Fit the adult_nan pool with ``--nan-mode nan_mode`` and canonize test predictions.

    No ``--loss-function`` is passed to the fit command in this test.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    test_pool = data_file('adult_nan', 'test_small')
    column_description = data_file('adult_nan', 'train.cd')
    learn_pool = data_file('adult_nan', 'train_small')

    fit_args = {
        '--use-best-model': 'false',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '20',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        '--nan-mode': nan_mode,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]
def test_fold_len_mult():
    """Fit with ``--fold-len-multiplier 1.2`` under Ordered boosting and canonize predictions.

    The model is trained on the adult_not_binarized pool and applied to its
    small test pool; the resulting eval file is the canonical output.
    """
    output_model_path = yatest.common.test_output_path('model.bin')
    output_eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult_not_binarized', 'train.cd')
    test_file = data_file('adult_not_binarized', 'test_small')

    params = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': data_file('adult_not_binarized', 'train_small'),
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': 'Ordered',
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        # Passed as a string like every other command-line value in this file;
        # '1.2' is exactly str(1.2), so the resulting argument is unchanged.
        '--fold-len-multiplier': '1.2',
        '-m': output_model_path,
    }
    fit_catboost_gpu(params)

    apply_catboost(output_model_path, test_file, cd_file, output_eval_path)
    return [local_canonical_file(output_eval_path)]
def test_quantized_pool(loss_function, boosting_type):
    """Fit on quantized adult pools and canonize predictions on the TSV test pool.

    Training reads ``train.qbin`` / ``test.qbin`` through the ``quantized://``
    scheme; the model is then applied to ``test_small.tsv`` using ``pool.cd``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    scheme = 'quantized://'
    learn_pool = scheme + data_file('quantized_adult', 'train.qbin')
    test_pool = scheme + data_file('quantized_adult', 'test.qbin')

    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', loss_function,
        '-f', learn_pool,
        '-t', test_pool,
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
    )
    fit_catboost_gpu(fit_args)

    # The apply step uses the non-quantized test pool with its column description.
    column_description = data_file('quantized_adult', 'pool.cd')
    raw_test_pool = data_file('quantized_adult', 'test_small.tsv')
    apply_catboost(model_path, raw_test_pool, column_description, eval_path)
    return [local_canonical_file(eval_path)]