Beispiel #1
0
def test_boosting_type(boosting_type):
    """Fit a Logloss model on the small adult pool with ``boosting_type``.

    The fitted model is applied to the test pool and the resulting eval
    file is returned wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    learn_pool = data_file('adult', 'train_small')
    test_pool = data_file('adult', 'test_small')
    column_description = data_file('adult', 'train.cd')

    fit_args = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        # Input pools and their column description.
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        # The option under test.
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #2
0
def test_queryrmse(boosting_type, qwise_loss):
    """Fit a querywise model with ``qwise_loss`` and canonize its outputs.

    Returns the learn/test error logs and the predictions for the learn
    and test pools, each wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    test_err_log = yatest.common.test_output_path('test_error.tsv')
    learn_err_log = yatest.common.test_output_path('learn_error.tsv')
    learn_predictions = yatest.common.test_output_path('predictions_learn.tsv')
    test_predictions = yatest.common.test_output_path('predictions_test.tsv')

    learn_pool = data_file('querywise', 'train')
    column_description = data_file('querywise', 'train.cd')
    test_pool = data_file('querywise', 'test')

    fit_args = {
        '--loss-function': qwise_loss,
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '100',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        '--learn-err-log': learn_err_log,
        '--test-err-log': test_err_log,
        '--use-best-model': 'false',
    }
    fit_catboost_gpu(fit_args)

    # Apply the fitted model to both pools so that both prediction files exist.
    apply_catboost(model_path, learn_pool, column_description, learn_predictions)
    apply_catboost(model_path, test_pool, column_description, test_predictions)

    canonized_paths = (
        learn_err_log,
        test_err_log,
        learn_predictions,
        test_predictions,
    )
    return [local_canonical_file(path) for path in canonized_paths]
Beispiel #3
0
def test_overfit_detector_inc_to_dec(boosting_type):
    """Fit on the small adult pool with the ``IncToDec`` ``--od-type`` options.

    The fitted model is applied to the test pool and the eval file is
    returned wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    column_description = data_file('adult', 'train.cd')
    test_pool = data_file('adult', 'test_small')
    learn_pool = data_file('adult', 'train_small')

    fit_args = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '2000',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        '-x': '1',
        '-n': '8',
        '-w': '0.5',
        # Options selecting and tuning the detector under test.
        '--od-pval': '0.5',
        '--od-type': 'IncToDec',
        '--od-wait': '2',
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #4
0
def test_all_targets(loss_function, boosting_type):
    """Fit on the small adult pool with ``loss_function`` and ``boosting_type``.

    The fit arguments are passed as a flat tuple of option/value strings.
    The eval file produced by ``apply_catboost`` is returned wrapped in
    ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    test_pool = data_file('adult', 'test_small')
    column_description = data_file('adult', 'train.cd')
    learn_pool = data_file('adult', 'train_small')

    # One option/value pair per line; the flat tuple order is unchanged.
    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', loss_function,
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
    )
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #5
0
 def run_catboost(eval_path, model_path, train,
                  is_additional_query_weights):
     """Fit QueryRMSE on a quantized querywise pool, then apply the model.

     ``train`` names the file inside the ``querywise`` data directory that
     is passed to the fit with a ``quantized://`` prefix. When
     ``is_additional_query_weights`` is truthy, the separate group-weight
     files are added to the fit arguments. The model is then applied to
     the ``test`` pool and predictions are written to ``eval_path``.
     """
     quantized_learn_pool = 'quantized://' + data_file('querywise', train)
     args = [
         '--use-best-model', 'false',
         '--loss-function', 'QueryRMSE',
         '-f', quantized_learn_pool,
         '-i', '5',
         '-T', '4',
         '-r', '0',
         '-m', model_path,
         '--eval-file', eval_path,
     ]
     if is_additional_query_weights:
         learn_weights = data_file('querywise', 'train.group_weights')
         test_weights = data_file('querywise', 'test.group_weights')
         args.extend([
             '--learn-group-weights', learn_weights,
             '--test-group-weights', test_weights,
         ])
     fit_catboost_gpu(args)

     test_pool = data_file('querywise', 'test')
     column_description = data_file('querywise', 'train.cd.group_weight')
     apply_catboost(model_path, test_pool, column_description, eval_path)
Beispiel #6
0
def test_custom_priors(boosting_type):
    """Fit on the small adult pool with explicit ``--ctr`` / ``--per-feature-ctr`` values.

    The eval file produced by applying the fitted model to the test pool
    is returned wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    test_pool = data_file('adult', 'test_small')
    column_description = data_file('adult', 'train.cd')
    learn_pool = data_file('adult', 'train_small')

    # Adjacent literals are concatenated into a single option value each.
    ctr_description = (
        'Borders:Prior=-2:Prior=0:Prior=8/3:Prior=1:Prior=-1:Prior=3,'
        'FeatureFreq:Prior=0'
    )
    per_feature_ctr_description = (
        '4:Borders:Prior=0.444,FeatureFreq:Prior=0.444;'
        '6:Borders:Prior=0.666,FeatureFreq:Prior=0.666;'
        '8:Borders:Prior=-0.888:Prior=2/3,FeatureFreq:Prior=-0.888:Prior=0.888'
    )

    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', 'Logloss',
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--ctr', ctr_description,
        '--per-feature-ctr', per_feature_ctr_description,
    )
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #7
0
 def run_catboost(eval_path, model_path, cd_file,
                  is_additional_query_weights):
     """Fit QueryRMSE on the querywise pool with ``cd_file``, then apply the model.

     ``cd_file`` names a column-description file inside the ``querywise``
     data directory; the same file is used for both fit and apply. When
     ``is_additional_query_weights`` is truthy, the separate group-weight
     files are added to the fit arguments. Predictions for the ``test``
     pool are written to ``eval_path``.
     """
     column_description = data_file('querywise', cd_file)
     learn_pool = data_file('querywise', 'train')
     args = [
         '--use-best-model', 'false',
         '--loss-function', 'QueryRMSE',
         '-f', learn_pool,
         '--column-description', column_description,
         '-i', '5',
         '-T', '4',
         '-r', '0',
         '-m', model_path,
         '--eval-file', eval_path,
     ]
     if is_additional_query_weights:
         learn_weights = data_file('querywise', 'train.group_weights')
         test_weights = data_file('querywise', 'test.group_weights')
         args.extend([
             '--learn-group-weights', learn_weights,
             '--test-group-weights', test_weights,
         ])
     fit_catboost_gpu(args)

     test_pool = data_file('querywise', 'test')
     apply_catboost(model_path, test_pool, column_description, eval_path)
Beispiel #8
0
def test_pairs_generation_with_max_pairs():
    """Fit ``PairLogit:max_pairs=30`` on the querywise pool and canonize outputs.

    Returns the learn/test error logs and the learn/test predictions,
    each wrapped in ``local_canonical_file`` with a fresh ``diff_tool()``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    test_err_log = yatest.common.test_output_path('test_error.tsv')
    learn_err_log = yatest.common.test_output_path('learn_error.tsv')
    learn_predictions = yatest.common.test_output_path('predictions_learn.tsv')
    test_predictions = yatest.common.test_output_path('predictions_test.tsv')

    column_description = data_file('querywise', 'train.cd')
    learn_pool = data_file('querywise', 'train')
    test_pool = data_file('querywise', 'test')

    # One option/value pair per line; the flat list order is unchanged.
    fit_args = [
        '--loss-function', 'PairLogit:max_pairs=30',
        '--eval-metric', 'PairAccuracy',
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--l2-leaf-reg', '0',
        '-i', '20',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--learn-err-log', learn_err_log,
        '--test-err-log', test_err_log,
        '--use-best-model', 'false',
    ]
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, learn_pool, column_description, learn_predictions)
    apply_catboost(model_path, test_pool, column_description, test_predictions)

    canonized_paths = (
        learn_err_log,
        test_err_log,
        learn_predictions,
        test_predictions,
    )
    return [
        local_canonical_file(path, diff_tool=diff_tool())
        for path in canonized_paths
    ]
Beispiel #9
0
def test_allow_writing_files_and_used_ram_limit(boosting_type, used_ram_limit):
    """Fit on ``airlines_5K`` with ``--allow-writing-files false`` and a RAM limit.

    ``used_ram_limit`` is passed as the value of ``--used-ram-limit``.
    After the fit, the model is applied to the test pool with
    ``has_header=True`` and the eval file is returned wrapped in
    ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    column_description = data_file('airlines_5K', 'cd')
    learn_pool = data_file('airlines_5K', 'train')
    fit_test_pool = data_file('airlines_5K', 'test')

    fit_args = (
        '--use-best-model', 'false',
        # The two options under test.
        '--allow-writing-files', 'false',
        '--used-ram-limit', used_ram_limit,
        '--loss-function', 'Logloss',
        '--max-ctr-complexity', '8',
        '--depth', '10',
        '-f', learn_pool,
        '-t', fit_test_pool,
        '--column-description', column_description,
        # A flag without a value.
        '--has-header',
        '--boosting-type', boosting_type,
        '-i', '20',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--eval-file', eval_path,
    )
    fit_catboost_gpu(fit_args)

    apply_test_pool = data_file('airlines_5K', 'test')
    apply_catboost(model_path, apply_test_pool, column_description,
                   eval_path, has_header=True)

    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #10
0
def test_bootstrap(boosting_type):
    """Check that three bootstrap configurations produce identical predictions.

    A model is fitted and applied once per entry of the bootstrap table
    below. The ``bayes`` and ``bernoulli`` eval files are then compared
    against the ``no`` eval file with ``filecmp.cmp``. The ``no`` eval
    file is returned wrapped in ``local_canonical_file``.
    """
    # Extra fit options per configuration; the key is also used in file names.
    extra_options_by_name = {
        'no': {'--bootstrap-type': 'No'},
        'bayes': {
            '--bootstrap-type': 'Bayesian',
            '--bagging-temperature': '0.0',
        },
        'bernoulli': {
            '--bootstrap-type': 'Bernoulli',
            '--subsample': '1.0',
        },
    }

    test_pool = data_file('adult', 'test_small')
    column_description = data_file('adult', 'train.cd')
    learn_pool = data_file('adult', 'train_small')

    # Options shared by every run.
    common_args = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '--random-strength': '0',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
    }

    for name, extra_options in extra_options_by_name.items():
        model_path = yatest.common.test_output_path('model_' + name + '.bin')
        eval_path = yatest.common.test_output_path('test_' + name + '.eval')

        run_args = combine_dicts(common_args, extra_options,
                                 {'-m': model_path})
        fit_catboost_gpu(run_args)
        apply_catboost(model_path, test_pool, column_description, eval_path)

    reference_eval = yatest.common.test_output_path('test_no.eval')
    for other_eval_name in ('test_bayes.eval', 'test_bernoulli.eval'):
        other_eval = yatest.common.test_output_path(other_eval_name)
        assert filecmp.cmp(reference_eval, other_eval)

    return [local_canonical_file(reference_eval)]
Beispiel #11
0
def test_ctr_type(ctr_type, boosting_type):
    """Fit RMSE on the ``adult_crossentropy`` pool with ``--ctr`` set to ``ctr_type``.

    The eval file produced by applying the fitted model to the test pool
    is returned wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    column_description = data_file('adult_crossentropy', 'train.cd')
    test_pool = data_file('adult_crossentropy', 'test_proba')
    learn_pool = data_file('adult_crossentropy', 'train_proba')

    # One option/value pair per line; the flat tuple order is unchanged.
    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', 'RMSE',
        '-f', learn_pool,
        '-t', test_pool,
        '--column-description', column_description,
        '--boosting-type', boosting_type,
        '-i', '3',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
        '--ctr', ctr_type,
    )
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #12
0
def test_train_on_binarized_equal_train_on_float(boosting_type, qwise_loss):
    """Check that refitting with the saved borders file gives equal predictions.

    The first fit writes a borders file. The second fit reads it via
    ``--input-borders-file`` and writes its own model and borders file.
    Learn and test predictions of both models are compared with
    ``filecmp.cmp``. Returns the error logs, the first model's
    predictions and the second run's borders file, each wrapped in
    ``local_canonical_file``.
    """
    float_model = yatest.common.test_output_path('model.bin')
    binarized_model = yatest.common.test_output_path('model_binarized.bin')
    test_err_log = yatest.common.test_output_path('test_error.tsv')
    learn_err_log = yatest.common.test_output_path('learn_error.tsv')

    borders_file = yatest.common.test_output_path('borders.tsv')
    first_run_borders = borders_file + '.out'
    learn_preds = yatest.common.test_output_path('predictions_learn.tsv')
    learn_preds_binarized = yatest.common.test_output_path('predictions_learn_binarized.tsv')
    test_preds = yatest.common.test_output_path('predictions_test.tsv')
    test_preds_binarized = yatest.common.test_output_path('predictions_test_binarized.tsv')

    learn_pool = data_file('querywise', 'train')
    column_description = data_file('querywise', 'train.cd')
    test_pool = data_file('querywise', 'test')

    float_args = {
        '--loss-function': qwise_loss,
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '100',
        '-T': '4',
        '-r': '0',
        '-m': float_model,
        '--learn-err-log': learn_err_log,
        '--test-err-log': test_err_log,
        '--use-best-model': 'false',
        '--output-borders-file': first_run_borders,
    }

    # Second run: same options, but read the first run's borders and write
    # a separate model and borders file.
    binarized_args = float_args.copy()
    binarized_args['--input-borders-file'] = first_run_borders
    binarized_args['--output-borders-file'] = borders_file
    binarized_args['-m'] = binarized_model

    fit_catboost_gpu(float_args)
    apply_catboost(float_model, learn_pool, column_description, learn_preds)
    apply_catboost(float_model, test_pool, column_description, test_preds)

    # Both error logs already exist after the first fit_catboost_gpu() call
    # and would automatically be marked as input_data for the YT operation.
    # That would fail, because input files are read-only. So the files are
    # explicitly dropped from input_data, which implicitly adds them to
    # output_data.
    rewritten_logs = {learn_err_log: None, test_err_log: None}
    fit_catboost_gpu(binarized_args, input_data=rewritten_logs)

    apply_catboost(binarized_model, learn_pool, column_description, learn_preds_binarized)
    apply_catboost(binarized_model, test_pool, column_description, test_preds_binarized)

    assert filecmp.cmp(learn_preds, learn_preds_binarized)
    assert filecmp.cmp(test_preds, test_preds_binarized)

    canonized_paths = (
        learn_err_log,
        test_err_log,
        test_preds,
        learn_preds,
        borders_file,
    )
    return [local_canonical_file(path) for path in canonized_paths]
Beispiel #13
0
def test_weighted_pool_leaf_estimation_method(boosting_type, leaf_estimation):
    """Fit on the ``adult_weight`` pool with ``--leaf-estimation-method``.

    ``leaf_estimation`` is passed as the option value. The eval file
    produced by applying the model to the test pool is returned wrapped
    in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    column_description = data_file('adult_weight', 'train.cd')
    test_pool = data_file('adult_weight', 'test_weight')
    learn_pool = data_file('adult_weight', 'train_weight')

    fit_args = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-T': '4',
        '-r': '0',
        # The option under test.
        '--leaf-estimation-method': leaf_estimation,
        '-m': model_path,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #14
0
def test_crossentropy(boosting_type):
    """Fit ``CrossEntropy`` on the ``adult_crossentropy`` pool.

    The eval file produced by applying the fitted model to the test pool
    is returned wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    column_description = data_file('adult_crossentropy', 'train.cd')
    test_pool = data_file('adult_crossentropy', 'test_proba')
    learn_pool = data_file('adult_crossentropy', 'train_proba')

    fit_args = {
        '--loss-function': 'CrossEntropy',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #15
0
def test_nan_mode(nan_mode, boosting_type):
    """Fit on the ``adult_nan`` pool with ``--nan-mode`` set to ``nan_mode``.

    The eval file produced by applying the fitted model to the test pool
    is returned wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')
    test_pool = data_file('adult_nan', 'test_small')
    column_description = data_file('adult_nan', 'train.cd')
    learn_pool = data_file('adult_nan', 'train_small')

    fit_args = {
        '--use-best-model': 'false',
        '-f': learn_pool,
        '-t': test_pool,
        '--column-description': column_description,
        '--boosting-type': boosting_type,
        '-i': '20',
        '-T': '4',
        '-r': '0',
        '-m': model_path,
        # The option under test.
        '--nan-mode': nan_mode,
    }
    fit_catboost_gpu(fit_args)

    apply_catboost(model_path, test_pool, column_description, eval_path)
    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]
Beispiel #16
0
def test_fold_len_mult():
    """Fit with ``Ordered`` boosting and ``--fold-len-multiplier`` set to 1.2.

    The model is fitted on the ``adult_not_binarized`` pool and applied
    to the test pool. The eval file is returned wrapped in
    ``local_canonical_file``.
    """
    output_model_path = yatest.common.test_output_path('model.bin')
    output_eval_path = yatest.common.test_output_path('test.eval')
    cd_file = data_file('adult_not_binarized', 'train.cd')
    test_file = data_file('adult_not_binarized', 'test_small')
    params = {
        '--use-best-model': 'false',
        '--loss-function': 'Logloss',
        '-f': data_file('adult_not_binarized', 'train_small'),
        '-t': test_file,
        '--column-description': cd_file,
        '--boosting-type': 'Ordered',
        '-i': '10',
        '-w': '0.03',
        '-T': '4',
        '-r': '0',
        # Given as a string like every other option value, so the dict holds
        # only strings however the helper turns it into command-line
        # arguments. The text is the same as the float 1.2 formats to.
        '--fold-len-multiplier': '1.2',
        '-m': output_model_path,
    }
    fit_catboost_gpu(params)
    apply_catboost(output_model_path, test_file, cd_file, output_eval_path)

    return [local_canonical_file(output_eval_path)]
Beispiel #17
0
def test_quantized_pool(loss_function, boosting_type):
    """Fit on the ``quantized://`` adult pools and apply to the TSV test pool.

    Fitting uses the ``train.qbin`` / ``test.qbin`` files with a
    ``quantized://`` prefix and no column description. The fitted model
    is then applied to ``test_small.tsv`` with ``pool.cd``, and the eval
    file is returned wrapped in ``local_canonical_file``.
    """
    model_path = yatest.common.test_output_path('model.bin')
    eval_path = yatest.common.test_output_path('test.eval')

    scheme = 'quantized://'
    learn_pool = scheme + data_file('quantized_adult', 'train.qbin')
    fit_test_pool = scheme + data_file('quantized_adult', 'test.qbin')

    # One option/value pair per line; the flat tuple order is unchanged.
    fit_args = (
        '--use-best-model', 'false',
        '--loss-function', loss_function,
        '-f', learn_pool,
        '-t', fit_test_pool,
        '--boosting-type', boosting_type,
        '-i', '10',
        '-w', '0.03',
        '-T', '4',
        '-r', '0',
        '-m', model_path,
    )
    fit_catboost_gpu(fit_args)

    column_description = data_file('quantized_adult', 'pool.cd')
    apply_test_pool = data_file('quantized_adult', 'test_small.tsv')
    apply_catboost(model_path, apply_test_pool, column_description, eval_path)

    canonical_eval = local_canonical_file(eval_path)
    return [canonical_eval]