def simple_on_dataframe():
    """Write reference predictions for a basic RMSE regression model.

    Six four-column rows are dumped to a temporary TSV file; the column
    description marks column 3 as the target.  A model is trained through
    ``utils.run_dist_train`` (``model_class=cb.CatBoostRegressor``) and its
    predictions on the training pool are saved to
    ``regression_simple_on_dataframe.json`` inside ``OUTPUT_DIR``.
    The temporary files are removed when the function exits.
    """
    # mkstemp returns an already-open descriptor; close it immediately so it
    # is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.1, 0.2, 0.11, 0.12),
                (0.97, 0.82, 0.33, 1.1),
                (0.13, 0.22, 0.23, 2.1),
                (0.14, 0.18, 0.1, 0.0),
                (0.9, 0.67, 0.17, -1.0),
                (0.66, 0.1, 0.31, 0.62),
            ],
            learn_set_path,
        )
        with open(cd_path, 'w') as cd:
            cd.write('3\tTarget')

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--learn-set', learn_set_path,
                '--cd', cd_path
            ],
            model_class=cb.CatBoostRegressor,
        )
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {'prediction': model.predict(train_pool).tolist()}

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR, 'regression_simple_on_dataframe.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
def simple1():
    """Write reference predictions for RMSE regression with group columns.

    Six seven-column rows are dumped to a temporary TSV file.  The column
    description marks column 3 as Target, 4 as GroupId, 5 as SubgroupId and
    6 as Weight.  A model is trained through ``utils.run_dist_train``
    (``model_class=cb.CatBoostRegressor``) and its predictions on the
    training pool are saved to ``regression_simple1.json`` inside
    ``OUTPUT_DIR``.  The temporary files are removed when the function exits.
    """
    # mkstemp returns an already-open descriptor; close it immediately so it
    # is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.13, 0.22, 0.23, "0.34", "query1", "Site9", 1.0),
                (0.1, 0.2, 0.11, "0.12", "query0", "site1", 0.12),
                (0.97, 0.82, 0.33, "0.22", "query0", "site22", 0.18),
                (0.9, 0.67, 0.17, "0.01", "Query 2", "site22", 1.0),
                (0.66, 0.1, 0.31, "0.0", "Query 2", "Site45", 2.0),
                (0.14, 0.18, 0.1, "0.42", "Query 2", "site12", 0.45),
            ],
            learn_set_path,
        )
        with open(cd_path, 'w') as cd:
            cd.write(
                "3\tTarget\n"
                + "4\tGroupId\n"
                + "5\tSubgroupId\n"
                + "6\tWeight\n"
            )

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--learn-set', learn_set_path,
                '--cd', cd_path
            ],
            model_class=cb.CatBoostRegressor,
        )
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {'prediction': model.predict(train_pool).tolist()}

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR, 'regression_simple1.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
def simple_binary_classification():
    """Write reference predictions for a Logloss binary classifier.

    Six eight-column rows are dumped to a temporary TSV file.  The column
    description marks column 3 as Target, 4 as GroupId, 5 as GroupWeight,
    6 as SubgroupId and 7 as Weight.  A model is trained through
    ``utils.run_dist_train`` (``model_class=cb.CatBoostClassifier``) and
    three kinds of output on the training pool are saved to
    ``simple_binary_classification.json`` inside ``OUTPUT_DIR``:

    * ``raw_prediction`` - two columns ``(-raw / 2, raw / 2)`` built from the
      ``RawFormulaVal`` prediction,
    * ``probability`` - result of ``predict_proba``,
    * ``prediction`` - result of ``predict``.

    The temporary files are removed when the function exits.
    """
    # mkstemp returns an already-open descriptor; close it immediately so it
    # is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.1, 0.2, 0.11, "0", "query0", 1.0, "site1", 0.12),
                (0.97, 0.82, 0.33, "0", "query0", 1.0, "site22", 0.18),
                (0.13, 0.22, 0.23, "1", "query1", 0.0, "Site9", 1.0),
                (0.14, 0.18, 0.1, "1", "Query 2", 0.5, "site12", 0.45),
                (0.9, 0.67, 0.17, "0", "Query 2", 0.5, "site22", 1.0),
                (0.66, 0.1, 0.31, "1", "Query 2", 0.5, "Site45", 2.0)
            ],
            learn_set_path
        )
        with open(cd_path, 'w') as cd:
            cd.write(
                "3\tTarget\n"
                + "4\tGroupId\n"
                + "5\tGroupWeight\n"
                + "6\tSubgroupId\n"
                + "7\tWeight\n"
            )

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'Logloss',
                '--learn-set', learn_set_path,
                '--cd', cd_path
            ],
            model_class=cb.CatBoostClassifier
        )
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {}

        # Turn the 1-D raw formula values into a column vector, then lay out
        # (-raw / 2, raw / 2) side by side as a two-column matrix.
        raw_predictions = np.array(
            model.predict(train_pool, prediction_type='RawFormulaVal'),
            ndmin=2
        ).transpose()
        result['raw_prediction'] = np.hstack(
            (np.negative(raw_predictions / 2), raw_predictions / 2)
        ).tolist()
        result['probability'] = model.predict_proba(train_pool).tolist()
        result['prediction'] = model.predict(train_pool).tolist()

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR, 'simple_binary_classification.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
def binary_classification_with_class_weights_map():
    """Write reference predictions for a class-weighted Logloss classifier.

    Six four-column rows are dumped to a temporary TSV file; the column
    description marks column 3 as the target.  A model is trained through
    ``utils.run_dist_train`` (``model_class=cb.CatBoostClassifier``) with
    ``--class-weights 1,2`` and three kinds of output on the training pool
    are saved to ``binary_classification_with_class_weights_map.json`` inside
    ``OUTPUT_DIR``:

    * ``raw_prediction`` - two columns ``(-raw / 2, raw / 2)`` built from the
      ``RawFormulaVal`` prediction,
    * ``probability`` - result of ``predict_proba``,
    * ``prediction`` - result of ``predict``.

    The temporary files are removed when the function exits.
    """
    # mkstemp returns an already-open descriptor; close it immediately so it
    # is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.1, 0.2, 0.11, 0),
                (0.97, 0.82, 0.33, 1),
                (0.13, 0.22, 0.23, 1),
                (0.14, 0.18, 0.1, 0),
                (0.9, 0.67, 0.17, 0),
                (0.66, 0.1, 0.31, 0),
            ],
            learn_set_path,
        )
        with open(cd_path, 'w') as cd:
            cd.write('3\tTarget')

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--class-weights', '1,2',
                '--loss-function', 'Logloss',
                '--learn-set', learn_set_path,
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostClassifier,
        )
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {}

        # Turn the 1-D raw formula values into a column vector, then lay out
        # (-raw / 2, raw / 2) side by side as a two-column matrix.
        raw_predictions = np.array(
            model.predict(train_pool, prediction_type='RawFormulaVal'),
            ndmin=2,
        ).transpose()
        result['raw_prediction'] = np.hstack(
            (np.negative(raw_predictions / 2), raw_predictions / 2)
        ).tolist()
        result['probability'] = model.predict_proba(train_pool).tolist()
        result['prediction'] = model.predict(train_pool).tolist()

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(
            OUTPUT_DIR,
            'binary_classification_with_class_weights_map.json',
        )
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
def num_and_one_hot_cat_features():
    """Write reference predictions for RMSE regression with Num/Categ columns.

    Seven eleven-column rows are dumped to a temporary TSV file.  The column
    description declares columns 0-3 as ``Num`` (f1..f4), 4-6 as ``Categ``
    (c1..c3), 7 as Target, 8 as GroupId, 9 as SubgroupId and 10 as Weight.
    A model is trained through ``utils.run_dist_train``
    (``model_class=cb.CatBoostRegressor``) with ``--one-hot-max-size 6``,
    ``--dev-efb-max-buckets 0`` and ``--has-time``, and its predictions on
    the training pool are saved to
    ``regression_num_and_one_hot_cat_features.json`` inside ``OUTPUT_DIR``.
    The temporary files are removed when the function exits.
    """
    # mkstemp returns an already-open descriptor; close it immediately so it
    # is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.13, 0.22, 0.23, 0.72, 0, 0, 0, "0.34", "query1", "Site9", 1.0),
                (0.1, 0.2, 0.11, -0.7, 1, 1, 0, "0.12", "query0", "site1", 0.12),
                (0.97, 0.82, 0.33, 0.18, 0, 2, 1, "0.22", "query0", "site22", 0.18),
                (0.9, 0.67, 0.17, 0.0, 1, 2, 2, "0.01", "Query 2", "site22", 1.0),
                (0.66, 0.1, 0.31, -0.12, 0, 0, 3, "0.0", "Query 2", "Site45", 2.0),
                (0.14, 0.18, 0.1, 0.0, 0, 0, 4, "0.42", "Query 2", "site12", 0.45),
                (1.0, 0.88, 0.21, 0.0, 1, 3, 5, "0.1", "Query 3", "site1", 1.0)
            ],
            learn_set_path,
        )
        with open(cd_path, 'w') as cd:
            cd.write(
                "0\tNum\tf1\n"
                + "1\tNum\tf2\n"
                + "2\tNum\tf3\n"
                + "3\tNum\tf4\n"
                + "4\tCateg\tc1\n"
                + "5\tCateg\tc2\n"
                + "6\tCateg\tc3\n"
                + "7\tTarget\n"
                + "8\tGroupId\n"
                + "9\tSubgroupId\n"
                + "10\tWeight\n"
            )

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--one-hot-max-size', '6',
                '--dev-efb-max-buckets', '0',
                '--has-time',
                '--learn-set', learn_set_path,
                '--cd', cd_path
            ],
            model_class=cb.CatBoostRegressor,
        )
        train_pool = cb.Pool(learn_set_path, column_description=cd_path)

        result = {'prediction': model.predict(train_pool).tolist()}

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(
            OUTPUT_DIR,
            'regression_num_and_one_hot_cat_features.json',
        )
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        os.remove(cd_path)
def with_eval_sets():
    """Write reference predictions for RMSE regression on two eval sets.

    One learn set (six rows) and two eval sets (three and four rows) are
    dumped to temporary TSV files that share a column description: column 3
    is Target, 4 is GroupId, 5 is SubgroupId and 6 is Weight.  A model is
    trained through ``utils.run_dist_train``
    (``model_class=cb.CatBoostRegressor``) with both eval sets passed as
    ``--test-set``.  Predictions for each eval pool are saved under the keys
    ``prediction0`` and ``prediction1`` in ``regression_with_eval_sets.json``
    inside ``OUTPUT_DIR``.  The temporary files are removed on exit.
    """
    # mkstemp returns an already-open descriptor; close each one immediately
    # so it is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    eval_sets_paths = []
    for _ in range(2):
        eval_set_fd, eval_set_path = tempfile.mkstemp(prefix='catboost_eval_set_')
        os.close(eval_set_fd)
        eval_sets_paths.append(eval_set_path)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.13, 0.22, 0.23, "0.34", "query1", "Site9", 1.0),
                (0.1, 0.2, 0.11, "0.12", "query0", "site1", 0.12),
                (0.97, 0.82, 0.33, "0.22", "query0", "site22", 0.18),
                (0.9, 0.67, 0.17, "0.01", "Query 2", "site22", 1.0),
                (0.66, 0.1, 0.31, "0.0", "Query 2", "Site45", 2.0),
                (0.14, 0.18, 0.1, "0.42", "Query 2", "site12", 0.45),
            ],
            learn_set_path,
        )
        utils.object_list_to_tsv(
            [
                (0.0, 0.33, 1.1, "0.22", "query3", "site1", 0.1),
                (0.02, 0.0, 0.38, "0.11", "query5", "Site9", 1.0),
                (0.86, 0.54, 0.9, "0.48", "query4", "site22", 0.17),
            ],
            eval_sets_paths[0],
        )
        utils.object_list_to_tsv(
            [
                (0.12, 0.28, 2.2, "0.1", "query3", "site1", 0.11),
                (0.0, 0.0, 0.92, "0.9", "query5", "Site9", 1.1),
                (0.13, 2.1, 0.45, "0.88", "query5", "Site33", 1.2),
                (0.17, 0.11, 0.0, "0.0", "Query12", "site22", 1.0),
            ],
            eval_sets_paths[1],
        )
        with open(cd_path, 'w') as cd:
            cd.write(
                "3\tTarget\n"
                + "4\tGroupId\n"
                + "5\tSubgroupId\n"
                + "6\tWeight\n"
            )

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--learn-set', learn_set_path,
                '--test-set', eval_sets_paths[0],
                '--test-set', eval_sets_paths[1],
                '--cd', cd_path
            ],
            model_class=cb.CatBoostRegressor,
        )
        eval_pools = [
            cb.Pool(eval_set_path, column_description=cd_path)
            for eval_set_path in eval_sets_paths
        ]

        result = {
            f'prediction{i}': model.predict(eval_pool).tolist()
            for i, eval_pool in enumerate(eval_pools)
        }

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(OUTPUT_DIR, 'regression_with_eval_sets.json')
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        for eval_set_path in eval_sets_paths:
            os.remove(eval_set_path)
        os.remove(cd_path)
def multi_classification_with_num_and_one_hot_and_ctr_cat_features_with_eval_sets():
    """Write reference predictions for a MultiClass model on two eval sets.

    One learn set (twelve rows) and two eval sets (three and four rows) are
    dumped to temporary TSV files that share a column description: columns
    0-3 are ``Num`` (f1..f4), 4-6 are ``Categ`` (c1..c3), 7 is Target, 8 is
    GroupId, 9 is SubgroupId and 10 is Weight.  Unlike its siblings that go
    through ``utils.run_dist_train``, this scenario trains through
    ``utils.run_local_train`` (``model_class=cb.CatBoostClassifier``).

    For each eval set ``i`` the output JSON holds, under
    ``prediction_for_eval_set_{i}``, the ``raw_prediction``
    (``RawFormulaVal``), ``probability`` (``predict_proba``) and
    ``prediction`` (``predict``) values.  The result is saved to
    ``multi_classification_with_num_and_one_hot_and_ctr_cat_features_with_eval_sets.json``
    inside ``OUTPUT_DIR``.  The temporary files are removed on exit.
    """
    # mkstemp returns an already-open descriptor; close each one immediately
    # so it is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    eval_sets_paths = []
    for _ in range(2):
        eval_set_fd, eval_set_path = tempfile.mkstemp(prefix='catboost_eval_set_')
        os.close(eval_set_fd)
        eval_sets_paths.append(eval_set_path)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.13, 0.22, 0.23, 0.72, 0, 0, 0, "0", "query1", "Site9", 1.0),
                (0.1, 0.2, 0.11, -0.7, 1, 1, 0, "1", "query0", "site1", 0.12),
                (0.97, 0.82, 0.33, 0.18, 0, 2, 1, "2", "query0", "site22", 0.18),
                (0.9, 0.67, 0.17, 0.0, 1, 2, 2, "2", "Query 2", "site22", 1.0),
                (0.66, 0.1, 0.31, -0.12, 0, 0, 3, "1", "Query 2", "Site45", 2.0),
                (0.14, 0.18, 0.1, 0.0, 0, 0, 4, "0", "Query 2", "site12", 0.45),
                (1.0, 0.88, 0.21, 0.0, 1, 3, 5, "2", "Query 3", "site1", 1.0),
                (1.0, 0.88, 0.21, 0.0, 1, 4, 5, "2", "Query 3", "site0", 1.1),
                (1.0, 0.88, 0.21, 0.0, 1, 1, 5, "0", "Query 4", "site11", 3.0),
                (1.0, 0.88, 0.21, 0.0, 1, 2, 5, "3", "Query 4", "Site5", 1.2),
                (1.0, 0.88, 0.21, 0.0, 1, 0, 5, "3", "Query 4", "Site5", 1.2),
                (1.0, 0.88, 0.21, 0.0, 1, 3, 5, "0", "Query 4", "Site7", 1.8)
            ],
            learn_set_path
        )
        utils.object_list_to_tsv(
            [
                (0.0, 0.33, 1.1, 0.01, 0, 1, 2, "0", "query3", "site1", 0.1),
                (0.02, 0.0, 0.38, -0.3, 1, 2, 3, "1", "query5", "Site9", 1.0),
                (0.86, 0.54, 0.9, 0.0, 0, 2, 5, "3", "query4", "site22", 0.17)
            ],
            eval_sets_paths[0]
        )
        utils.object_list_to_tsv(
            [
                (0.12, 0.28, 2.2, -0.12, 1, 3, 3, "2", "query3", "site1", 0.11),
                (0.0, 0.0, 0.92, 0.0, 0, 3, 4, "1", "query5", "Site9", 1.1),
                (0.13, 2.1, 0.45, 1.0, 1, 2, 5, "3", "query5", "Site33", 1.2),
                (0.17, 0.11, 0.0, 2.11, 1, 0, 2, "1", "Query12", "site22", 1.0)
            ],
            eval_sets_paths[1]
        )
        with open(cd_path, 'w') as cd:
            cd.write(
                "0\tNum\tf1\n"
                + "1\tNum\tf2\n"
                + "2\tNum\tf3\n"
                + "3\tNum\tf4\n"
                + "4\tCateg\tc1\n"
                + "5\tCateg\tc2\n"
                + "6\tCateg\tc3\n"
                + "7\tTarget\n"
                + "8\tGroupId\n"
                + "9\tSubgroupId\n"
                + "10\tWeight\n"
            )

        model = utils.run_local_train(
            [
                '--iterations', '20',
                '--loss-function', 'MultiClass',
                '--dev-efb-max-buckets', '0',
                '--max-ctr-complexity', '1',
                '--has-time',
                '--random-strength', '0',
                '--bootstrap-type', 'No',
                '--boosting-type', 'Plain',
                '--learning-rate', '0.3',
                '--boost-from-average', '0',
                '--learn-set', learn_set_path,
                '--test-set', eval_sets_paths[0],
                '--test-set', eval_sets_paths[1],
                '--cd', cd_path,
            ],
            model_class=cb.CatBoostClassifier
        )
        eval_pools = [
            cb.Pool(eval_set_path, column_description=cd_path)
            for eval_set_path in eval_sets_paths
        ]

        result = {}
        for i, eval_pool in enumerate(eval_pools):
            result[f'prediction_for_eval_set_{i}'] = {
                'raw_prediction': model.predict(eval_pool, prediction_type='RawFormulaVal').tolist(),
                'probability': model.predict_proba(eval_pool).tolist(),
                'prediction': model.predict(eval_pool).tolist()
            }

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(
            OUTPUT_DIR,
            'multi_classification_with_num_and_one_hot_and_ctr_cat_features_with_eval_sets.json'
        )
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        for eval_set_path in eval_sets_paths:
            os.remove(eval_set_path)
        os.remove(cd_path)
def num_and_one_hot_cat_features_with_eval_sets():
    """Write reference predictions for RMSE regression with Categ columns.

    One learn set (seven rows) and two eval sets (three and four rows) are
    dumped to temporary TSV files that share a column description: columns
    0-3 are ``Num`` (f1..f4), 4-6 are ``Categ`` (c1..c3), 7 is Target, 8 is
    GroupId, 9 is SubgroupId and 10 is Weight.  A model is trained through
    ``utils.run_dist_train`` (``model_class=cb.CatBoostRegressor``) with
    ``--one-hot-max-size 6``, ``--dev-efb-max-buckets 0``, ``--has-time`` and
    both eval sets passed as ``--test-set``.  Predictions for each eval pool
    are saved under the keys ``prediction0`` and ``prediction1`` in
    ``regression_num_and_one_hot_cat_features_with_eval_sets.json`` inside
    ``OUTPUT_DIR``.  The temporary files are removed on exit.
    """
    # mkstemp returns an already-open descriptor; close each one immediately
    # so it is not leaked and the file can be removed on every platform.
    learn_set_fd, learn_set_path = tempfile.mkstemp(prefix='catboost_learn_set_')
    os.close(learn_set_fd)
    eval_sets_paths = []
    for _ in range(2):
        eval_set_fd, eval_set_path = tempfile.mkstemp(prefix='catboost_eval_set_')
        os.close(eval_set_fd)
        eval_sets_paths.append(eval_set_path)
    cd_fd, cd_path = tempfile.mkstemp(prefix='catboost_cd_')
    os.close(cd_fd)

    try:
        utils.object_list_to_tsv(
            [
                (0.13, 0.22, 0.23, 0.72, 0, 0, 0, "0.34", "query1", "Site9", 1.0),
                (0.1, 0.2, 0.11, -0.7, 1, 1, 0, "0.12", "query0", "site1", 0.12),
                (0.97, 0.82, 0.33, 0.18, 0, 2, 1, "0.22", "query0", "site22", 0.18),
                (0.9, 0.67, 0.17, 0.0, 1, 2, 2, "0.01", "Query 2", "site22", 1.0),
                (0.66, 0.1, 0.31, -0.12, 0, 0, 3, "0.0", "Query 2", "Site45", 2.0),
                (0.14, 0.18, 0.1, 0.0, 0, 0, 4, "0.42", "Query 2", "site12", 0.45),
                (1.0, 0.88, 0.21, 0.0, 1, 3, 5, "0.1", "Query 3", "site1", 1.0)
            ],
            learn_set_path,
        )
        utils.object_list_to_tsv(
            [
                (0.0, 0.33, 1.1, 0.01, 0, 1, 2, "0.22", "query4", "site1", 0.1),
                (0.02, 0.0, 0.38, -0.3, 1, 2, 3, "0.11", "query5", "Site9", 1.0),
                (0.86, 0.54, 0.9, 0.0, 0, 2, 5, "0.48", "query5", "site22", 0.17)
            ],
            eval_sets_paths[0],
        )
        utils.object_list_to_tsv(
            [
                (0.12, 0.28, 2.2, -0.12, 1, 3, 3, "0.1", "query6", "site1", 0.11),
                (0.0, 0.0, 0.92, 0.0, 0, 3, 4, "0.9", "query6", "Site9", 1.1),
                (0.13, 2.1, 0.45, 1.0, 1, 2, 5, "0.88", "query6", "Site33", 1.2),
                (0.17, 0.11, 0.0, 2.11, 1, 0, 2, "0.0", "Query12", "site22", 1.0)
            ],
            eval_sets_paths[1],
        )
        with open(cd_path, 'w') as cd:
            cd.write(
                "0\tNum\tf1\n"
                + "1\tNum\tf2\n"
                + "2\tNum\tf3\n"
                + "3\tNum\tf4\n"
                + "4\tCateg\tc1\n"
                + "5\tCateg\tc2\n"
                + "6\tCateg\tc3\n"
                + "7\tTarget\n"
                + "8\tGroupId\n"
                + "9\tSubgroupId\n"
                + "10\tWeight\n"
            )

        model = utils.run_dist_train(
            [
                '--iterations', '20',
                '--loss-function', 'RMSE',
                '--one-hot-max-size', '6',
                '--dev-efb-max-buckets', '0',
                '--has-time',
                '--learn-set', learn_set_path,
                '--test-set', eval_sets_paths[0],
                '--test-set', eval_sets_paths[1],
                '--cd', cd_path
            ],
            model_class=cb.CatBoostRegressor,
        )
        eval_pools = [
            cb.Pool(eval_set_path, column_description=cd_path)
            for eval_set_path in eval_sets_paths
        ]

        result = {
            f'prediction{i}': model.predict(eval_pool).tolist()
            for i, eval_pool in enumerate(eval_pools)
        }

        # Use a context manager so the output file is flushed and closed.
        output_path = os.path.join(
            OUTPUT_DIR,
            'regression_num_and_one_hot_cat_features_with_eval_sets.json'
        )
        with open(output_path, 'w') as output_file:
            json.dump(result, fp=output_file, allow_nan=True, indent=2)
    finally:
        os.remove(learn_set_path)
        for eval_set_path in eval_sets_paths:
            os.remove(eval_set_path)
        os.remove(cd_path)