def testSimpleOnDataFrame(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark featureNames = ["f1", "f2", "f3"] srcDataSchema = pool_test_helpers.createSchema( [ ("features", VectorUDT()), ("label", DoubleType()) ], featureNames, addFeatureNamesMetadata=True ) srcData = [ Row(Vectors.dense(0.1, 0.2, 0.11), 0.12), Row(Vectors.dense(0.97, 0.82, 0.33), 1.1), Row(Vectors.dense(0.13, 0.22, 0.23), 2.1), Row(Vectors.dense(0.14, 0.18, 0.1), 0.0), Row(Vectors.dense(0.9, 0.67, 0.17), -1.0), Row(Vectors.dense(0.66, 0.1, 0.31), 0.62) ] df = spark.createDataFrame(spark.sparkContext.parallelize(srcData), StructType(srcDataSchema)) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))) model = regressor.fit(df) predictions = model.transform(df) print ("predictions") predictions.show(truncate=False)
def testPredictionDiff(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'higgs') trainPool = catboost_spark.Pool.load( spark, dataPathWithScheme = os.path.join(dataDir, "train_small"), columnDescription = os.path.join(dataDir, "train.cd") ) dataForPredictionDiff = catboost_spark.Pool(trainPool.data.limit(2)) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))) model = regressor.fit(trainPool) featureImportances = model.getFeatureImportance( fstrType=catboost_spark.EFstrType.PredictionDiff, data=dataForPredictionDiff ) print('featureImportancesPredictionDiff=') print(featureImportances) featureImportancesPrettified = model.getFeatureImportancePrettified( fstrType=catboost_spark.EFstrType.PredictionDiff, data=dataForPredictionDiff ) print('featureImportancesPredictionDiffPrettified=') for e in featureImportancesPrettified: print('featureName={},importance={}'.format(e.featureName(), e.importance()))
def testInteraction(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'querywise') trainPool = catboost_spark.Pool.load( spark, dataPathWithScheme = os.path.join(dataDir, "train"), columnDescription = os.path.join(dataDir, "train.cd") ) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setLossFunction("QueryRMSE") .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))) model = regressor.fit(trainPool) featureImportancesInteraction = model.getFeatureImportanceInteraction() for e in featureImportancesInteraction: print( 'firstFeatureIdx={},secondFeatureIdx={},score={}'.format( e.firstFeatureIdx(), e.secondFeatureIdx(), e.score() ) )
def testWithEvalSet(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark featureNames = ["f1", "f2", "f3"] srcSchemaData = [ ("features", VectorUDT()), ("label", StringType()), ("groupId", LongType()), ("subgroupId", IntegerType()), ("weight", FloatType()) ] srcTrainData = [ Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0), Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12), Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18), Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0), Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0), Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45) ] srcTestData = [ Row(Vectors.dense(0.0, 0.33, 1.1), "0.22", 0x4AAFFF456765757, 0xD34BFBD, 0.1), Row(Vectors.dense(0.02, 0.0, 0.38), "0.11", 0x686726738873ABC, 0x23D794E, 1.0), Row(Vectors.dense(0.86, 0.54, 0.9), "0.48", 0x7652786FF37ABBE, 0x19CE5B0, 0.17) ] trainPool = pool_test_helpers.createRawPool( test_helpers.getCurrentMethodName, pool_test_helpers.createSchema( srcSchemaData, featureNames, addFeatureNamesMetadata=True ), srcTrainData, {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"} ) testPool = pool_test_helpers.createRawPool( test_helpers.getCurrentMethodName, pool_test_helpers.createSchema( srcSchemaData, featureNames, addFeatureNamesMetadata=True ), srcTestData, {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"} ) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())) ) model = regressor.fit(trainPool, [testPool]) predictions = model.transform(testPool.data) print ("predictions") predictions.show(truncate=False)
def testLossFunctionChange(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'querywise') trainPool = catboost_spark.Pool.load( spark, dataPathWithScheme = os.path.join(dataDir, "train"), columnDescription = os.path.join(dataDir, "train.cd") ) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setLossFunction("QueryRMSE") .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))) model = regressor.fit(trainPool) calcTypes = [ catboost_spark.ECalcTypeShapValues.Regular, catboost_spark.ECalcTypeShapValues.Approximate, catboost_spark.ECalcTypeShapValues.Exact ] for calcType in calcTypes: featureImportancesPredictionValuesChange = model.getFeatureImportance( fstrType=catboost_spark.EFstrType.LossFunctionChange, data=trainPool, calcType=calcType ) print('calcType=' + str(calcType) + ',featureImportancesLossFunctionChange=') print(featureImportancesPredictionValuesChange) featureImportancesDefault = model.getFeatureImportance(data=trainPool, calcType=calcType) print('calcType=' + str(calcType) + ',featureImportancesDefault=') print(featureImportancesDefault) featureImportancesPredictionValuesChangePrettified = model.getFeatureImportancePrettified( fstrType=catboost_spark.EFstrType.LossFunctionChange, data=trainPool, calcType=calcType ) print('calcType=' + str(calcType) + ',featureImportancesLossFunctionChangePrettified=') for e in featureImportancesPredictionValuesChangePrettified: print ('featureName={},importance={}'.format(e.featureName(), e.importance())) featureImportancesDefaultPrettified = model.getFeatureImportancePrettified( data=trainPool, calcType=calcType ) print('calcType=' + str(calcType) + ',featureImportancesDefaultPrettified=') for e in featureImportancesDefaultPrettified: print ('featureName={},importance={}'.format(e.featureName(), e.importance()))
def testWithPairs(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark featureNames = ["f1", "f2", "f3"] srcSchemaData = [ ("features", VectorUDT()), ("label", StringType()), ("groupId", LongType()), ("sampleId", LongType()), ("weight", FloatType()) ] srcData = [ Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0), Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12), Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18), Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0), Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0), Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45) ] srcPairsData = [ Row(0xB337C6FEFE2E2F7, 0xD34BFBD, 0x19CE5B0), Row(0xD9DBDD3199D6518, 0x19CE5B0, 0x62772D1), Row(0xD9DBDD3199D6518, 0x62772D1, 0x1FA606F) ] pool = pool_test_helpers.createRawPool( test_helpers.getCurrentMethodName, pool_test_helpers.createSchema( srcSchemaData, featureNames, addFeatureNamesMetadata=True ), srcData, {"groupId": "groupId", "sampleId": "sampleId", "weight": "weight"}, srcPairsData ) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())) .setLossFunction("PairLogit") .setHasTime(True) ) model = regressor.fit(pool) predictions = model.transform(pool.data) print ("predictions") predictions.show(truncate=False)
def testModelSerializationInPipeline(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark srcData = [ Row(0.12, "query0", 0.1, "Male", 0.2, "Germany", 0.11), Row(0.22, "query0", 0.97, "Female", 0.82, "Russia", 0.33), Row(0.34, "query1", 0.13, "Male", 0.22, "USA", 0.23), Row(0.42, "Query 2", 0.14, "Male", 0.18, "Finland", 0.1), Row(0.01, "Query 2", 0.9, "Female", 0.67, "USA", 0.17), Row(0.0, "Query 2", 0.66, "Female", 0.1, "UK", 0.31) ] srcDataSchema = [ StructField("Label", DoubleType()), StructField("GroupId", StringType()), StructField("float0", DoubleType()), StructField("Gender1", StringType()), StructField("float2", DoubleType()), StructField("Country3", StringType()), StructField("float4", DoubleType()) ] df = spark.createDataFrame(spark.sparkContext.parallelize(srcData), StructType(srcDataSchema)) indexers = [ StringIndexer(inputCol=catFeature, outputCol=catFeature + "Index") for catFeature in ["Gender1", "Country3"] ] assembler = VectorAssembler( inputCols=["float0", "Gender1Index", "float2", "Country3Index", "float4"], outputCol="features" ) classifier = catboost_spark.CatBoostRegressor(labelCol="Label", iterations=20) pipeline = Pipeline(stages=indexers + [assembler, classifier]) pipelineModel = pipeline.fit(df) serializationDir = tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()) modelPath = os.path.join(serializationDir, "serialized_pipeline_model") pipelineModel.write().overwrite().save(modelPath) loadedPipelineModel = PipelineModel.load(modelPath) print ("predictions") pipelineModel.transform(df).show(truncate=False) print ("predictionsLoaded") loadedPipelineModel.transform(df).show(truncate=False) shutil.rmtree(serializationDir)
def testParams(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark featureNames = ["f1", "f2", "f3"] srcSchemaData = [ ("features", VectorUDT()), ("label", StringType()), ("groupId", LongType()), ("subgroupId", IntegerType()), ("weight", FloatType()) ] srcData = [ Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0), Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12), Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18), Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0), Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0), Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45) ] pool = pool_test_helpers.createRawPool( test_helpers.getCurrentMethodName, pool_test_helpers.createSchema( srcSchemaData, featureNames, addFeatureNamesMetadata=True ), srcData, {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"} ) firstFeatureUsePenaltiesMap = collections.OrderedDict([("f1", 0.0), ("f2", 1.1), ("f3", 2.0)]) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())) .setLeafEstimationIterations(10) .setFirstFeatureUsePenaltiesMap(firstFeatureUsePenaltiesMap) .setFeatureWeightsList([1.0, 2.0, 3.0]) ) model = regressor.fit(pool) predictions = model.transform(pool.data) print ("predictions") predictions.show(truncate=False)
def testShapInteractionValuesForRegression(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'higgs') trainPool = catboost_spark.Pool.load( spark, dataPathWithScheme = os.path.join(dataDir, "train_small"), columnDescription = os.path.join(dataDir, "train.cd") ) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setLossFunction("RMSE") .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))) model = regressor.fit(trainPool) shapInteractionValuesTestCase('Regression', model, trainPool)
def testModelSerialization(): spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName()) import catboost_spark featureNames = ["f1", "f2", "f3"] srcSchemaData = [ ("features", VectorUDT()), ("label", StringType()), ("groupId", LongType()), ("subgroupId", IntegerType()), ("weight", FloatType()) ] srcData = [ Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0), Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12), Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18), Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0), Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0), Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45) ] pool = pool_test_helpers.createRawPool( test_helpers.getCurrentMethodName, pool_test_helpers.createSchema( srcSchemaData, featureNames, addFeatureNamesMetadata=True ), srcData, {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"} ) regressor = (catboost_spark.CatBoostRegressor() .setIterations(20) .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))) model = regressor.fit(pool) predictions = model.transform(pool.data) print ("predictions") predictions.show(truncate=False) modelsDir = tempfile.mkdtemp(prefix="catboost_models_") nativeCatBoostModelPath = os.path.join(modelsDir, "regressor_model.cbm") model.saveNativeModel(nativeCatBoostModelPath) loadedCatBoostModel = catboost_spark.CatBoostRegressionModel.loadNativeModel(nativeCatBoostModelPath) predictionsLoadedCatBoost = loadedCatBoostModel.transform(pool.data) print ("predictionsLoadedCatBoost") predictionsLoadedCatBoost.show(truncate=False) nativeJsonModelPath = os.path.join(modelsDir, "regressor_model.json") model.saveNativeModel(nativeJsonModelPath, catboost_spark.EModelType.Json) nativeOnnxModelPath = os.path.join(modelsDir, "regressor_model.onnx") model.saveNativeModel( nativeOnnxModelPath, catboost_spark.EModelType.Onnx, { "onnx_domain": "ai.catboost", "onnx_model_version": 1, "onnx_doc_string": "test model for regression", "onnx_graph_name": "CatBoostModel_for_regression" } ) loadedOnnxModel = catboost_spark.CatBoostRegressionModel.loadNativeModel(nativeOnnxModelPath, catboost_spark.EModelType.Onnx) predictionsLoadedOnnx = loadedOnnxModel.transform(pool.data) print ("predictionsLoadedOnnx") predictionsLoadedOnnx.show(truncate=False) sparkModelPath = os.path.join(modelsDir, "regressor_model") model.write().overwrite().save(sparkModelPath) loadedModel = catboost_spark.CatBoostRegressionModel.load(sparkModelPath) predictionsLoaded = loadedModel.transform(pool.data) print ("predictionsLoaded") predictionsLoaded.show(truncate=False) shutil.rmtree(modelsDir)