def decision_tree_classifier():
    """Train, inspect, and persist a tiny decision-tree classifier on Spark.

    Builds (or reuses) a SparkSession, fits a ``DecisionTreeClassifier`` on a
    two-row DataFrame whose labels have been string-indexed, prints the fitted
    tree's debug string, scores one single-row test frame, and round-trips
    both the estimator and the fitted model through ``./dtc`` and
    ``./dtc_model``.

    Both saves go through ``write().overwrite()`` so the function can be run
    repeatedly; a plain ``save()`` is rejected by Spark when the target path
    already exists.
    """
    spark = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )

    df = spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"],
    )

    # Index the raw label column so the classifier trains on "indexed".
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)

    dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")
    model = dt.fit(td)
    # Values recorded by the original author for this training data:
    #   model.numNodes == 3, model.depth == 1,
    #   model.featureImportances == SparseVector(1, {0: 1.0}),
    #   model.numFeatures == 1, model.numClasses == 2
    print(model.toDebugString)

    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    # Recorded expectation: prediction 0.0, with probability and
    # rawPrediction both DenseVector([1.0, 0.0]).
    result = model.transform(test0).head()

    # Built but not scored here; recorded expectation is prediction 1.0.
    test1 = spark.createDataFrame(
        [(Vectors.sparse(1, [0], [1.0]),)], ["features"]
    )

    temp_path = "."

    # Persist and reload the (unfitted) estimator.
    dtc_path = temp_path + "/dtc"
    dt.write().overwrite().save(dtc_path)
    dt2 = DecisionTreeClassifier.load(dtc_path)
    # Recorded expectation: dt2.getMaxDepth() == 2

    # Persist and reload the fitted model.
    model_path = temp_path + "/dtc_model"
    model.write().overwrite().save(model_path)
    model2 = DecisionTreeClassificationModel.load(model_path)
def test_decisiontree_regressor(self):
    """Check that a saved DecisionTreeRegressor reloads with consistent params.

    Saves a ``DecisionTreeRegressor(maxDepth=1)`` into a fresh temporary
    directory, loads it back, and asserts that:

    * the loaded instance's ``uid`` equals the ``parent`` of its ``maxDepth``
      param, and
    * the default value of ``maxDepth`` is the same on the original and the
      loaded instance.

    The temporary directory is removed on a best-effort basis even when an
    assertion fails.
    """
    # NOTE(review): an identically named test appears right next to this one
    # in the file; if both live in the same class the later definition
    # shadows the earlier one -- confirm and drop the duplicate.
    dt = DecisionTreeRegressor(maxDepth=1)
    path = tempfile.mkdtemp()
    try:
        dtr_path = path + "/dtr"
        dt.save(dtr_path)
        # Load through the regressor class: going through
        # DecisionTreeClassifier.load would not exercise the regressor's
        # own load path, which is what this test is named for.
        dt2 = DecisionTreeRegressor.load(dtr_path)
        self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                         "Loaded DecisionTreeRegressor instance uid (%s) "
                         "did not match Param's uid (%s)"
                         % (dt2.uid, dt2.maxDepth.parent))
        self.assertEqual(dt._defaultParamMap[dt.maxDepth],
                         dt2._defaultParamMap[dt2.maxDepth],
                         "Loaded DecisionTreeRegressor instance default params did not match " +
                         "original defaults")
    finally:
        # Best-effort cleanup; a failure to delete must not mask the result.
        rmtree(path, ignore_errors=True)
def test_decisiontree_regressor(self):
    """Verify a DecisionTreeRegressor survives a save/load round trip.

    A regressor built with ``maxDepth=1`` is written to a new temporary
    directory and read back. The test then asserts that the reloaded
    instance's ``uid`` matches the ``parent`` of its ``maxDepth`` param, and
    that ``maxDepth`` has the same default on both instances.

    Cleanup of the temporary directory runs in ``finally`` and ignores
    deletion errors, so it happens whether or not the assertions pass.
    """
    # NOTE(review): this method has an identical twin immediately before it
    # in the file; if they share a class, only the later definition is
    # actually collected -- confirm and remove one copy.
    original = DecisionTreeRegressor(maxDepth=1)
    path = tempfile.mkdtemp()
    try:
        dtr_path = path + "/dtr"
        original.save(dtr_path)
        # Reload with the regressor's own loader rather than
        # DecisionTreeClassifier.load, so the class under test is the one
        # actually exercised.
        reloaded = DecisionTreeRegressor.load(dtr_path)
        self.assertEqual(
            reloaded.uid,
            reloaded.maxDepth.parent,
            "Loaded DecisionTreeRegressor instance uid (%s) "
            "did not match Param's uid (%s)"
            % (reloaded.uid, reloaded.maxDepth.parent),
        )
        self.assertEqual(
            original._defaultParamMap[original.maxDepth],
            reloaded._defaultParamMap[reloaded.maxDepth],
            "Loaded DecisionTreeRegressor instance default params did not match " +
            "original defaults",
        )
    finally:
        # Best-effort removal; never let cleanup hide a test failure.
        rmtree(path, ignore_errors=True)