def test_copy(self): dataset = self.spark.createDataFrame([(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]) iee = InducedErrorEstimator() evaluator = RegressionEvaluator(metricName="r2") grid = ParamGridBuilder().addGrid(iee.inducedError, [100.0, 0.0, 10000.0]).build() tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator, collectSubModels=True) tvsModel = tvs.fit(dataset) tvsCopied = tvs.copy() tvsModelCopied = tvsModel.copy() for param in [ lambda x: x.getCollectSubModels(), lambda x: x.getParallelism(), lambda x: x.getSeed(), lambda x: x.getTrainRatio(), ]: self.assertEqual(param(tvs), param(tvsCopied)) for param in [ lambda x: x.getSeed(), lambda x: x.getTrainRatio(), ]: self.assertEqual(param(tvsModel), param(tvsModelCopied)) self.assertEqual( tvs.getEstimator().uid, tvsCopied.getEstimator().uid, "Copied TrainValidationSplit has the same uid of Estimator", ) self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid) self.assertEqual( len(tvsModel.validationMetrics), len(tvsModelCopied.validationMetrics), "Copied validationMetrics has the same size of the original", ) for index in range(len(tvsModel.validationMetrics)): self.assertEqual(tvsModel.validationMetrics[index], tvsModelCopied.validationMetrics[index]) tvsModel.validationMetrics[0] = "foo" self.assertNotEqual( tvsModelCopied.validationMetrics[0], "foo", "Changing the original validationMetrics should not affect the copied model", ) tvsModel.subModels[0].getInducedError = lambda: "foo" self.assertNotEqual( tvsModelCopied.subModels[0].getInducedError(), "foo", "Changing the original subModels should not affect the copied model", )
def test_save_load_simple_estimator(self): # This tests saving and loading the trained model only. # Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() dataset = self.spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), (Vectors.dense([0.4]), 1.0), (Vectors.dense([0.5]), 0.0), (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) lr = LogisticRegression() grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() evaluator = BinaryClassificationEvaluator() tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsPath = temp_path + "/tvs" tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) self.assert_param_maps_equal( loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps()) tvsModelPath = temp_path + "/tvsModel" tvsModel.save(tvsModelPath) loadedModel = TrainValidationSplitModel.load(tvsModelPath) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_copy(self): dataset = self.spark.createDataFrame([ (10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]) iee = InducedErrorEstimator() evaluator = RegressionEvaluator(metricName="r2") grid = ParamGridBuilder() \ .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \ .build() tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsCopied = tvs.copy() tvsModelCopied = tvsModel.copy() self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid, "Copied TrainValidationSplit has the same uid of Estimator") self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid) self.assertEqual(len(tvsModel.validationMetrics), len(tvsModelCopied.validationMetrics), "Copied validationMetrics has the same size of the original") for index in range(len(tvsModel.validationMetrics)): self.assertEqual(tvsModel.validationMetrics[index], tvsModelCopied.validationMetrics[index])
def test_save_load_simple_estimator(self): # This tests saving and loading the trained model only. # Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() dataset = self.spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), (Vectors.dense([0.4]), 1.0), (Vectors.dense([0.5]), 0.0), (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) lr = LogisticRegression() grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() evaluator = BinaryClassificationEvaluator() tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsPath = temp_path + "/tvs" tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps()) tvsModelPath = temp_path + "/tvsModel" tvsModel.save(tvsModelPath) loadedModel = TrainValidationSplitModel.load(tvsModelPath) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_copy(self): dataset = self.spark.createDataFrame([(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]) iee = InducedErrorEstimator() evaluator = RegressionEvaluator(metricName="r2") grid = ParamGridBuilder() \ .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \ .build() tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsCopied = tvs.copy() tvsModelCopied = tvsModel.copy() self.assertEqual( tvs.getEstimator().uid, tvsCopied.getEstimator().uid, "Copied TrainValidationSplit has the same uid of Estimator") self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid) self.assertEqual( len(tvsModel.validationMetrics), len(tvsModelCopied.validationMetrics), "Copied validationMetrics has the same size of the original") for index in range(len(tvsModel.validationMetrics)): self.assertEqual(tvsModel.validationMetrics[index], tvsModelCopied.validationMetrics[index])
def _run_test_save_load_nested_estimator(self, LogisticRegressionCls): # This tests saving and loading the trained model only. # Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() dataset = self.spark.createDataFrame( [ (Vectors.dense([0.0]), 0.0), (Vectors.dense([0.4]), 1.0), (Vectors.dense([0.5]), 0.0), (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0), ] * 10, ["features", "label"], ) ova = OneVsRest(classifier=LogisticRegressionCls()) lr1 = LogisticRegressionCls().setMaxIter(100) lr2 = LogisticRegressionCls().setMaxIter(150) grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build() evaluator = MulticlassClassificationEvaluator() tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsPath = temp_path + "/tvs" tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), grid) self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) originalParamMap = tvs.getEstimatorParamMaps() loadedParamMap = loadedTvs.getEstimatorParamMaps() for i, param in enumerate(loadedParamMap): for p in param: if p.name == "classifier": self.assertEqual(param[p].uid, originalParamMap[i][p].uid) else: self.assertEqual(param[p], originalParamMap[i][p]) tvsModelPath = temp_path + "/tvsModel" tvsModel.save(tvsModelPath) loadedModel = TrainValidationSplitModel.load(tvsModelPath) self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_save_load_nested_estimator(self): # This tests saving and loading the trained model only. # Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() dataset = self.spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), (Vectors.dense([0.4]), 1.0), (Vectors.dense([0.5]), 0.0), (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) ova = OneVsRest(classifier=LogisticRegression()) lr1 = LogisticRegression().setMaxIter(100) lr2 = LogisticRegression().setMaxIter(150) grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build() evaluator = MulticlassClassificationEvaluator() tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsPath = temp_path + "/tvs" tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) originalParamMap = tvs.getEstimatorParamMaps() loadedParamMap = loadedTvs.getEstimatorParamMaps() for i, param in enumerate(loadedParamMap): for p in param: if p.name == "classifier": self.assertEqual(param[p].uid, originalParamMap[i][p].uid) else: self.assertEqual(param[p], originalParamMap[i][p]) tvsModelPath = temp_path + "/tvsModel" tvsModel.save(tvsModelPath) loadedModel = TrainValidationSplitModel.load(tvsModelPath) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_save_load_pipeline_estimator(self): temp_path = tempfile.mkdtemp() training = self.spark.createDataFrame([ (0, "a b c d e spark", 1.0), (1, "b d", 0.0), (2, "spark f g h", 1.0), (3, "hadoop mapreduce", 0.0), (4, "b spark who", 1.0), (5, "g d a y", 0.0), (6, "spark fly", 1.0), (7, "was mapreduce", 0.0), ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") ova = OneVsRest(classifier=LogisticRegression()) lr1 = LogisticRegression().setMaxIter(5) lr2 = LogisticRegression().setMaxIter(10) pipeline = Pipeline(stages=[tokenizer, hashingTF, ova]) paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10, 100]) \ .addGrid(ova.classifier, [lr1, lr2]) \ .build() tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator()) tvsPath = temp_path + "/tvs" tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid) self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) # Run train validation split, and choose the best set of parameters. tvsModel = tvs.fit(training) # test save/load of CrossValidatorModel tvsModelPath = temp_path + "/tvsModel" tvsModel.save(tvsModelPath) loadedModel = TrainValidationSplitModel.load(tvsModelPath) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages)) for loadedStage, originalStage in zip(loadedModel.bestModel.stages, tvsModel.bestModel.stages): self.assertEqual(loadedStage.uid, originalStage.uid) # Test nested pipeline nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])]) tvs2 = TrainValidationSplit(estimator=nested_pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator()) tvs2Path = temp_path + "/tvs2" tvs2.save(tvs2Path) loadedTvs2 = TrainValidationSplit.load(tvs2Path) self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid) self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid) # Run train validation split, and choose the best set of parameters. tvsModel2 = tvs2.fit(training) # test save/load of CrossValidatorModel tvsModelPath2 = temp_path + "/tvsModel2" tvsModel2.save(tvsModelPath2) loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2) self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid) loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1] original_nested_pipeline_model = tvsModel2.bestModel.stages[1] self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid) self.assertEqual(len(loaded_nested_pipeline_model.stages), len(original_nested_pipeline_model.stages)) for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages, original_nested_pipeline_model.stages): self.assertEqual(loadedStage.uid, originalStage.uid)