def test_copy(self): dataset = self.spark.createDataFrame([ (10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]) iee = InducedErrorEstimator() evaluator = RegressionEvaluator(metricName="r2") grid = ParamGridBuilder() \ .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \ .build() tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsCopied = tvs.copy() tvsModelCopied = tvsModel.copy() self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid, "Copied TrainValidationSplit has the same uid of Estimator") self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid) self.assertEqual(len(tvsModel.validationMetrics), len(tvsModelCopied.validationMetrics), "Copied validationMetrics has the same size of the original") for index in range(len(tvsModel.validationMetrics)): self.assertEqual(tvsModel.validationMetrics[index], tvsModelCopied.validationMetrics[index])
def test_copy(self): dataset = self.spark.createDataFrame([(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]) iee = InducedErrorEstimator() evaluator = RegressionEvaluator(metricName="r2") grid = ParamGridBuilder().addGrid(iee.inducedError, [100.0, 0.0, 10000.0]).build() tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator, collectSubModels=True) tvsModel = tvs.fit(dataset) tvsCopied = tvs.copy() tvsModelCopied = tvsModel.copy() for param in [ lambda x: x.getCollectSubModels(), lambda x: x.getParallelism(), lambda x: x.getSeed(), lambda x: x.getTrainRatio(), ]: self.assertEqual(param(tvs), param(tvsCopied)) for param in [ lambda x: x.getSeed(), lambda x: x.getTrainRatio(), ]: self.assertEqual(param(tvsModel), param(tvsModelCopied)) self.assertEqual( tvs.getEstimator().uid, tvsCopied.getEstimator().uid, "Copied TrainValidationSplit has the same uid of Estimator", ) self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid) self.assertEqual( len(tvsModel.validationMetrics), len(tvsModelCopied.validationMetrics), "Copied validationMetrics has the same size of the original", ) for index in range(len(tvsModel.validationMetrics)): self.assertEqual(tvsModel.validationMetrics[index], tvsModelCopied.validationMetrics[index]) tvsModel.validationMetrics[0] = "foo" self.assertNotEqual( tvsModelCopied.validationMetrics[0], "foo", "Changing the original validationMetrics should not affect the copied model", ) tvsModel.subModels[0].getInducedError = lambda: "foo" self.assertNotEqual( tvsModelCopied.subModels[0].getInducedError(), "foo", "Changing the original subModels should not affect the copied model", )
def test_copy(self): dataset = self.spark.createDataFrame([(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]) iee = InducedErrorEstimator() evaluator = RegressionEvaluator(metricName="r2") grid = ParamGridBuilder() \ .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \ .build() tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) tvsCopied = tvs.copy() tvsModelCopied = tvsModel.copy() self.assertEqual( tvs.getEstimator().uid, tvsCopied.getEstimator().uid, "Copied TrainValidationSplit has the same uid of Estimator") self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid) self.assertEqual( len(tvsModel.validationMetrics), len(tvsModelCopied.validationMetrics), "Copied validationMetrics has the same size of the original") for index in range(len(tvsModel.validationMetrics)): self.assertEqual(tvsModel.validationMetrics[index], tvsModelCopied.validationMetrics[index])