import json
import tempfile

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWriter
from pyspark.sql import SparkSession


def test_default_read_write(self):
    temp_path = tempfile.mkdtemp()

    lr = LogisticRegression()
    lr.setMaxIter(50)
    lr.setThreshold(0.75)
    writer = DefaultParamsWriter(lr)

    savePath = temp_path + "/lr"
    writer.save(savePath)

    reader = DefaultParamsReadable.read()
    lr2 = reader.load(savePath)

    self.assertEqual(lr.uid, lr2.uid)
    self.assertEqual(lr.extractParamMap(), lr2.extractParamMap())

    # test overwrite
    lr.setThreshold(0.8)
    writer.overwrite().save(savePath)

    reader = DefaultParamsReadable.read()
    lr3 = reader.load(savePath)

    self.assertEqual(lr.uid, lr3.uid)
    self.assertEqual(lr.extractParamMap(), lr3.extractParamMap())
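# For reference (an illustrative sketch, not output produced by these tests):
# DefaultParamsWriter serializes instance metadata as a single JSON object whose
# top-level keys include "class", "timestamp", "sparkVersion", "uid", "paramMap"
# (user-set params), and "defaultParamMap" (default params). The concrete values
# below are made-up placeholders, not real output.
#
#   {
#     "class": "pyspark.ml.classification.LogisticRegression",
#     "timestamp": 1234567890,
#     "sparkVersion": "3.x.x",
#     "uid": "LogisticRegression_abc123",
#     "paramMap": {"maxIter": 50, "threshold": 0.75},
#     "defaultParamMap": {"predictionCol": "prediction", ...}
#   }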
def test_default_read_write_default_params(self):
    lr = LogisticRegression()
    self.assertFalse(lr.isSet(lr.getParam("threshold")))

    lr.setMaxIter(50)
    lr.setThreshold(0.75)

    # `threshold` is set by user, default param `predictionCol` is not set by user.
    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    writer = DefaultParamsWriter(lr)
    metadata = json.loads(writer._get_metadata_to_save(lr, self.sc))
    self.assertTrue("defaultParamMap" in metadata)

    reader = DefaultParamsReadable.read()
    metadataStr = json.dumps(metadata, separators=[",", ":"])
    loadedMetadata = reader._parseMetaData(metadataStr)
    reader.getAndSetParams(lr, loadedMetadata)

    self.assertTrue(lr.isSet(lr.getParam("threshold")))
    self.assertFalse(lr.isSet(lr.getParam("predictionCol")))
    self.assertTrue(lr.hasDefault(lr.getParam("predictionCol")))

    # Manually create metadata without the `defaultParamMap` section.
    del metadata["defaultParamMap"]
    metadataStr = json.dumps(metadata, separators=[",", ":"])
    loadedMetadata = reader._parseMetaData(metadataStr)
    with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"):
        reader.getAndSetParams(lr, loadedMetadata)

    # Prior to 2.4.0, metadata doesn't have `defaultParamMap`.
    metadata["sparkVersion"] = "2.3.0"
    metadataStr = json.dumps(metadata, separators=[",", ":"])
    loadedMetadata = reader._parseMetaData(metadataStr)
    reader.getAndSetParams(lr, loadedMetadata)
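# A minimal standalone sketch (an assumption, not part of the test suite above) of the
# public persistence API that the DefaultParamsWriter/DefaultParamsReader pair mirrors:
# Params-based estimators expose write()/save() plus a load() classmethod. The function
# name, app name, and temp path below are hypothetical.
def _persistence_roundtrip_sketch():
    spark = SparkSession.builder.appName("persistence-roundtrip-sketch").getOrCreate()

    lr = LogisticRegression(maxIter=50, threshold=0.75)
    path = tempfile.mkdtemp() + "/lr"

    # Same round trip the tests above drive through the writer/reader classes directly.
    lr.write().overwrite().save(path)
    lr2 = LogisticRegression.load(path)

    assert lr.uid == lr2.uid
    assert lr.extractParamMap() == lr2.extractParamMap()

    spark.stop()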
def estimator_transformer():
    spark = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )

    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame(
        [
            (1.0, Vectors.dense([0.0, 1.1, 0.1])),
            (0.0, Vectors.dense([2.0, 1.0, -1.0])),
            (0.0, Vectors.dense([2.0, 1.3, 1.0])),
            (1.0, Vectors.dense([0.0, 1.2, -0.5])),
        ],
        ["label", "features"],
    )

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("\nLogisticRegression parameters:\n" + lr.explainParams() + "\n")
    lr.setMaxIter(10).setRegParam(0.01).setAggregationDepth(5)

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
    print("\nModel 1 was fit using parameters: ")
    print(model1.extractParamMap())

    # We may alternatively specify parameters using a Python dictionary as a paramMap.
    paramMap = {lr.maxIter: 20}
    paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
    paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.

    # You can combine paramMaps, which are python dictionaries.
    paramMap2 = {lr.probabilityCol: "myProbability"}  # Change output column name.
    paramMapCombined = paramMap.copy()
    paramMapCombined.update(paramMap2)

    # Now learn a new model using the paramMapCombined parameters.
    # paramMapCombined overrides all parameters set earlier via lr.set* methods.
    model2 = lr.fit(training, paramMapCombined)
    print("\nModel 2 was fit using parameters: ")
    print(model2.extractParamMap())

    # Prepare test data.
    test = spark.createDataFrame(
        [
            (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
            (0.0, Vectors.dense([3.0, 2.0, -0.1])),
            (1.0, Vectors.dense([0.0, 2.2, -1.5])),
        ],
        ["label", "features"],
    )

    # Make predictions on test data using the Transformer.transform() method.
    # LogisticRegression.transform will only use the 'features' column.
    # Note that model2.transform() outputs a "myProbability" column instead of the usual
    # 'probability' column since we renamed the lr.probabilityCol parameter previously.
    prediction = model2.transform(test)
    result = prediction.select("features", "label", "myProbability", "prediction").collect()

    for row in result:
        print(
            "features=%s, label=%s -> prob=%s, prediction=%s"
            % (row.features, row.label, row.myProbability, row.prediction)
        )

    spark.stop()
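# Hypothetical entry point (an assumption, not in the original excerpt) so the example
# above can be run directly with `python` or via spark-submit.
if __name__ == "__main__":
    estimator_transformer()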