def test_kmeans(self):
    """Check that a KMeans estimator survives a save/load round trip.

    Saves an unfitted ``KMeans(k=2, seed=1)`` under a fresh temporary
    directory, loads it back, and verifies that:

    * the uid (value and type) is unchanged,
    * the loaded instance's ``k`` Param reports the loaded instance's uid
      as its parent,
    * the default value recorded for ``k`` is the same before and after.
    """
    kmeans = KMeans(k=2, seed=1)
    path = tempfile.mkdtemp()
    try:
        km_path = path + "/km"
        kmeans.save(km_path)
        kmeans2 = KMeans.load(km_path)

        self.assertEqual(kmeans.uid, kmeans2.uid)
        self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
        self.assertEqual(
            kmeans2.uid, kmeans2.k.parent,
            "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
            % (kmeans2.uid, kmeans2.k.parent))
        self.assertEqual(
            kmeans._defaultParamMap[kmeans.k],
            kmeans2._defaultParamMap[kmeans2.k],
            "Loaded KMeans instance default params did not match "
            "original defaults")
    finally:
        # Remove the temp directory even when save/load or an assertion
        # fails, so failed runs do not leave directories behind. Removal
        # itself stays best-effort: an OSError here must not mask the
        # real test outcome.
        try:
            rmtree(path)
        except OSError:
            pass
def test_kmeans(self):
    """Check that a KMeans estimator survives a save/load round trip.

    Saves an unfitted ``KMeans(k=2, seed=1)`` under a fresh temporary
    directory, loads it back, and verifies that:

    * the uid (value and type) is unchanged,
    * the loaded instance's ``k`` Param reports the loaded instance's uid
      as its parent,
    * the default value recorded for ``k`` is the same before and after.
    """
    kmeans = KMeans(k=2, seed=1)
    path = tempfile.mkdtemp()
    try:
        km_path = path + "/km"
        kmeans.save(km_path)
        kmeans2 = KMeans.load(km_path)

        self.assertEqual(kmeans.uid, kmeans2.uid)
        self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
        self.assertEqual(
            kmeans2.uid, kmeans2.k.parent,
            "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
            % (kmeans2.uid, kmeans2.k.parent))
        self.assertEqual(
            kmeans._defaultParamMap[kmeans.k],
            kmeans2._defaultParamMap[kmeans2.k],
            "Loaded KMeans instance default params did not match "
            "original defaults")
    finally:
        # Remove the temp directory even when save/load or an assertion
        # fails, so failed runs do not leave directories behind. Removal
        # itself stays best-effort: an OSError here must not mask the
        # real test outcome.
        try:
            rmtree(path)
        except OSError:
            pass
def k_means():
    """Walk through fitting, inspecting, saving and reloading a KMeans model.

    Builds a four-row DataFrame of 2-D dense vectors, fits ``KMeans(k=2,
    seed=1)`` on it, queries the fitted model and its summary, then persists
    both the estimator and the fitted model under the current working
    directory and loads them back.

    The bare expressions below are kept from the original walkthrough; the
    trailing comments record the value each one is expected to produce.
    Nothing is returned.

    Side effects: obtains (or creates) a SparkSession and writes
    ``kmeans`` and ``kmeans_model`` under ``./``. Existing output at those
    paths is overwritten so the function can be run repeatedly.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    from pyspark.ml.linalg import Vectors

    data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
            (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
    df = spark.createDataFrame(data, ["features"])

    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)

    centers = model.clusterCenters()
    len(centers)  # 2
    # NOTE(review): computeCost is not available in every Spark release;
    # confirm it exists in the version this file targets.
    model.computeCost(df)  # 2.000...

    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction  # True
    rows[2].prediction == rows[3].prediction  # True

    model.hasSummary  # True
    summary = model.summary
    summary.k  # 2
    summary.clusterSizes  # [2, 2]

    temp_path = "./"

    # write().overwrite() instead of save(): the target paths are fixed, so
    # without overwrite a second run is expected to fail once they exist.
    kmeans_path = temp_path + "/kmeans"
    kmeans.write().overwrite().save(kmeans_path)
    kmeans2 = KMeans.load(kmeans_path)
    kmeans2.getK()  # 2

    model_path = temp_path + "/kmeans_model"
    model.write().overwrite().save(model_path)
    model2 = KMeansModel.load(model_path)
    model2.hasSummary  # False

    # Element-wise comparison of each cluster center before and after reload.
    model.clusterCenters()[0] == model2.clusterCenters()[0]  # array([ True, True], dtype=bool)
    model.clusterCenters()[1] == model2.clusterCenters()[1]