Example 1
 # Requires: import tempfile; from shutil import rmtree; from pyspark.ml.clustering import KMeans
 def test_kmeans(self):
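     # Build an unfitted KMeans estimator and write it to a temporary directory.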
     kmeans = KMeans(k=2, seed=1)
     path = tempfile.mkdtemp()
     km_path = path + "/km"
     kmeans.save(km_path)
     kmeans2 = KMeans.load(km_path)
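     # The reloaded estimator must keep the same uid, and its params must point back to that uid.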
     self.assertEqual(kmeans.uid, kmeans2.uid)
     self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
     self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                      "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                      % (kmeans2.uid, kmeans2.k.parent))
     self.assertEqual(kmeans._defaultParamMap[kmeans.k], kmeans2._defaultParamMap[kmeans2.k],
                      "Loaded KMeans instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Example 2
from shutil import rmtree
import tempfile

from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession


def k_means():
    # Start (or reuse) a local SparkSession for this example.
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
            (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
    df = spark.createDataFrame(data, ["features"])
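    # Fit a 2-cluster KMeans model on the toy DataFrame.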
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    centers = model.clusterCenters()
    len(centers)
    # 2
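    # Note: KMeansModel.computeCost was deprecated in Spark 2.4 (use summary.trainingCost instead).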
    model.computeCost(df)
    # 2.000...
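    # Points that are close together should receive the same cluster label.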
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction
    # True
    rows[2].prediction == rows[3].prediction
    # True
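    # A freshly fitted model carries a training summary (k, cluster sizes, ...).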
    model.hasSummary
    # True
    summary = model.summary
    summary.k
    # 2
    summary.clusterSizes
    # [2, 2]
    temp_path = "./"
    kmeans_path = temp_path + "/kmeans"
    kmeans.save(kmeans_path)
    kmeans2 = KMeans.load(kmeans_path)
    kmeans2.getK()
    # 2
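    # Persist the fitted model as well; the training summary is not saved with it.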
    model_path = temp_path + "/kmeans_model"
    model.save(model_path)
    model2 = KMeansModel.load(model_path)
    model2.hasSummary
    # False
    # Cluster centers survive the save/load round trip.
    model.clusterCenters()[0] == model2.clusterCenters()[0]
    # array([ True,  True], dtype=bool)
    model.clusterCenters()[1] == model2.clusterCenters()[1]
    # array([ True,  True], dtype=bool)
    # Clean up the temporary directory and stop the session.
    try:
        rmtree(temp_path)
    except OSError:
        pass
    spark.stop()