Example #1
def test_kmeans_param(self):
    algo = KMeans()
    self.assertEqual(algo.getInitMode(), "k-means||")
    algo.setK(10)
    self.assertEqual(algo.getK(), 10)
    algo.setInitSteps(10)
    self.assertEqual(algo.getInitSteps(), 10)
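PySpark estimators also accept their Params as constructor keyword arguments, so the setter calls above can be collapsed into the constructor; a minimal equivalent sketch:

from pyspark.ml.clustering import KMeans

# Same configuration, expressed as constructor keyword arguments
algo = KMeans(k=10, initSteps=10)
assert algo.getK() == 10
assert algo.getInitSteps() == 10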
Example #2
    def _get_kmeans_instance(self, k):
        kmeans = KMeans()

        kmeans.setK(k).setFeaturesCol('features').setPredictionCol(
            'prediction')

        return kmeans
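The chain works because every set*() call returns the estimator itself. A usage sketch for this helper, assuming a test class that provides it and a DataFrame df with a 'features' vector column (both hypothetical here):

kmeans = self._get_kmeans_instance(3)
model = kmeans.fit(df)            # KMeansModel
clustered = model.transform(df)   # adds the 'prediction' column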
Example #3
def test_kmeans_param(self):
    algo = KMeans()
    self.assertEqual(algo.getInitMode(), "k-means||")
    algo.setK(10)
    self.assertEqual(algo.getK(), 10)
    algo.setInitSteps(10)
    self.assertEqual(algo.getInitSteps(), 10)
    self.assertEqual(algo.getDistanceMeasure(), "euclidean")
    algo.setDistanceMeasure("cosine")
    self.assertEqual(algo.getDistanceMeasure(), "cosine")
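When the model is trained with the cosine distance, the evaluation metric should normally use it as well; a hedged sketch with ClusteringEvaluator, assuming a predictions DataFrame obtained from model.transform(df):

from pyspark.ml.evaluation import ClusteringEvaluator

# Keep the evaluator's distance measure consistent with the model's
evaluator = ClusteringEvaluator(distanceMeasure="cosine")
silhouette = evaluator.evaluate(predictions)
print("silhouette (cosine):", silhouette)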
Example #4
from pyspark.sql import Row, SparkSession
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.linalg import Vectors

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("ChiSqSelectorExample") \
        .getOrCreate()
    rawData = spark.sparkContext.textFile("file:///home/tianlei/iris.txt")

    def f(x):
        rel = {}
        rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]),
                                        float(x[3]))
        return rel

    df = rawData.map(lambda line: line.split(',')).map(
        lambda p: Row(**f(p))).toDF()
    # Once the data is loaded, we follow the usual flow of the ML package:
    # create an Estimator and call its fit() method to produce the
    # corresponding Transformer. Here the KMeans class is the Estimator,
    # while the KMeansModel class, which stores the trained model, is the
    # Transformer:
    kmeansmodel = KMeans().setK(3).setFeaturesCol('features').setPredictionCol(
        'prediction').fit(df)
    # As a Transformer, KMeansModel no longer provides a predict()-style
    # method; instead it offers a uniform transform() method, which processes
    # the whole dataset stored in the DataFrame and produces a dataset with
    # the predicted cluster labels:
    results = kmeansmodel.transform(df).collect()
    for item in results:
        print(str(item[0]) + ' is predicted as cluster ' + str(item[1]))
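The fitted model also exposes the learned centroids; a small addition that continues the script above:

# Each cluster center is a numpy array with one entry per feature
for center in kmeansmodel.clusterCenters():
    print("cluster center:", center)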
Example #5
# NOTE: the original snippet is truncated at this point; this preamble is
# an assumption, and the data path below is a placeholder.
from pyspark.sql import Row, SparkSession
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
rawData = spark.sparkContext.textFile("iris.txt")  # placeholder path


def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]),
                                    float(x[3]))
    return rel


df = rawData.map(lambda line: line.split(',')).map(
    lambda p: Row(**f(p))).toDF()

kmeans = KMeans()

kmeans.setK(3).setFeaturesCol('features').setPredictionCol('prediction')

kmeansmodel = kmeans.fit(df)

# kmeans_path = "./kmeans"
# kmeans.save(kmeans_path)
# kmeans2 = KMeans.load(kmeans_path)
# kmeans2.getK()

model_path = "./kmeans_model"
kmeansmodel.write().overwrite().save(model_path)
model2 = KMeansModel.load(model_path)

print("----------- classified result ---------")
transformed = kmeansmodel.transform(df)
results = transformed.collect()
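The listing cuts off before the results are printed; a likely continuation, modeled on Example #4, that also checks the reloaded model2 reproduces the original assignments:

for item in results:
    print(str(item[0]) + ' is predicted as cluster ' + str(item[1]))

# The model reloaded from disk should yield identical cluster labels
results2 = model2.transform(df).collect()
assert [r['prediction'] for r in results2] == \
       [r['prediction'] for r in results]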