    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [
            self.sc.parallelize(batch, 1) for batch in predict_data
        ]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        eventually(condition, catch_assertions=True)
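This test relies on a harness that the snippet does not show: self.sc, self.ssc, and the eventually helper that keeps retrying the condition until it passes or a timeout expires. A minimal sketch of what such a setup could look like is below; the class name, master string, and batch interval are assumptions, not taken from the original.

import unittest

from pyspark import SparkContext
from pyspark.streaming import StreamingContext


class StreamingKMeansTestCase(unittest.TestCase):
    """Hypothetical harness supplying the sc/ssc used by the test above."""

    def setUp(self):
        self.sc = SparkContext("local[2]", "streaming-kmeans-tests")
        self.ssc = StreamingContext(self.sc, 1.0)  # 1-second batch interval

    def tearDown(self):
        self.ssc.stop(stopSparkContext=True)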
Example #2
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(result, [[0], [1], [2], [3]])
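_ssc_wait is part of the same (older) test harness and is also not shown; its role is to let the StreamingContext process the queued batches for a fixed wall-clock window before the assertion runs. A hypothetical stand-in is sketched below (on the test class it would be a @staticmethod); the exact behaviour of the original helper may differ.

from time import sleep, time

def _ssc_wait(start_time, end_time, sleep_time):
    # Block until `end_time` seconds have elapsed since `start_time`,
    # giving the queued streaming batches time to be processed.
    while time() - start_time < end_time:
        sleep(sleep_time)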
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import StreamingKMeansModel


def test_streaming_kmeans():
    # get_data_from_db() and the fitted `scaler` used below are assumed to be
    # defined elsewhere in the original module (a data loader and a feature
    # scaler fitted on the training data).
    records = get_data_from_db()

    conf = SparkConf().setAppName("testingClusters").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    initCenters = [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.5, 0.5, 0.5]]
    initWeights = [1.0, 1.0, 1.0]
    stkm = StreamingKMeansModel(initCenters, initWeights)
    recordRDD = sc.parallelize(records)
    stkm = stkm.update(data=recordRDD, decayFactor=1.0, timeUnit=u"batches")

    for center in stkm.centers:
        print(center)

    sample_data = np.array([46.5, 23.0, 1034.0]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)

    stkm = stkm.update(sc.parallelize(record_scaled), 1.0, u"points")

    sample_data = np.array([46.2, 23.5, 1034.32]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)
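The `scaler` used above is never defined in this function; it has to be a transformer fitted on the same three features as the training records. A minimal assumption of what it could be, scikit-learn's StandardScaler fitted on the raw records, is sketched here; the actual preprocessing in the original code is not shown and may differ.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Hypothetical: fit the scaler on the raw training records so that
# scaler.transform(...) above produces features on the same scale.
records = np.array(get_data_from_db(), dtype=float)
scaler = StandardScaler().fit(records)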
Example #4
import dautil as dl
from pyspark.mllib.clustering import StreamingKMeansModel
from pyspark import SparkContext

csv_file = dl.data.get_direct_marketing_csv()
csv_rows = dl.data.read_csv(csv_file)

# 28 clusters, all initialized at the origin with unit weights
stkm = StreamingKMeansModel(28 * [[0., 0., 0.]], 28 * [1.])
sc = SparkContext()

for row in csv_rows:
    spend = dl.data.centify(row['spend'])

    if spend > 0:
        history = dl.data.centify(row['history'])
        data = sc.parallelize([[int(row['recency']), history, spend]])
        # decayFactor=0. means only the newest points influence the centers
        stkm = stkm.update(data, 0., 'points')

print(stkm.centers)
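For context, MLlib documents the streaming k-means center update as c' = (c*n*a + x*m) / (n*a + m), where n is the cluster's current weight, x and m are the mean and count of the points assigned to it in the new batch, and a is the decayFactor passed to update(). The numpy sketch below evaluates that formula once for a single cluster; the numbers are illustrative only.

import numpy as np

c, n = np.array([0.0, 0.0, 0.0]), 1.0      # current center and its weight
x, m = np.array([12.0, 5.0, 30.0]), 1.0    # mean and count of the new points
a = 0.0                                    # decayFactor (as in the loop above)

c_new = (c * n * a + x * m) / (n * a + m)
print(c_new)  # -> [12.  5. 30.]: with a = 0 the center moves to the new mean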
Example #6
    # Fragment: this code sits inside a larger function; the loop that fills
    # truePredictions from the ground-truth labels is only partially shown.
            if sys.argv[1] == "sset1":
                truePredictions.append(int(c[0]) - 1)
            elif sys.argv[1] == "asset1":
                truePredictions.append(int(c[0]) - 1)
            else:
                truePredictions.append(int(c[0]))

    with open("datasets/" + sys.argv[1] + ".txt", "r") as fichero:
        for linea in fichero:
            points.append(linea.strip("\n").split())

    # the document(s) read from the cursor hold the stored model parameters
    for document in cursor:
        centers = document["clusterCenters"]
        weights = document["clusterWeights"]

    stkm = StreamingKMeansModel(centers, weights)

    predictions = []

    for point in points:
        # dataset rows are read as strings, so cast to float before predicting
        predictions.append(stkm.predict([float(x) for x in point]))

    recall = recall_score(truePredictions, predictions, average='weighted')

    precision = precision_score(truePredictions, predictions, average='weighted')

    f1Score = 2 * (precision * recall) / (precision + recall)

    logging.info("Recall = " + str(recall))
    logging.info("Precision = " + str(precision))
    logging.info("F1-Score = " + str(f1Score))
Example #7
import os
from json import loads

from kafka import KafkaConsumer
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.clustering import KMeansModel, StreamingKMeansModel

# pull in the Kafka streaming package when pyspark launches
os.environ[
    'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'

sc = SparkContext(appName="PythonSparkStreamingKafka")

sc.setLogLevel("WARN")

# 120-second batch interval
ssc = StreamingContext(sc, 120)

# one initial weight per cluster center of the offline model
initWeights = [
    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
]

# warm-start the streaming model from a previously trained offline KMeans model
offlineModel = KMeansModel.load(sc, "KMeansModel")

stkm = StreamingKMeansModel(offlineModel.clusterCenters, initWeights)

#kafkaStream = KafkaUtils.createDirectStream(ssc, ['test'], {'metadata.broker.list': 'localhost:9092'})

#lines = kafkaStream.map(lambda line: array([float(x) for x in line.split('\t')]))

consumer = KafkaConsumer('test',
                         bootstrap_servers=['localhost:9092'],
                         value_deserializer=lambda x: loads(x.decode('utf-8')),
                         consumer_timeout_ms=10000)

colors = [
    'r', 'k', 'b', 'grey', 'darkorange', 'm', 'y', 'c', 'gold', 'slateblue',
    'beige', 'coral', 'g', 'peru', 'pink'
]
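The listing ends with the setup, so the consumption loop is not shown. Purely as an illustration of how the pieces above could be combined (the message format is an assumption, not taken from the original), one might poll the KafkaConsumer, update the model, and predict per message:

import numpy as np

# Illustrative only: assumes each Kafka message decodes to a list of floats.
for message in consumer:
    point = np.array(message.value, dtype=float)
    batch = sc.parallelize([point])
    stkm = stkm.update(batch, decayFactor=1.0, timeUnit="points")
    cluster = stkm.predict(point)
    print("point", point, "-> cluster", cluster)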