Python StreamingKMeans Examples, pyspark.mllib.clustering.StreamingKMeans Python Examples

Example #1

0

Show file

File: test_streaming_algorithms.py Project: JingchengDu/spark

    def test_predictOn_model(self):
        """Check that predictOn yields clusters 0..3 for the four toy batches."""
        quadrant_centers = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm = StreamingKMeans()
        # Install a model directly so the test does not depend on training.
        stkm._model = StreamingKMeansModel(
            clusterCenters=quadrant_centers,
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        # One single-point batch per center, listed in the same order.
        points = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        rdd_queue = []
        for batch in points:
            rdd_queue.append(self.sc.parallelize(batch, 1))
        predictions = stkm.predictOn(self.ssc.queueStream(rdd_queue))

        collected = []

        def record_non_empty(rdd):
            # Batches that produced no rows are ignored.
            rows = rdd.collect()
            if not rows:
                return
            collected.append(rows)

        predictions.foreachRDD(record_non_empty)
        self.ssc.start()

        def all_batches_predicted():
            self.assertEqual(collected, [[0], [1], [2], [3]])
            return True

        self._eventually(all_batches_predicted, catch_assertions=True)

Example #2

0

Show file

File: test_streaming_algorithms.py Project: JingchengDu/spark

    def test_trainOn_model(self):
        """Train on four toy batches and check the final centers and weights."""
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm = StreamingKMeans()
        stkm.setInitialCenters(
            centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

        # Every batch contains each initial center shifted by one tiny offset.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        shifted_batches = [
            [[dx + cx, dy + cy] for cx, cy in initCenters]
            for dx, dy in offsets
        ]

        rdd_queue = [self.sc.parallelize(batch, 1) for batch in shifted_batches]
        stkm.trainOn(self.ssc.queueStream(rdd_queue))
        self.ssc.start()

        # Give enough time to train the model.
        def trained():
            model = stkm.latestModel()
            self.assertTrue(all(model.centers == array(initCenters)))
            self.assertEqual(model.clusterWeights, [5.0, 5.0, 5.0, 5.0])
            return True
        self._eventually(trained, catch_assertions=True)

Example #3

0

Show file

File: test_streaming_algorithms.py Project: imback82/spark-4

    def test_predictOn_model(self):
        """Verify predictOn labels the four toy batches as clusters 0 through 3."""
        stkm = StreamingKMeans()
        # The model is injected directly; no training happens in this test.
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        raw_batches = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                       [[1.5, -1.5]]]
        queued = self.ssc.queueStream(
            [self.sc.parallelize(b, 1) for b in raw_batches])
        predicted = stkm.predictOn(queued)

        outputs = []

        def keep_if_non_empty(rdd):
            values = rdd.collect()
            if values:
                outputs.append(values)

        predicted.foreachRDD(keep_if_non_empty)
        self.ssc.start()

        def check():
            self.assertEqual(outputs, [[0], [1], [2], [3]])
            return True

        eventually(check, catch_assertions=True)

Example #4

0

Show file

File: twitter-streaming.py Project: tsiangsun/tdi

def main():
    """Cluster tweet text read from a local socket with streaming k-means.

    Reads JSON lines from ``localhost:PORT`` in 10-second batches, keeps the
    records that have a ``'text'`` field, hashes each text into a ``DIM``-sized
    feature vector, trains a ``StreamingKMeans`` model with ``N`` clusters on
    the stream and prints up to three tweets per cluster for every batch.
    Blocks until the streaming context terminates.
    """
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    # Keep the original text next to its feature vector so predictions can be
    # printed together with the tweet they belong to.
    features = tweets.map(lambda x:
                          (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here: a lambda created directly in the loop would
    # see the final value of i (late binding).
    def print_group(i):
        results.filter(lambda x: x[1] == i).map(lambda x: '%i: %s' %
                                                (x[1], x[0])).pprint(3)

    # range() works on both Python 2 and 3; xrange() exists only on Python 2.
    for i in range(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()

Example #5

0

Show file

File: tests.py Project: rajsingh7/spark

    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        # Inject a fixed model so the test does not depend on training.
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            # Only non-empty batches are recorded.
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(result, [[0], [1], [2], [3]])

Example #6

0

Show file

    def perform_training(sc: SparkContext, params_dict: dict):
        """Train a StreamingKMeans model on a Kafka stream for a bounded time.

        ``params_dict`` may contain ``'batch_duration'`` (default 1) and
        ``'training_duration'`` (default 20). The streaming context is stopped
        afterwards without stopping ``sc``; the latest model is returned.
        """
        batch_duration = params_dict.get('batch_duration', 1)
        training_duration = params_dict.get('training_duration', 20)

        ssc = StreamingContext(sc, batch_duration)
        kvs = KafkaUtils.createDirectStream(
            ssc,
            ['normal-ekg-stream'],
            kafkaParams={'metadata.broker.list': 'localhost:9092'},
            valueDecoder=lambda val: json.loads(val.decode('utf-8')))

        def to_vector(msg):
            # msg[1] is the decoded message value; its 'signal_values' entry
            # becomes one dense vector.
            return Vectors.dense(
                [float(value) for value in msg[1]['signal_values']])

        windowed_signal = kvs.map(to_vector)

        streaming_kmeans = StreamingKMeans(k=20, decayFactor=1.0)
        model = streaming_kmeans.setRandomCenters(188, 1.0, 0)
        model.trainOn(windowed_signal)

        ssc.start()
        ssc.awaitTerminationOrTimeout(training_duration)
        ssc.stop(stopSparkContext=False, stopGraceFully=True)

        return model.latestModel()

Example #7

0

Show file

File: test_streaming_algorithms.py Project: imback82/spark-4

    def test_accuracy_for_single_center(self):
        """Train a one-cluster model on generated data and compare with the truth."""
        centers, batches = self.streamingKMeansDataGenerator(
            batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        rdd_queue = [self.sc.parallelize(batch, 1) for batch in batches]
        stkm.trainOn(self.ssc.queueStream(rdd_queue))

        self.ssc.start()

        def all_points_seen():
            # Presumably 5 batches x 5 points each, hence a total weight of 25.
            self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
            return True

        eventually(all_points_seen, catch_assertions=True)

        summed_centers = array_sum(array(centers), axis=0)
        for dim in range(5):
            learned = stkm.latestModel().centers[0][dim]
            self.assertAlmostEqual(centers[0][dim], learned, 1)
            self.assertAlmostEqual(summed_centers[dim], learned, 1)

Example #8

0

Show file

File: tests.py Project: rajsingh7/spark

    def test_trainOn_predictOn(self):
        """Check that predictions are made with the model updated by training."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # With a decay factor of zero the first batch moves the cluster
        # centers to [-0.5, 0.7]. In the second batch 0.2 and 0.3 are then
        # classified as 1, whereas the initial model would have put them in
        # cluster 0 -- which proves the model was updated.
        first_batch = [[-0.5], [0.6], [0.8]]
        second_batch = [[0.2], [-0.1], [0.3]]
        rdd_queue = [sc.parallelize(first_batch), sc.parallelize(second_batch)]
        input_stream = self.ssc.queueStream(rdd_queue)
        observed = []

        def gather(rdd):
            rows = rdd.collect()
            if rows:
                observed.append(rows)

        stkm.trainOn(input_stream)
        stkm.predictOn(input_stream).foreachRDD(gather)

        started_at = time()
        self.ssc.start()
        self._ssc_wait(started_at, 6.0, 0.01)
        self.assertEqual(observed, [[0, 1, 1], [1, 0, 1]])

Example #9

0

Show file

File: tests.py Project: rajsingh7/spark

    def test_trainOn_model(self):
        """Test the model on toy data with four clusters."""
        stkm = StreamingKMeans()
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm.setInitialCenters(centers=initCenters,
                               weights=[1.0, 1.0, 1.0, 1.0])

        # Create a toy dataset by setting a tiny offset for each point.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        batches = []
        for offset in offsets:
            batches.append([[offset[0] + center[0], offset[1] + center[1]]
                            for center in initCenters])

        batches = [self.sc.parallelize(batch, 1) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        stkm.trainOn(input_stream)
        t = time()
        self.ssc.start()

        # Give enough time to train the model.
        self._ssc_wait(t, 6.0, 0.01)
        finalModel = stkm.latestModel()
        self.assertTrue(all(finalModel.centers == array(initCenters)))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])

Example #10

0

Show file

    def test_model_params(self):
        """Check the setters, the unset-model state and setInitialCenters."""
        stkm = StreamingKMeans()
        stkm.setK(5).setDecayFactor(0.0)
        self.assertEqual(stkm._k, 5)
        self.assertEqual(stkm._decayFactor, 0.0)

        # No centers have been supplied yet: there is no model and training
        # must be rejected.
        self.assertIsNone(stkm.latestModel())
        self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

        stkm.setInitialCenters(
            centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
        expected_centers = [[0.0, 0.0], [1.0, 1.0]]
        expected_weights = [1.0, 1.0]
        self.assertEqual(stkm.latestModel().centers, expected_centers)
        self.assertEqual(stkm.latestModel().clusterWeights, expected_weights)

Example #11

0

Show file

File: tests.py Project: HodaAlemi/spark

    def test_trainOn_predictOn(self):
        """Verify that predictOn uses the model as updated by trainOn."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # The decay factor is zero, so after the first batch the cluster
        # centers become [-0.5, 0.7]. That makes 0.2 and 0.3 fall into
        # cluster 1, although the initial model would have assigned them to
        # cluster 0 -- evidence that the model has been updated.
        raw_batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        rdd_queue = []
        for raw in raw_batches:
            rdd_queue.append(sc.parallelize(raw))
        input_stream = self.ssc.queueStream(rdd_queue)
        predict_results = []

        def keep_non_empty(rdd):
            labels = rdd.collect()
            if not labels:
                return
            predict_results.append(labels)

        stkm.trainOn(input_stream)
        predictions = stkm.predictOn(input_stream)
        predictions.foreachRDD(keep_non_empty)

        start_time = time()
        self.ssc.start()
        self._ssc_wait(start_time, 6.0, 0.01)
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])

Example #12

0

Show file

File: tests.py Project: HodaAlemi/spark

    def test_model_params(self):
        """Test that the model params are set correctly"""
        stkm = StreamingKMeans()
        stkm.setK(5).setDecayFactor(0.0)
        # assertEqual replaces the deprecated assertEquals alias throughout.
        self.assertEqual(stkm._k, 5)
        self.assertEqual(stkm._decayFactor, 0.0)

        # Model not set yet.
        self.assertIsNone(stkm.latestModel())
        self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

        stkm.setInitialCenters(
            centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
        self.assertEqual(
            stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
        self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])

Example #13

0

Show file

File: tests.py Project: HodaAlemi/spark

    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        # Inject a fixed model so the test does not depend on training.
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            # Only non-empty batches are recorded.
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(result, [[0], [1], [2], [3]])

Example #14

0

Show file

class StreamingUpdate(object):
    """Wrap a ``StreamingKMeans`` model that is fed from saved TF-IDF matrices.

    The constructor stores ``init_clusters`` (used as the number of clusters),
    ``decay_factor``, ``time_unit`` and the Spark / streaming contexts ``sc``
    and ``ssc`` on the instance.
    """
    def __init__(self, init_clusters, decay_factor, time_unit, sc, ssc):
        self.init_clusters = init_clusters
        self.decay_factor = decay_factor
        self.time_unit = time_unit
        self.sc = sc
        self.ssc = ssc

    def streaming(self, mnk, clusters, init_clusters):
        """Create the streaming model, seeded with the centers of ``mnk``.

        ``mnk`` must expose ``cluster_centers_``; every center starts with
        weight 1. ``clusters`` is only stored on the instance.
        """
        self.mnk = mnk
        self.clusters = clusters
        self.init_clusters = init_clusters
        self.streaming_kmeans = StreamingKMeans(self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(self.mnk.cluster_centers_, np.ones([self.init_clusters]))

    # Author's note (translated): the dictionary is updated whenever new
    # articles arrive, and the centroid size is then updated to match.
    # Example: the initial dictionary holds 10 words, so a sentence of 5 words
    # is represented by a 5x10 sparse vector. Once the dictionary has grown to
    # 15 words, that sentence has to be re-encoded as a 5x15 sparse vector.
    # Open question: is there a representation that does not require
    # re-encoding the sentence?
    def update_shape(self, docs, dictionary):
        """Reset the model to random centers sized to the current TF-IDF width."""
        self.streaming_kmeans.setRandomCenters(matrix_tfidf(docs, dictionary).shape[1], 1.0, 0)

    def save_matrix_update(self, docs, dictionary):
        """Write the TF-IDF matrix of ``docs`` to the streaming input file."""
        np.savetxt('/home/ducvu/input_streaming.txt', matrix_tfidf(docs, dictionary))

    def load_dstream(self):
        """Load the saved matrix as an RDD of dense vectors.

        The RDD is stored on ``self.dstream`` and also returned, so callers
        can use the result directly.
        """
        self.dstream = self.sc.textFile("/home/ducvu/input_streaming.txt")\
            .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
        return self.dstream

    def make_predict(self, docs, dictionary):
        """Register training data, predict a cluster per row and print top terms.

        Prints, for every predicted cluster, the 15 features with the highest
        mean TF-IDF value.
        """
        # trainOn() only accepts a DStream, so the loaded RDD is wrapped in a
        # one-element queue stream.
        # NOTE(review): training only runs once the streaming context is
        # started; the predictions below use whatever model exists right now.
        self.streaming_kmeans.trainOn(self.ssc.queueStream([self.load_dstream()]))
        # NOTE(review): the dictionary is passed to match the other
        # matrix_tfidf calls in this class -- confirm against its signature.
        matrix = matrix_tfidf(docs, dictionary)
        # latestModel() is the accessor StreamingKMeans provides for the
        # current model.
        model = self.streaming_kmeans.latestModel()
        self.pred_stream = np.array([model.predict(x) for x in matrix])

        df = pd.DataFrame(matrix).groupby(self.pred_stream).mean()
        # Loop-invariant: look the feature names up once.
        feature_names = get_tfidf(dictionary).get_feature_names()
        for i, r in df.iterrows():
            print('\nCluster {0}:'.format(i))
            print(','.join([feature_names[t] for t in np.argsort(r)[-15:]]))

Example #15

0

Show file

File: tests.py Project: HodaAlemi/spark

    def test_accuracy_for_single_center(self):
        """Test that parameters obtained are correct for a single center."""
        centers, batches = self.streamingKMeansDataGenerator(
            batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        input_stream = self.ssc.queueStream(
            [self.sc.parallelize(batch, 1) for batch in batches])
        stkm.trainOn(input_stream)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 10.0, 0.01)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
        realCenters = array_sum(array(centers), axis=0)
        for i in range(5):
            modelCenters = stkm.latestModel().centers[0][i]
            self.assertAlmostEqual(centers[0][i], modelCenters, 1)
            self.assertAlmostEqual(realCenters[i], modelCenters, 1)

Example #16

0

Show file

File: anomalies_detection_spark_streaming.py Project: yerranagumadhu/Hanhan-Spark-Python

    def detect(self, k, t):
        """Cluster the rows, score them and return those scoring above ``t``.

        :param k: not used by the current implementation; the number of
            clusters is fixed at 7 below. NOTE(review): confirm whether ``k``
            was meant to be passed to the model.
        :param t: score threshold; only rows with ``score > t`` are returned.
        :return: the scored DataFrame filtered by the threshold.
        """
        # Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        # Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = StreamingKMeans(k=7,
                                decayFactor=1.0).setRandomCenters(4, 1.0, 0)
        # model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        # Adding the prediction column to df1.
        # StreamingKMeans has no predict() of its own; the model returned by
        # latestModel() does, so that is what gets broadcast.
        modelBC = sc.broadcast(model.latestModel())
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        # Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t)

Example #17

0

Show file

from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    # Train a 2-cluster streaming k-means model on files dropped into
    # ./training and print the cluster centers every 10 seconds.
    sc = SparkContext(appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 2)

    ssc.checkpoint("/tmp/spark")

    def parseTrainingData(line):
        """Turn a comma-separated line into a dense vector of its first two fields."""
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("./training")\
                        .map(parseTrainingData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print("Intial centers: {0}".format(model.latestModel().centers))

    model.trainOn(trainingStream)

    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centers(sc, model):
        """Print the current centers and re-schedule this function in 10 seconds."""
        print("Cluster centers: {0}".format(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    s.run()

Example #18

0

Show file

            #         ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
            #         ptext[clus].set_color(colors[clus])
            #     plt.pause(0.0001)
            #


# Queue used to hand each batch of predictions to the plotting process.
q = multiprocessing.Queue()
f = multiprocessing.Queue()
# Run data_plotting in a separate process, reading from q.
job_for_another_core2 = multiprocessing.Process(target=data_plotting,
                                                args=(q, ))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a StreamingContext with a batch interval of 10 seconds
# (the SparkContext above runs locally with 4 threads).

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
# predictOnValues expects (key, vector) pairs; here the vector is its own key.
testData = trainingData.map(lambda x: (x, x))
testData.pprint()
model = StreamingKMeans(k=clusterNum,
                        decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
# Printed before ssc.start(), i.e. before any batch has been processed.
print(model.latestModel().clusterCenters)
clust = model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
# Forward the collected predictions of every batch to the plotting process.
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()

Example #19

0

Show file

File: example.py Project: AAB94/RP


# Parse the training file: for every comma-separated line drop the first two
# fields and the last one, and turn the remaining fields into a dense vector.
trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

# Seed the streaming model with the centers of a batch k-means run.
centers = KMeans.train(trainingData, 2).centers


trainingQueue = [trainingData]


trainingStream = ssc.queueStream(trainingQueue)


# Create the streaming model and start it from the batch k-means centers.
model = StreamingKMeans(k=2, decayFactor=0.3)
# setInitialCenters needs exactly one weight per center.
model.setInitialCenters(centers, [1.0] * len(centers))
# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)

def parse(lp):
    """Convert one comma-separated line into a ``LabeledPoint``.

    The label is the first field with its first and last character removed;
    the features are all fields except the first two and the last one. The
    current model centers are printed as a side effect.
    """
    fields = lp.split(',')
    features = fields[2:-1]
    label = fields[0][1:-1]
    vec = Vectors.dense([float(value) for value in features])
    print(model.latestModel().centers)
    return LabeledPoint(label, vec)

Example #20

0

Show file

    for center in file:
        initialCenters.append(center.split())

logging.info(initialCenters)

# Every initial center starts with the same weight.
initialWeights = [1.0] * len(initialCenters)

# Settings are read back from a broadcast variable:
# [number of clusters, Mongo IP, Mongo database, Mongo collection].
config = sc.broadcast(parameters)
numberClusters = config.value[0]
mongoIP = config.value[1]
mongoDataBase = config.value[2]
mongoCollection = config.value[3]

stkm = StreamingKMeans(k=numberClusters, decayFactor=1).setInitialCenters(
    initialCenters, initialWeights)

#stkm = StreamingKMeans(k=int(numberClusters),decayFactor=1.0).setRandomCenters(2,1.0,100)

# NOTE(review): the Kafka consumer setting is normally spelled
# "auto.offset.reset"; confirm that the "auto_offset_reset" key has any effect.
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto_offset_reset": 'earliest'
    })

# v[1] is the message value; it is decoded with loads() and then split on
# whitespace into a dense vector of floats.
parsed = directKafkaStream.map(lambda v: loads(v[1]))

parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))

stkm.trainOn(parsed)

Example #21

0

Show file

File: streaming_k_means.py Project: ohliumliu/flash_deals_c9

    def parse(lp):
        """Build a ``LabeledPoint`` from a text line.

        The label is the text between the first '(' and the first ')'; the
        features are the comma-separated text between the first '[' and ']'.
        """
        label_start = lp.find('(') + 1
        label_end = lp.find(')')
        label = float(lp[label_start:label_end])

        features_start = lp.find('[') + 1
        features_end = lp.find(']')
        features = Vectors.dense(lp[features_start:features_end].split(','))
        return LabeledPoint(label, features)

    # Training data: each line holds space-separated floats and becomes one
    # dense vector.
    trainingData = sc.textFile("spark-2.0.1-bin-hadoop2.7/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))


    # queueStream takes a list of RDDs; the whole training set is one batch.
    trainingQueue = [trainingData]

    trainingStream = ssc.queueStream(trainingQueue)
    # Test data is read from text files appearing in the 'history' directory.
    testingStream = ssc.textFileStream('history').map(parse)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)

    # predictOnValues works on (key, vector) pairs; the label is used as key.
    result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    # Blocks here; the final print below only runs after the context ends.
    ssc.awaitTermination()
    #ssc.stop(stopSparkContext=True, stopGraceFully=True)
    # $example off$

    print("Final centers: " + str(model.latestModel().centers))

Example #22

0

Show file

File: Saistudy - split-csv.py Project: meetreks/SaiStudyAll

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    # Train a 2-cluster streaming k-means model on files appearing under
    # /files and print the cluster centres every 10 seconds.
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)

    ssc.checkpoint("chkpfile")

    def parserData(line):
        # Use the first two comma-separated fields as a dense 2-D vector.
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("/files").map(parserData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    print("Initial Centres" + str(model.latestModel().centers))
    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(sc, model):
        # Print the current centres, then re-schedule this function to run
        # again in 10 seconds. The `sc` parameter is not used in the body.
        print(str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centres, (s, model))

    s.enter(10, 1, print_cluster_centres, (s, model))
    s.run()

    # NOTE(review): print_cluster_centres keeps re-scheduling itself, so
    # s.run() presumably never returns and this line is not reached -- confirm.
    ssc.awaitTermination()
# to make this work

Example #23

0

Show file

File: analyzer_twiterSpark.py Project: gachet/TwitterSparkStreamClustering

         .filter(lambda post: 'created_at' in post)\
         .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()



    # Training vectors: the two coordinate values followed by the doc2vec
    # components of the tokenized text.
    trainingData=dstream_tweets.map(lambda tpl: [tpl[0],tpl[1]]+tpl[3].tolist())
    #trainingData.pprint()
    # Test pairs: key = ((coordinates), tokens), value = the same vector layout
    # as the training data.
    testdata=dstream_tweets.map(lambda tpl: (([tpl[0],tpl[1]],tpl[2]),[tpl[0],tpl[1]]+tpl[3].tolist()))
    #testdata.pprint()
    #
    # NOTE(review): 102 is presumably 2 coordinates + a 100-dimensional
    # doc2vec vector -- confirm against doc2vec.
    model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust=model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    # Re-key by predicted cluster, keeping only the tokens of each tweet.
    topic=clust.map(lambda x: (x[1],x[0][1]))
    #topic.pprint()
    # Concatenate the tokens of all tweets that fall into the same cluster.
    topicAgg = topic.reduceByKey(lambda x,y: x+y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0],freqcount(x[1]))).pprint()

    # Hand the collected predictions of every batch to the queue q.
    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()

Example #24

0

Show file

)
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(lp):
    """Build a ``LabeledPoint`` from a text line.

    The label is the text between the first '(' and the first ','; the
    features are the comma-separated text between the first '[' and ']'.
    """
    label_text = lp[lp.find('(') + 1:lp.find(',')]
    label = float(label_text)
    features_text = lp[lp.find('[') + 1:lp.find(']')]
    features = Vectors.dense(features_text.split(','))
    return LabeledPoint(label, features)


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a StreamingContext with a batch interval of 10 seconds
# (the SparkContext above runs locally with 4 threads).

ssc = StreamingContext(sc, 10)
# Training vectors are parsed from text files appearing in ./training/.
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
# Test records are parsed into LabeledPoint objects by parse().
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()
model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
# Print (label, predicted cluster) pairs for each batch of test data.
model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features))).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()

Example #25

0

Show file

File: clustering2.py Project: saeedaghabozorgi/streamclusteringspark

            #         clus=row[0]
            #         #ptext[clus].set_text(str(clus)+ ':'+str([x[0] for x in row[1][1]]))
            #         ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
            #         ptext[clus].set_color(colors[clus])
            #     plt.pause(0.0001)
            #


# Queue used to hand each batch of predictions to the plotting process.
q = multiprocessing.Queue()
f = multiprocessing.Queue()
# Run data_plotting in a separate process, reading from q.
job_for_another_core2 = multiprocessing.Process(target=data_plotting,args=(q,))
job_for_another_core2.start()

sc  = SparkContext('local[4]', 'Social Panic Analysis')
# Create a StreamingContext with a batch interval of 10 seconds
# (the SparkContext above runs locally with 4 threads).

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
# predictOnValues expects (key, vector) pairs; here the vector is its own key.
testData=trainingData.map(lambda x: (x,x))
testData.pprint()
model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
# Printed before ssc.start(), i.e. before any batch has been processed.
print(model.latestModel().clusterCenters)
clust=model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
# Forward the collected predictions of every batch to the plotting process.
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()

Example #26

0

Show file

File: StreamingKMeans.py Project: radical-experiments/midas_exps

    os.makedirs("results")
except:
    pass

# Results file, opened for writing (existing content is truncated).
# NOTE(review): no close() is visible in this excerpt -- confirm it is closed
# elsewhere.
output_file = open(RESULT_FILE, "w")

# Wall-clock reference used by the timing measurements.
start = time.time()

#output_file.write("Measurement,Number_Partitions, Time\n")
#output_file.write("Spark Startup, %s, %.5f\n"%(NUMBER_PARTITIONS, time.time()-start))
#output_file.flush()
#######################################################################################

# Streaming k-means with 10 clusters and random initial centers in 3 dimensions.
decayFactor = 1.0
timeUnit = "batches"
model = StreamingKMeans(k=10, decayFactor=decayFactor,
                        timeUnit=timeUnit).setRandomCenters(3, 1.0, 0)

#def printOffsetRanges(rdd):
#    for o in offsetRanges:
#        print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset)


def count_records(rdd):
    """Return the collected contents of ``rdd``, or ``[0]`` when it is None.

    The type of ``rdd`` is printed as a debugging aid.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(str(type(rdd)))
    if rdd is not None:
        return rdd.collect()

    return [0]


## OK

Example #27

0

Show file

File: streamingmllib-decayfactorone.py Project: faameem/apache

    # Train a 2-cluster streaming k-means model on files appearing in the
    # training directory and print the cluster centers every 10 seconds.
    sc = SparkContext(appName="StreamingKMeansClustering")
    ssc = StreamingContext(sc, 10)

    ssc.checkpoint("file:///tmp/spark")

    def parseTrainingData(line):
      # Use the first two comma-separated fields as a dense 2-D vector.
      cells = line.split(",")
      return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("file:///Users/jananiravi/spark/spark-2.1.0-bin-without-hadoop/tweets/training")\
      .map(parseTrainingData)

    trainingStream.pprint();

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    print("Initial centers: " + str(model.latestModel().centers))

    model.trainOn(trainingStream)

    ssc.start()

    s = sched.scheduler(time.time, time.sleep)
    def print_cluster_centers(sc, model): 
        # Print the current centers, then re-schedule this function to run
        # again in 10 seconds. `sc` receives the scheduler on the first call.
        print("Cluster centers: " + str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    s.run()

Example #28

0

Show file

File: StreamingKmeansClassification.py Project: swipswaps/TwitterSentimentAnalysis-1

        print('Applying PCA on training data...')
        PCA_model = PCA(low_dim).fit(tfidf_training)
        tfidf_training = PCA_model.transform(tfidf_training)
        k = low_dim

    # pcArray = model.transform(tfidf_training.first()).toArray()

    #setting checkpoint
    # ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

    # Create a DStream from the training RDD (a queue with a single batch).
    trainingQueue = [tfidf_training]
    trainingStream = ssc.queueStream(trainingQueue)

    # Create a k-means model with 2 clusters and random initial centers; `k`
    # is passed to setRandomCenters as the feature dimension.
    model = StreamingKMeans(k=2, decayFactor=1.0,
                            timeUnit='batches').setRandomCenters(k, 1.0, 0)

    # print("K centers: " + str(model.latestModel().centers))

    # Register the model for training on the training tweets' data.
    # NOTE(review): no ssc.start() is visible before this point, so 'done!'
    # presumably only means the training stream was registered -- confirm.
    print('Training K-means Model...')
    model.trainOn(trainingStream)
    print('done!')

    # Create a direct Kafka stream from the configured brokers and topic.
    streamData = KafkaUtils.createDirectStream(
        ssc, [kafka_topic], {"metadata.broker.list": kafka_brokers})

    ######### FROM NOW ON, EACH ACTION OR TRANSFORMATION IS DONE ON A SINGLE INCOMING BATCH OF TWEETS #########

    # PRE-PROCESSING TWEETS DATA (TESTING)

Example #29

0

Show file

sc = SparkContext(appName="StreamingKMeans")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("/tmp/checkpoints/")

# NOTE(review): initialCenters and initialWeights are defined here but the
# model below is seeded with two hard-coded centers instead -- confirm intent.
initialCenters = [[604328, 574379], [801908, 318382], [416383, 786204],
                  [822771, 732034], [850993, 157873], [338586, 563537],
                  [169274, 348574], [619259, 397671], [241071, 844424],
                  [321801, 165319], [139493, 557352], [508785, 174800],
                  [398934, 404142], [860858, 546059], [674365, 860464]]

initialWeights = [
    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
]

# Command-line arguments are strings; the number of clusters must be an int.
stkm = StreamingKMeans(k=int(sys.argv[1]), decayFactor=1.0).setInitialCenters(
    [[500, 500], [600, 600]], [1.0, 1.0])

# NOTE(review): the Kafka consumer setting is normally spelled
# "auto.offset.reset"; confirm that the "auto_offset_reset" key has any effect.
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto_offset_reset": 'earliest'
    })

# v[1] is the message value; it is decoded with loads() and then split on
# whitespace into a dense vector of floats.
parsed = directKafkaStream.map(lambda v: loads(v[1]))

parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))

stkm.trainOn(parsed)

Example #30

0

Show file

File: clustering.py Project: saeedaghabozorgi/streamclusteringspark

sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/')
# Append the python/build to PYTHONPATH so that py4j could be found
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip')
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans
def parse(lp):
    """Build a LabeledPoint from one text record.

    The label is the text between the first '(' and the first ','; the
    features are the comma-separated values between the first '[' and the
    first ']'.
    """
    label_start = lp.find('(') + 1
    label_end = lp.find(',')
    features_start = lp.find('[') + 1
    features_end = lp.find(']')

    label = float(lp[label_start:label_end])
    raw_features = lp[features_start:features_end].split(',')
    return LabeledPoint(label, Vectors.dense(raw_features))


# Spark context on the 'local[4]' master.
sc = SparkContext('local[4]', 'Social Panic Analysis')

# Streaming context on top of it, with a batch interval of 10 seconds.
ssc = StreamingContext(sc, 10)

# Training stream: every line of a new file under ./training/ is parsed
# into a vector; each batch is echoed for inspection.
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()

# Test stream: every line of a new file under ./testing/ is parsed into a
# LabeledPoint; each batch is echoed for inspection.
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()

# Streaming k-means with k=5, seeded with random centers.
model = StreamingKMeans(k=5, decayFactor=1.0)
model.setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)

# Predict on (label, features) pairs and print the result per batch.
labelledFeatures = testData.map(lambda lp: (lp.label, lp.features))
predictions = model.predictOnValues(labelledFeatures)
predictions.pprint()

ssc.start()
ssc.awaitTermination()

Example #31

0

Show file

        return LabeledPoint(label, vec)

    trainingData = sc.textFile("/Users/tung/Documents/spark-2.4.3/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

    testingData = sc.textFile(
        "/Users/tung/Documents/spark-2.4.3/data/mllib/streaming_kmeans_data_test.txt"
    ).map(parse)

    trainingQueue = [trainingData]
    testingQueue = [testingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.queueStream(testingQueue)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)

    result = model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

    print("Final centers: " + str(model.latestModel().centers))

Example #32

0

Show file

from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two worker threads and a batch
# interval of 1 second.
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# Continuous training: vectors parsed from new files in the training directory.
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)

# Labelled points to predict on, parsed from new files in the testing
# directory.
# NOTE(review): confirm that the Python LabeledPoint class provides parse();
# a custom line parser may be required here.
testData = ssc.textFileStream("/testing/data/dir").map(
    lambda s: LabeledPoint.parse(s))

model = StreamingKMeans()\
        .setK(3)\
        .setDecayFactor(1.0)\
        .setRandomCenters(dim=3, weight=0.0, seed=42)

model.trainOn(trainingData)

# predictOnValues works on (key, vector) pairs, so unpack each labelled
# point into (label, features) first.
prediction = model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features)))

# Print the predictions of every batch; printing the DStream object itself
# shows only its repr, not its contents.
prediction.pprint()

# Nothing is computed until the streaming context is started.
ssc.start()
ssc.awaitTermination()

Example #33

0

Show file

    return LabeledPoint(label_, vec)


#testingData = sc.textFile("data/mllib/streaming_kmeans_data_test.txt").map(parse)
#testingData = sc.textFile("streaming_kmeans_data_test.txt").map(parse)
# Build the test RDD by mapping parse_vec over the in-memory train_vec data.
# NOTE(review): the test set is derived from train_vec -- confirm that
# reusing the training vectors for testing is intended.
testingData = sc.parallelize(train_vec).map(parse_vec)

# Each queue holds a single RDD, which is fed to queueStream below.
trainingQueue = [trainingData]
testingQueue = [testingData]

trainingStream = ssc.queueStream(trainingQueue)
testingStream = ssc.queueStream(testingQueue)

# We create a model with random clusters and specify the number of clusters to find
# (k=3 here; the random centers are created with dimension 100, weight 1.0, seed 0).
#model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model = StreamingKMeans(k=3, decayFactor=1.0).setRandomCenters(100, 1.0, 0)

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.

# Earlier attempts, kept for reference:
#model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
#model.trainOn(trainingStream)
#print("TEST HERE")
#result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
model.trainOn(trainingStream)
# Debug marker printed once, when the streams are being wired up.
print("TEST HERE")
# Predict on (label, features) pairs taken from the test stream.
result = model.predictOnValues(
    testingStream.map(lambda lp: (lp.label, lp.features)))
#result.pprint(num = 20)

#if result.count() != 0:

Example #34

0

Show file

    sc = SparkContext(master="local[4]", appName="Streaming-KMeans", conf=conf)

    ssc = StreamingContext(sc, 5)

    # Kafka Stream
    ks = KafkaUtils.createDirectStream(
        ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

    trainingData = sc.textFile("data/datatraining.txt")\
        .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

    # Supplied to Streaming KMeans as the centers by StreamingKmeans are not giving good predictions
    init_centers = KMeans.train(trainingData, 2).centers

    model = StreamingKMeans(k=2, decayFactor=0.1)\
        .setInitialCenters(init_centers, [1.0, 1.0, 1.0, 1.0, 1.0])

    model.trainOn(ssc.queueStream([trainingData]))

    def parse(lp):
        """Build a LabeledPoint from one comma-separated record.

        The first field is used as the label; the fields from index 2 up to
        (but excluding) the last one are converted to floats and used as
        the feature vector.
        """
        fields = lp.split(',')
        label = fields[0]
        features = [float(value) for value in fields[2:-1]]
        return LabeledPoint(label, Vectors.dense(features))

    test_stream = ks.map(lambda x: x[1]).map(parse)

    result = model.predictOnValues(
        test_stream.map(lambda lp: (lp.label, lp.features)))

    # Prints Prediction Prediction and Cluster Centers

Example #35

0

Show file

File: analyzer_twiterSpark.py Project: pluketic/TwitterSparkStreamClustering

         .filter(lambda post: 'created_at' in post)\
         .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()

    trainingData = dstream_tweets.map(
        lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist())
    #trainingData.pprint()
    testdata = dstream_tweets.map(lambda tpl: (
        ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist()))
    #testdata.pprint()
    #
    model = StreamingKMeans(k=clusterNum,
                            decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust = model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    topic = clust.map(lambda x: (x[1], x[0][1]))
    #topic.pprint()
    topicAgg = topic.reduceByKey(lambda x, y: x + y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint()

    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()

Example #36

0

Show file

 def streaming(self, mnk, clusters, init_clusters):
     self.mnk=mnk
     self.clusters=clusters
     self.init_clusters=init_clusters
     self.streaming_kmeans=StreamingKMeans(self.init_clusters, self.decay_factor, self.time_unit)
     self.streaming_kmeans.setInitialCenters(self.mnk.cluster_centers_, np.ones([self.init_clusters]))