Code Example #1
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        self._eventually(condition, catch_assertions=True)
Code Example #2
    def test_trainOn_model(self):
        """Test the model on toy data with four clusters."""
        stkm = StreamingKMeans()
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm.setInitialCenters(
            centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

        # Create a toy dataset by setting a tiny offset for each point.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        batches = []
        for offset in offsets:
            batches.append([[offset[0] + center[0], offset[1] + center[1]]
                            for center in initCenters])

        batches = [self.sc.parallelize(batch, 1) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        stkm.trainOn(input_stream)
        self.ssc.start()

        # Give enough time to train the model.
        def condition():
            finalModel = stkm.latestModel()
            self.assertTrue(all(finalModel.centers == array(initCenters)))
            self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
            return True
        self._eventually(condition, catch_assertions=True)
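
The expected final weights of [5.0, 5.0, 5.0, 5.0] follow from the streaming k-means weight update: with the default decay factor of 1.0, each of the four batches adds exactly one point per cluster on top of the initial weight of 1.0. A minimal, framework-free sketch of that bookkeeping (assuming the discounted update new_weight = old_weight * decay + points_added; this is an illustration, not Spark code):

# Hypothetical bookkeeping for a single cluster in the test above:
# decayFactor = 1.0 (full memory), one point assigned per cluster per batch.
weight = 1.0          # initial cluster weight
decay = 1.0           # default decayFactor
for _ in range(4):    # four batches, one per offset
    weight = weight * decay + 1   # one point lands in this cluster per batch
print(weight)         # 5.0, matching the clusterWeights assertion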
Code Example #3
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [
            self.sc.parallelize(batch, 1) for batch in predict_data
        ]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        eventually(condition, catch_assertions=True)
Code Example #4
File: twitter-streaming.py Project: tsiangsun/tdi
def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    features = tweets.map(lambda x:
                          (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i).map(lambda x: '%i: %s' %
                                                (x[1], x[0])).pprint(3)

    for i in xrange(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
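
The featurize helper used above is not part of this excerpt. A plausible stand-in, assuming it only needs to turn the raw tweet text into the list of terms that HashingTF.transform expects (a guess at the original project's intent, not its actual code):

# Hypothetical tokenizer for the pipeline above: HashingTF.transform accepts a
# list of terms, so the simplest featurize is lowercase + whitespace split.
def featurize(text):
    return text.lower().split()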
Code Example #5
File: tests.py Project: rajsingh7/spark
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEquals(result, [[0], [1], [2], [3]])
Code Example #6
    def perform_training(sc: SparkContext, params_dict: dict):
        batch_duration = 1 if 'batch_duration' not in params_dict else params_dict[
            'batch_duration']
        training_duration = 20 if 'training_duration' not in params_dict else params_dict[
            'training_duration']
        ssc = StreamingContext(sc, batch_duration)
        topics = ['normal-ekg-stream']
        kafka_params = {'metadata.broker.list': 'localhost:9092'}
        kvs = KafkaUtils.createDirectStream(
            ssc,
            topics,
            kafkaParams=kafka_params,
            valueDecoder=lambda val: json.loads(val.decode('utf-8')))

        windowed_signal = kvs.map(lambda msg: Vectors.dense(
            [float(value) for value in msg[1]['signal_values']]))

        # windowed_signal.foreachRDD(Plotter.plot_signal_window)
        model = StreamingKMeans(k=20,
                                decayFactor=1.0).setRandomCenters(188, 1.0, 0)
        model.trainOn(windowed_signal)

        ssc.start()
        ssc.awaitTerminationOrTimeout(training_duration)
        ssc.stop(stopSparkContext=False, stopGraceFully=True)

        return model.latestModel()
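
A possible way to call this training helper (the class name EkgTrainer and the parameter values below are illustrative assumptions; only perform_training itself appears in the excerpt):

# Hypothetical driver: build a SparkContext, train for ~30 seconds of stream time,
# then inspect the resulting StreamingKMeansModel.
from pyspark import SparkContext

sc = SparkContext(appName="ekg-streaming-kmeans")
model = EkgTrainer.perform_training(sc, {'batch_duration': 1, 'training_duration': 30})
print(model.clusterWeights)   # 20 weights, one per learned cluster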
Code Example #7
    def test_accuracy_for_single_center(self):
        """Test that parameters obtained are correct for a single center."""
        centers, batches = self.streamingKMeansDataGenerator(batches=5,
                                                             numPoints=5,
                                                             k=1,
                                                             d=5,
                                                             r=0.1,
                                                             seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        input_stream = self.ssc.queueStream(
            [self.sc.parallelize(batch, 1) for batch in batches])
        stkm.trainOn(input_stream)

        self.ssc.start()

        def condition():
            self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
            return True

        eventually(condition, catch_assertions=True)

        realCenters = array_sum(array(centers), axis=0)
        for i in range(5):
            modelCenters = stkm.latestModel().centers[0][i]
            self.assertAlmostEqual(centers[0][i], modelCenters, 1)
            self.assertAlmostEqual(realCenters[i], modelCenters, 1)
Code Example #8
File: tests.py Project: rajsingh7/spark
    def test_trainOn_predictOn(self):
        """Test that prediction happens on the updated model."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # Since the decay factor is set to zero, once the first batch
        # is passed the clusterCenters are updated to [-0.5, 0.7],
        # which causes 0.2 & 0.3 to be classified as 1, even though the
        # classification based on the initial model would have been 0,
        # proving that the model is updated.
        batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        batches = [sc.parallelize(batch) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        predict_results = []

        def collect(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                predict_results.append(rdd_collect)

        stkm.trainOn(input_stream)
        predict_stream = stkm.predictOn(input_stream)
        predict_stream.foreachRDD(collect)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
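
The comment in this test asserts that the first batch moves the centers to [-0.5, 0.7]. That arithmetic can be reproduced without a SparkContext, assuming the update rule given in the Spark streaming k-means documentation, c_new = (c * n * a + x_mean * m) / (n * a + m) with weights discounted by the decay factor a; with a = 0 each center simply jumps to the mean of the points assigned to it in the batch. A standalone sketch (an illustration of the rule, not the test's code):

# Minimal 1-D streaming k-means update, assuming the documented formula.
def update_centers(centers, weights, batch, decay):
    assigned = [[] for _ in centers]
    for x in batch:                      # nearest-center assignment
        i = min(range(len(centers)), key=lambda j: abs(x - centers[j]))
        assigned[i].append(x)
    new_centers, new_weights = [], []
    for c, n, pts in zip(centers, weights, assigned):
        m = len(pts)
        if m == 0:                       # untouched cluster: only the weight decays
            new_centers.append(c)
            new_weights.append(n * decay)
            continue
        mean = sum(pts) / m
        new_centers.append((c * n * decay + mean * m) / (n * decay + m))
        new_weights.append(n * decay + m)
    return new_centers, new_weights

centers, weights = update_centers([0.0, 1.0], [1.0, 1.0], [-0.5, 0.6, 0.8], decay=0.0)
print(centers)   # [-0.5, 0.7], the values the comment above relies on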
Code Example #9
File: tests.py Project: rajsingh7/spark
    def test_trainOn_model(self):
        """Test the model on toy data with four clusters."""
        stkm = StreamingKMeans()
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm.setInitialCenters(centers=initCenters,
                               weights=[1.0, 1.0, 1.0, 1.0])

        # Create a toy dataset by setting a tiny offset for each point.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        batches = []
        for offset in offsets:
            batches.append([[offset[0] + center[0], offset[1] + center[1]]
                            for center in initCenters])

        batches = [self.sc.parallelize(batch, 1) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        stkm.trainOn(input_stream)
        t = time()
        self.ssc.start()

        # Give enough time to train the model.
        self._ssc_wait(t, 6.0, 0.01)
        finalModel = stkm.latestModel()
        self.assertTrue(all(finalModel.centers == array(initCenters)))
        self.assertEquals(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
Code Example #10
    def test_model_params(self):
        """Test that the model params are set correctly"""
        stkm = StreamingKMeans()
        stkm.setK(5).setDecayFactor(0.0)
        self.assertEqual(stkm._k, 5)
        self.assertEqual(stkm._decayFactor, 0.0)

        # Model not set yet.
        self.assertIsNone(stkm.latestModel())
        self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

        stkm.setInitialCenters(
            centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
        self.assertEqual(
            stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
        self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])
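
Outside the test suite, the same ordering applies: trainOn only works once the model has centers, set either with setInitialCenters or setRandomCenters. A minimal hedged sketch of standalone usage (the local contexts, batch data, and timeout below are illustrative assumptions):

# Hypothetical standalone usage mirroring the ordering the test enforces.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.clustering import StreamingKMeans

sc = SparkContext("local[2]", "streaming-kmeans-params")
ssc = StreamingContext(sc, 1)

stkm = StreamingKMeans().setK(2).setDecayFactor(0.0)
stkm.setInitialCenters(centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])

# Calling trainOn before setInitialCenters would raise the ValueError the test checks for.
batches = [sc.parallelize([[0.1, 0.1], [0.9, 0.9]])]
stkm.trainOn(ssc.queueStream(batches))

ssc.start()
ssc.awaitTerminationOrTimeout(5)
ssc.stop(stopSparkContext=True, stopGraceFully=True)
print(stkm.latestModel().clusterWeights)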
Code Example #11
File: tests.py Project: HodaAlemi/spark
    def test_trainOn_predictOn(self):
        """Test that prediction happens on the updated model."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # Since the decay factor is set to zero, once the first batch
        # is passed the clusterCenters are updated to [-0.5, 0.7],
        # which causes 0.2 & 0.3 to be classified as 1, even though the
        # classification based on the initial model would have been 0,
        # proving that the model is updated.
        batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        batches = [sc.parallelize(batch) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        predict_results = []

        def collect(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                predict_results.append(rdd_collect)

        stkm.trainOn(input_stream)
        predict_stream = stkm.predictOn(input_stream)
        predict_stream.foreachRDD(collect)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
Code Example #12
File: tests.py Project: HodaAlemi/spark
    def test_model_params(self):
        """Test that the model params are set correctly"""
        stkm = StreamingKMeans()
        stkm.setK(5).setDecayFactor(0.0)
        self.assertEquals(stkm._k, 5)
        self.assertEquals(stkm._decayFactor, 0.0)

        # Model not set yet.
        self.assertIsNone(stkm.latestModel())
        self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

        stkm.setInitialCenters(
            centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
        self.assertEquals(
            stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
        self.assertEquals(stkm.latestModel().clusterWeights, [1.0, 1.0])
Code Example #13
File: tests.py Project: HodaAlemi/spark
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEquals(result, [[0], [1], [2], [3]])
Code Example #14
class StreamingUpdate(object):
    """
    Streaming Update: DStream
    """
    def __init__(self, init_clusters, decay_factor, time_unit, sc, ssc):
        self.init_clusters=init_clusters
        self.decay_factor=decay_factor
        self.time_unit=time_unit
        self.sc=sc
        self.ssc=ssc

    # implement
    def streaming(self, mnk, clusters, init_clusters):
        self.mnk=mnk
        self.clusters=clusters
        self.init_clusters=init_clusters
        self.streaming_kmeans=StreamingKMeans(self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(self.mnk.cluster_centers_, np.ones([self.init_clusters]))

    # update shape for centers in StreamingContext
    """
    Từ điển được cập nhật khi có tin tức mới đến thì em cập nhật lại kích thước của các centroid
    VD: Từ điển ban đầu có kích thước 10 từ
    Em biểu diễn một câu có 5 từ bằng sparse vector kích thước 5x10
    Từ điển sau khi cập nhật có 15 từ thì câu trên phải biểu diễn lại bằng sparse vector có kích thước 5x15
    Có cách biểu diễn khác mà không phải cập nhật lại biểu diễn của câu không ạ
    """
    def update_shape(self, docs, dictionary):
        self.streaming_kmeans.setRandomCenters(matrix_tfidf(docs, dictionary).shape[1], 1.0, 0)

    # save matrix update
    def save_matrix_update(self, docs, dictionary):
        np.savetxt('/home/ducvu/input_streaming.txt', matrix_tfidf(docs, dictionary))

    # load dstream
    def load_dstream(self):
        self.dstream = self.sc.textFile("/home/ducvu/input_streaming.txt")\
            .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
        return self.dstream
        
    # make predictions
    def make_predict(self, docs, dictionary):
        self.streaming_kmeans.trainOn(self.load_dstream())
        self.pred_stream=[]
        matrix=matrix_tfidf(docs, dictionary)
        for x in matrix:
            self.pred_stream.append(self.streaming_kmeans.latestModel().predict(x))
        self.pred_stream=np.array(self.pred_stream)
        
        df = pd.DataFrame(matrix).groupby(self.pred_stream).mean()
        for i, r in df.iterrows():
            print('\nCluster {0}:'.format(i))
            print(','.join([get_tfidf(dictionary).get_feature_names()[t] for t in np.argsort(r)[-15:]]))
Code Example #15
File: tests.py Project: HodaAlemi/spark
    def test_accuracy_for_single_center(self):
        """Test that parameters obtained are correct for a single center."""
        centers, batches = self.streamingKMeansDataGenerator(
            batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        input_stream = self.ssc.queueStream(
            [self.sc.parallelize(batch, 1) for batch in batches])
        stkm.trainOn(input_stream)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 10.0, 0.01)
        self.assertEquals(stkm.latestModel().clusterWeights, [25.0])
        realCenters = array_sum(array(centers), axis=0)
        for i in range(5):
            modelCenters = stkm.latestModel().centers[0][i]
            self.assertAlmostEqual(centers[0][i], modelCenters, 1)
            self.assertAlmostEqual(realCenters[i], modelCenters, 1)
Code Example #16
    def detect(self, k, t):
        # Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        # Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = StreamingKMeans(k=7,
                                decayFactor=1.0).setRandomCenters(4, 1.0, 0)
        # model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        # Adding the prediction column to df1
        modelBC = sc.broadcast(model)
        predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        # Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t)
Code Example #17
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 2)

    ssc.checkpoint("/tmp/spark")

    def parseTrainingData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("./training")\
                        .map(parseTrainingData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    print "Intial centers: {0}".format(model.latestModel().centers)

    model.trainOn(trainingStream)

    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centers(sc, model):
        print "Cluster centers: {0}".format(model.latestModel().centers)
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    s.run()
Code Example #18
            #         ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
            #         ptext[clus].set_color(colors[clus])
            #     plt.pause(0.0001)
            #


q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting,
                                                args=(q, ))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with two working thread and batch interval of 1 second

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData = trainingData.map(lambda x: (x, x))
testData.pprint()
model = StreamingKMeans(k=clusterNum,
                        decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)
clust = model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()
Code Example #19
File: example.py Project: AAB94/RP

trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

centers = KMeans.train(trainingData, 2).centers


trainingQueue = [trainingData]


trainingStream = ssc.queueStream(trainingQueue)


# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=0.3)#.setRandomCenters(5, 1.0, 0)
model.setInitialCenters( centers, [1.0,1.0,1.0,1.0,1.0])
# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)

def parse(lp):
    #label = float(lp[lp.find('(') + 1: lp.find(')')])
    #vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    arr = lp.split(',')[2:-1]
    label = lp.split(',')[0]
    label = label[1:-1]
    vec = Vectors.dense([float(x) for x in arr])
    print(model.latestModel().centers)
    return LabeledPoint(label, vec)
Code Example #20
    for center in file:
        initialCenters.append([float(x) for x in center.split()])

logging.info(initialCenters)

initialWeights = []
for i in initialCenters:
    initialWeights.append(1.0)

config = sc.broadcast(parameters)
numberClusters = config.value[0]
mongoIP = config.value[1]
mongoDataBase = config.value[2]
mongoCollection = config.value[3]

stkm = StreamingKMeans(k=numberClusters, decayFactor=1).setInitialCenters(
    initialCenters, initialWeights)

#stkm = StreamingKMeans(k=int(numberClusters),decayFactor=1.0).setRandomCenters(2,1.0,100)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto_offset_reset": 'earliest'
    })

parsed = directKafkaStream.map(lambda v: loads(v[1]))

parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))

stkm.trainOn(parsed)
Code Example #21
    def parse(lp):
        label = float(lp[lp.find('(') + 1: lp.find(')')])
        vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
        return LabeledPoint(label, vec)

    trainingData = sc.textFile("spark-2.0.1-bin-hadoop2.7/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))


    trainingQueue = [trainingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.textFileStream('history').map(parse)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)

    result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    ssc.awaitTermination()
    #ssc.stop(stopSparkContext=True, stopGraceFully=True)
    # $example off$

    print("Final centers: " + str(model.latestModel().centers))
Code Example #22
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)

    ssc.checkpoint("chkpfile")

    def parserData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("/files").map(parserData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    print("Initial Centres" + str(model.latestModel().centers))
    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(sc, model):
        print(str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centres, (s, model))

    s.enter(10, 1, print_cluster_centres, (s, model))
    s.run()

    ssc.awaitTermination()
# to make this work
Code Example #23
         .filter(lambda post: 'created_at' in post)\
         .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()



    trainingData=dstream_tweets.map(lambda tpl: [tpl[0],tpl[1]]+tpl[3].tolist())
    #trainingData.pprint()
    testdata=dstream_tweets.map(lambda tpl: (([tpl[0],tpl[1]],tpl[2]),[tpl[0],tpl[1]]+tpl[3].tolist()))
    #testdata.pprint()
    #
    model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust=model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    topic=clust.map(lambda x: (x[1],x[0][1]))
    #topic.pprint()
    topicAgg = topic.reduceByKey(lambda x,y: x+y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0],freqcount(x[1]))).pprint()

    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()
Code Example #24
)
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(lp):
    label = float(lp[lp.find('(') + 1:lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with two working thread and batch interval of 1 second

ssc = StreamingContext(sc, 10)
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()
model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features))).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
Code Example #25
            #         clus=row[0]
            #         #ptext[clus].set_text(str(clus)+ ':'+str([x[0] for x in row[1][1]]))
            #         ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
            #         ptext[clus].set_color(colors[clus])
            #     plt.pause(0.0001)
            #


q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting,args=(q,))
job_for_another_core2.start()

sc  = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with two working thread and batch interval of 1 second

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData=trainingData.map(lambda x: (x,x))
testData.pprint()
model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)
clust=model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()
Code Example #26
    os.makedirs("results")
except:
    pass

output_file = open(RESULT_FILE, "w")

start = time.time()

#output_file.write("Measurement,Number_Partitions, Time\n")
#output_file.write("Spark Startup, %s, %.5f\n"%(NUMBER_PARTITIONS, time.time()-start))
#output_file.flush()
#######################################################################################

decayFactor = 1.0
timeUnit = "batches"
model = StreamingKMeans(k=10, decayFactor=decayFactor,
                        timeUnit=timeUnit).setRandomCenters(3, 1.0, 0)

#def printOffsetRanges(rdd):
#    for o in offsetRanges:
#        print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset)


def count_records(rdd):
    print str(type(rdd))
    if rdd != None:
        return rdd.collect()

    return [0]


## OK
Code Example #27
    sc = SparkContext(appName="StreamingKMeansClustering")
    ssc = StreamingContext(sc, 10)

    ssc.checkpoint("file:///tmp/spark")

    def parseTrainingData(line):
      cells = line.split(",")
      return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("file:///Users/jananiravi/spark/spark-2.1.0-bin-without-hadoop/tweets/training")\
      .map(parseTrainingData)

    trainingStream.pprint();

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    print("Initial centers: " + str(model.latestModel().centers))

    model.trainOn(trainingStream)

    ssc.start()

    s = sched.scheduler(time.time, time.sleep)
    def print_cluster_centers(sc, model): 
        print("Cluster centers: " + str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    s.run()
Code Example #28
        print('Applying PCA on training data...')
        PCA_model = PCA(low_dim).fit(tfidf_training)
        tfidf_training = PCA_model.transform(tfidf_training)
        k = low_dim

    # pcArray = model.transform(tfidf_training.first()).toArray()

    #setting checkpoint
    # ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

    # CREATING DStream FROM TRAINING'S RDD
    trainingQueue = [tfidf_training]
    trainingStream = ssc.queueStream(trainingQueue)

    # CREATING A K-MEANS MODEL WITH RANDOM CLUSTERS SPECIFYING THE NUMBER OF CLUSTERS TO FIND
    model = StreamingKMeans(k=2, decayFactor=1.0,
                            timeUnit='batches').setRandomCenters(k, 1.0, 0)

    # print("K centers: " + str(model.latestModel().centers))

    # TRAINING THE MODEL ON THE TRAINING TWEET'S DATA
    print('Training K-means Model...')
    model.trainOn(trainingStream)
    print('done!')

    # CREATE DIRECT KAFKA STREAM WITH BROKERS AND TOPICS
    streamData = KafkaUtils.createDirectStream(
        ssc, [kafka_topic], {"metadata.broker.list": kafka_brokers})

    ######### FROM NOW ON, EACH ACTION OR TRANSFORMATION IS DONE ON A SINGLE INCOMING BATCH OF TWEETS #########

    # PRE-PROCESSING TWEETS DATA (TESTING)
Code Example #29
sc = SparkContext(appName="StreamingKMeans")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("/tmp/checkpoints/")

initialCenters = [[604328, 574379], [801908, 318382], [416383, 786204],
                  [822771, 732034], [850993, 157873], [338586, 563537],
                  [169274, 348574], [619259, 397671], [241071, 844424],
                  [321801, 165319], [139493, 557352], [508785, 174800],
                  [398934, 404142], [860858, 546059], [674365, 860464]]

initialWeights = [
    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
]

stkm = StreamingKMeans(k=int(sys.argv[1]), decayFactor=1.0).setInitialCenters(
    [[500, 500], [600, 600]], [1.0, 1.0])

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto_offset_reset": 'earliest'
    })

parsed = directKafkaStream.map(lambda v: loads(v[1]))

parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))

stkm.trainOn(parsed)

Code Example #30
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/')
# Append the python/build to PYTHONPATH so that py4j could be found
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip')
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc  = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with two working thread and batch interval of 1 second

ssc = StreamingContext(sc, 10)
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()
model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
Code Example #31
        return LabeledPoint(label, vec)

    trainingData = sc.textFile("/Users/tung/Documents/spark-2.4.3/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

    testingData = sc.textFile(
        "/Users/tung/Documents/spark-2.4.3/data/mllib/streaming_kmeans_data_test.txt"
    ).map(parse)

    trainingQueue = [trainingData]
    testingQueue = [testingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.queueStream(testingQueue)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)

    result = model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

    print("Final centers: " + str(model.latestModel().centers))
Code Example #32
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two working thread and batch interval of 1 second
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# continuous training
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)
testData = ssc.textFileStream("/testing/data/dir").map(
    lambda s: LabeledPoint.parse(s))

model = StreamingKMeans()\
        .setK(3)\
        .setDecayFactor(1.0)\
        .setRandomCenters(dim=3, weight=0.0, seed=42)

model.trainOn(trainingData)
prediction = model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features)))
prediction.pprint()

ssc.start()
ssc.awaitTermination()
Code Example #33
    return LabeledPoint(label_, vec)


#testingData = sc.textFile("data/mllib/streaming_kmeans_data_test.txt").map(parse)
#testingData = sc.textFile("streaming_kmeans_data_test.txt").map(parse)
testingData = sc.parallelize(train_vec).map(parse_vec)

trainingQueue = [trainingData]
testingQueue = [testingData]

trainingStream = ssc.queueStream(trainingQueue)
testingStream = ssc.queueStream(testingQueue)

# We create a model with random clusters and specify the number of clusters to find
#model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model = StreamingKMeans(k=3, decayFactor=1.0).setRandomCenters(100, 1.0, 0)

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.

#model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
#model.trainOn(trainingStream)
#print("TEST HERE")
#result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
model.trainOn(trainingStream)
print("TEST HERE")
result = model.predictOnValues(
    testingStream.map(lambda lp: (lp.label, lp.features)))
#result.pprint(num = 20)

#if result.count() != 0:
Code Example #34
    sc = SparkContext(master="local[4]", appName="Streaming-KMeans", conf=conf)

    ssc = StreamingContext(sc, 5)

    # Kafka Stream
    ks = KafkaUtils.createDirectStream(
        ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

    trainingData = sc.textFile("data/datatraining.txt")\
        .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

    # Supplied to StreamingKMeans as initial centers, because the random centers chosen by StreamingKMeans are not giving good predictions
    init_centers = KMeans.train(trainingData, 2).centers

    model = StreamingKMeans(k=2, decayFactor=0.1)\
        .setInitialCenters(init_centers, [1.0, 1.0, 1.0, 1.0, 1.0])

    model.trainOn(ssc.queueStream([trainingData]))

    def parse(lp):
        arr = lp.split(',')[2:-1]
        label = lp.split(',')[0]
        vec = Vectors.dense([float(x) for x in arr])
        return LabeledPoint(label, vec)

    test_stream = ks.map(lambda x: x[1]).map(parse)

    result = model.predictOnValues(
        test_stream.map(lambda lp: (lp.label, lp.features)))

    # Prints Prediction and Cluster Centers
Code Example #35
         .filter(lambda post: 'created_at' in post)\
         .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()

    trainingData = dstream_tweets.map(
        lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist())
    #trainingData.pprint()
    testdata = dstream_tweets.map(lambda tpl: (
        ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist()))
    #testdata.pprint()
    #
    model = StreamingKMeans(k=clusterNum,
                            decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust = model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    topic = clust.map(lambda x: (x[1], x[0][1]))
    #topic.pprint()
    topicAgg = topic.reduceByKey(lambda x, y: x + y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint()

    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()
Code Example #36
    def streaming(self, mnk, clusters, init_clusters):
        self.mnk=mnk
        self.clusters=clusters
        self.init_clusters=init_clusters
        self.streaming_kmeans=StreamingKMeans(self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(self.mnk.cluster_centers_, np.ones([self.init_clusters]))