    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        self._eventually(condition, catch_assertions=True)
    def test_trainOn_model(self):
        """Test the model on toy data with four clusters."""
        stkm = StreamingKMeans()
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm.setInitialCenters(
            centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

        # Create a toy dataset by setting a tiny offset for each point.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        batches = []
        for offset in offsets:
            batches.append([[offset[0] + center[0], offset[1] + center[1]]
                            for center in initCenters])

        batches = [self.sc.parallelize(batch, 1) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        stkm.trainOn(input_stream)
        self.ssc.start()

        # Give enough time to train the model.
        def condition():
            finalModel = stkm.latestModel()
            self.assertTrue(all(finalModel.centers == array(initCenters)))
            self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
            return True
        self._eventually(condition, catch_assertions=True)
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [
            self.sc.parallelize(batch, 1) for batch in predict_data
        ]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        eventually(condition, catch_assertions=True)
Example #4
def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    features = tweets.map(lambda x:
                          (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i).map(lambda x: '%i: %s' %
                                                (x[1], x[0])).pprint(3)

    for i in range(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
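
The "decay = 1: total memory; decay = 0: no memory" comment above refers to the forgetful update rule that StreamingKMeans applies once per batch. The following plain-NumPy sketch of that rule (an illustration based on the formula in the Spark MLlib docs, not the library code itself) shows how decayFactor trades off old and new data:

import numpy as np

def update_center(center, weight, batch_points, decay):
    """One cluster's per-batch update: `center`/`weight` are the running state,
    `batch_points` are the points assigned to this cluster in the current batch."""
    m = len(batch_points)
    if m == 0:
        return center, weight * decay
    batch_mean = np.mean(batch_points, axis=0)
    new_weight = weight * decay + m
    new_center = (center * weight * decay + batch_mean * m) / new_weight
    return new_center, new_weight

# decay=1.0 keeps the full history; decay=0.0 forgets it, so the center
# jumps straight to the current batch mean.
c, w = update_center(np.array([1.0, 1.0]), 1.0, np.array([[2.0, 2.0]]), decay=0.0)
print(c, w)   # [2. 2.] 1.0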
Example #5
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(result, [[0], [1], [2], [3]])
Example #6
    def perform_training(sc: SparkContext, params_dict: dict):
        batch_duration = params_dict.get('batch_duration', 1)
        training_duration = params_dict.get('training_duration', 20)
        ssc = StreamingContext(sc, batch_duration)
        topics = ['normal-ekg-stream']
        kafka_params = {'metadata.broker.list': 'localhost:9092'}
        kvs = KafkaUtils.createDirectStream(
            ssc,
            topics,
            kafkaParams=kafka_params,
            valueDecoder=lambda val: json.loads(val.decode('utf-8')))

        windowed_signal = kvs.map(lambda msg: Vectors.dense(
            [float(value) for value in msg[1]['signal_values']]))

        # windowed_signal.foreachRDD(Plotter.plot_signal_window)
        model = StreamingKMeans(k=20,
                                decayFactor=1.0).setRandomCenters(188, 1.0, 0)
        model.trainOn(windowed_signal)

        ssc.start()
        ssc.awaitTerminationOrTimeout(training_duration)
        ssc.stop(stopSparkContext=False, stopGraceFully=True)

        return model.latestModel()
    def test_accuracy_for_single_center(self):
        """Test that parameters obtained are correct for a single center."""
        centers, batches = self.streamingKMeansDataGenerator(batches=5,
                                                             numPoints=5,
                                                             k=1,
                                                             d=5,
                                                             r=0.1,
                                                             seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        input_stream = self.ssc.queueStream(
            [self.sc.parallelize(batch, 1) for batch in batches])
        stkm.trainOn(input_stream)

        self.ssc.start()

        def condition():
            self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
            return True

        eventually(condition, catch_assertions=True)

        realCenters = array_sum(array(centers), axis=0)
        for i in range(5):
            modelCenters = stkm.latestModel().centers[0][i]
            self.assertAlmostEqual(centers[0][i], modelCenters, 1)
            self.assertAlmostEqual(realCenters[i], modelCenters, 1)
Example #8
    def test_trainOn_predictOn(self):
        """Test that prediction happens on the updated model."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # Since decay factor is set to zero, once the first batch
        # is passed the clusterCenters are updated to [-0.5, 0.7]
        # which causes 0.2 & 0.3 to be classified as 1, even though the
        # classification based on the initial model would have been 0
        # proving that the model is updated.
        batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        batches = [self.sc.parallelize(batch) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        predict_results = []

        def collect(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                predict_results.append(rdd_collect)

        stkm.trainOn(input_stream)
        predict_stream = stkm.predictOn(input_stream)
        predict_stream.foreachRDD(collect)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
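
The comment above can be checked by hand: with decayFactor=0.0 each center simply becomes the mean of the points assigned to it in the latest batch. A small plain-Python sanity check of that arithmetic (independent of Spark):

init = [0.0, 1.0]
batch1 = [-0.5, 0.6, 0.8]
assign = lambda x, centers: min(range(len(centers)), key=lambda i: abs(x - centers[i]))

groups = {0: [], 1: []}
for x in batch1:
    groups[assign(x, init)].append(x)
updated = [sum(groups[i]) / len(groups[i]) for i in (0, 1)]

print(updated)                                         # [-0.5, 0.7]
print([assign(x, updated) for x in [0.2, -0.1, 0.3]])  # [1, 0, 1]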
Example #9
    def test_trainOn_model(self):
        """Test the model on toy data with four clusters."""
        stkm = StreamingKMeans()
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm.setInitialCenters(centers=initCenters,
                               weights=[1.0, 1.0, 1.0, 1.0])

        # Create a toy dataset by setting a tiny offset for each point.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        batches = []
        for offset in offsets:
            batches.append([[offset[0] + center[0], offset[1] + center[1]]
                            for center in initCenters])

        batches = [self.sc.parallelize(batch, 1) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        stkm.trainOn(input_stream)
        t = time()
        self.ssc.start()

        # Give enough time to train the model.
        self._ssc_wait(t, 6.0, 0.01)
        finalModel = stkm.latestModel()
        self.assertTrue(all(finalModel.centers == array(initCenters)))
        self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
Example #10
    def test_model_params(self):
        """Test that the model params are set correctly"""
        stkm = StreamingKMeans()
        stkm.setK(5).setDecayFactor(0.0)
        self.assertEqual(stkm._k, 5)
        self.assertEqual(stkm._decayFactor, 0.0)

        # Model not set yet.
        self.assertIsNone(stkm.latestModel())
        self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

        stkm.setInitialCenters(
            centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
        self.assertEqual(
            stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
        self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])
Example #11
    def test_trainOn_predictOn(self):
        """Test that prediction happens on the updated model."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # Since decay factor is set to zero, once the first batch
        # is passed the clusterCenters are updated to [-0.5, 0.7]
        # which causes 0.2 & 0.3 to be classified as 1, even though the
        # classification based on the initial model would have been 0
        # proving that the model is updated.
        batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        batches = [self.sc.parallelize(batch) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        predict_results = []

        def collect(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                predict_results.append(rdd_collect)

        stkm.trainOn(input_stream)
        predict_stream = stkm.predictOn(input_stream)
        predict_stream.foreachRDD(collect)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
Example #12
    def test_model_params(self):
        """Test that the model params are set correctly"""
        stkm = StreamingKMeans()
        stkm.setK(5).setDecayFactor(0.0)
        self.assertEqual(stkm._k, 5)
        self.assertEqual(stkm._decayFactor, 0.0)

        # Model not set yet.
        self.assertIsNone(stkm.latestModel())
        self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

        stkm.setInitialCenters(
            centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
        self.assertEqual(
            stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
        self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])
Example #13
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(result, [[0], [1], [2], [3]])
Example #14
class StreamingUpdate(object):
    """
    Streaming Update: DStream
    """
    def __init__(self, init_clusters, decay_factor, time_unit, sc, ssc):
        self.init_clusters=init_clusters
        self.decay_factor=decay_factor
        self.time_unit=time_unit
        self.sc=sc
        self.ssc=ssc

    # implement
    def streaming(self, mnk, clusters, init_clusters):
        self.mnk=mnk
        self.clusters=clusters
        self.init_clusters=init_clusters
        self.streaming_kmeans=StreamingKMeans(self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(self.mnk.cluster_centers_, np.ones([self.init_clusters]))

    # update shape for centers in StreamingContext
    """
    Từ điển được cập nhật khi có tin tức mới đến thì em cập nhật lại kích thước của các centroid
    VD: Từ điển ban đầu có kích thước 10 từ
    Em biểu diễn một câu có 5 từ bằng sparse vector kích thước 5x10
    Từ điển sau khi cập nhật có 15 từ thì câu trên phải biểu diễn lại bằng sparse vector có kích thước 5x15
    Có cách biểu diễn khác mà không phải cập nhật lại biểu diễn của câu không ạ
    """
    def update_shape(self, docs, dictionary):
        self.streaming_kmeans.setRandomCenters(matrix_tfidf(docs, dictionary).shape[1], 1.0, 0)

    # save matrix update
    def save_matrix_update(self, docs, dictionary):
        np.savetxt('/home/ducvu/input_streaming.txt', matrix_tfidf(docs, dictionary))

    # load dstream
    def load_dstream(self):
        rdd = self.sc.textFile("/home/ducvu/input_streaming.txt")\
            .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
        # trainOn() expects a DStream, so wrap the RDD in a single-batch queue stream
        self.dstream = self.ssc.queueStream([rdd])
        return self.dstream
        
    # make predict
    def make_predict(self, docs, dictionary):
        self.streaming_kmeans.trainOn(self.load_dstream())
        self.pred_stream = []
        matrix = matrix_tfidf(docs, dictionary)
        for x in matrix:
            self.pred_stream.append(self.streaming_kmeans.latestModel().predict(x))
        self.pred_stream = np.array(self.pred_stream)
        
        df = pd.DataFrame(matrix).groupby(self.pred_stream).mean()
        for i, r in df.iterrows():
            print('\nCluster {0}:'.format(i))
            print(','.join([get_tfidf(dictionary).get_feature_names()[t] for t in np.argsort(r)[-15:]]))
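
The (translated) docstring above asks whether documents can be represented so that the cluster centers never have to be resized when the dictionary grows. One common answer, already used in Example #4, is the hashing trick: HashingTF maps tokens into a fixed number of buckets, so feature vectors keep a constant dimension no matter how many new words appear. A minimal sketch (the dimension 2**12 is an arbitrary choice):

from pyspark.mllib.feature import HashingTF

DIM = 2 ** 12                        # fixed feature dimension, chosen up front
hasher = HashingTF(DIM)

v1 = hasher.transform("old vocabulary words".split())
v2 = hasher.transform("entirely new words arriving later".split())
assert v1.size == v2.size == DIM     # both documents live in the same space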
Example #15
    def test_accuracy_for_single_center(self):
        """Test that parameters obtained are correct for a single center."""
        centers, batches = self.streamingKMeansDataGenerator(
            batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        input_stream = self.ssc.queueStream(
            [self.sc.parallelize(batch, 1) for batch in batches])
        stkm.trainOn(input_stream)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 10.0, 0.01)
        self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
        realCenters = array_sum(array(centers), axis=0)
        for i in range(5):
            modelCenters = stkm.latestModel().centers[0][i]
            self.assertAlmostEqual(centers[0][i], modelCenters, 1)
            self.assertAlmostEqual(realCenters[i], modelCenters, 1)
    def detect(self, k, t):
        # Encoding categorical features using one-hot.
        df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
        df1.show(n=2, truncate=False)

        # Clustering points using KMeans
        features = df1.select("features").rdd.map(lambda row: row[0]).cache()
        model = StreamingKMeans(k=7,
                                decayFactor=1.0).setRandomCenters(4, 1.0, 0)
        # model = KMeans.train(features, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

        # Adding the prediction column to df1
        # StreamingKMeans itself has no predict(); broadcast its latestModel() instead
        modelBC = sc.broadcast(model.latestModel())
        predictUDF = udf(lambda x: str(modelBC.value.predict(x)), StringType())
        df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
        df2.show(n=3, truncate=False)

        # Adding the score column to df2; The higher the score, the more likely it is an anomaly
        df3 = self.addScore(df2).cache()
        df3.show(n=3, truncate=False)

        return df3.where(df3.score > t)
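
The addScore helper referenced above is not shown; the comment only says that higher scores mark more anomalous points. A common way to derive such a score from a clustering is by cluster size, since points in rare clusters are more suspicious. The sketch below is only an illustration of that idea (the name add_score and the formula are assumptions, not the original addScore):

from pyspark.sql import functions as F

def add_score(df):
    """Hypothetical scoring: score = (N_max - N_cluster) / (N_max - N_min),
    so points in the rarest cluster get a score of 1.0."""
    sizes = df.groupBy("prediction").count()
    n_max = sizes.agg(F.max("count")).first()[0]
    n_min = sizes.agg(F.min("count")).first()[0]
    scored = sizes.withColumn(
        "score", (F.lit(n_max) - F.col("count")) / F.lit(max(n_max - n_min, 1)))
    return df.join(scored.select("prediction", "score"), on="prediction")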
Example #17
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 2)

    ssc.checkpoint("/tmp/spark")

    def parseTrainingData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("./training")\
                        .map(parseTrainingData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    print "Intial centers: {0}".format(model.latestModel().centers)

    model.trainOn(trainingStream)

    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centers(sc, model):
        print "Cluster centers: {0}".format(model.latestModel().centers)
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    s.run()
Example #18
            #         ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
            #         ptext[clus].set_color(colors[clus])
            #     plt.pause(0.0001)
            #


q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting,
                                                args=(q, ))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData = trainingData.map(lambda x: (x, x))
testData.pprint()
model = StreamingKMeans(k=clusterNum,
                        decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)
clust = model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()
Example #19
File: example.py Project: AAB94/RP

trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

centers = KMeans.train(trainingData, 2).centers


trainingQueue = [trainingData]


trainingStream = ssc.queueStream(trainingQueue)


# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=0.3)  # .setRandomCenters(5, 1.0, 0)
model.setInitialCenters(centers, [1.0, 1.0])  # one weight per cluster (k=2)
# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)

def parse(lp):
    #label = float(lp[lp.find('(') + 1: lp.find(')')])
    #vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    arr = lp.split(',')[2:-1]
    label = lp.split(',')[0]
    label = label[1:-1]
    vec = Vectors.dense([float(x) for x in arr])
    print(model.latestModel().centers)
    return LabeledPoint(label, vec)
Example #20
    for center in file:
        initialCenters.append([float(x) for x in center.split()])

logging.info(initialCenters)

initialWeights = []
for i in initialCenters:
    initialWeights.append(1.0)

config = sc.broadcast(parameters)
numberClusters = config.value[0]
mongoIP = config.value[1]
mongoDataBase = config.value[2]
mongoCollection = config.value[3]

stkm = StreamingKMeans(k=int(numberClusters), decayFactor=1.0).setInitialCenters(
    initialCenters, initialWeights)

#stkm = StreamingKMeans(k=int(numberClusters),decayFactor=1.0).setRandomCenters(2,1.0,100)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto_offset_reset": 'earliest'
    })

parsed = directKafkaStream.map(lambda v: loads(v[1]))

parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))

stkm.trainOn(parsed)
    def parse(lp):
        label = float(lp[lp.find('(') + 1: lp.find(')')])
        vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
        return LabeledPoint(label, vec)

    trainingData = sc.textFile("spark-2.0.1-bin-hadoop2.7/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))


    trainingQueue = [trainingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.textFileStream('history').map(parse)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)

    result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    ssc.awaitTermination()
    #ssc.stop(stopSparkContext=True, stopGraceFully=True)
    # $example off$

    print("Final centers: " + str(model.latestModel().centers))
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)

    ssc.checkpoint("chkpfile")

    def parserData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("/files").map(parserData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    print("Initial Centres" + str(model.latestModel().centers))
    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(sc, model):
        print(str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centres, (s, model))

    s.enter(10, 1, print_cluster_centres, (s, model))
    s.run()

    ssc.awaitTermination()
# to make this work
         .filter(lambda post: 'created_at' in post)\
         .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()



    trainingData=dstream_tweets.map(lambda tpl: [tpl[0],tpl[1]]+tpl[3].tolist())
    #trainingData.pprint()
    testdata=dstream_tweets.map(lambda tpl: (([tpl[0],tpl[1]],tpl[2]),[tpl[0],tpl[1]]+tpl[3].tolist()))
    #testdata.pprint()
    #
    model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust=model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    topic=clust.map(lambda x: (x[1],x[0][1]))
    #topic.pprint()
    topicAgg = topic.reduceByKey(lambda x,y: x+y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0],freqcount(x[1]))).pprint()

    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()
Example #24
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(lp):
    label = float(lp[lp.find('(') + 1:lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()
model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features))).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
            #         clus=row[0]
            #         #ptext[clus].set_text(str(clus)+ ':'+str([x[0] for x in row[1][1]]))
            #         ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
            #         ptext[clus].set_color(colors[clus])
            #     plt.pause(0.0001)
            #


q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting,args=(q,))
job_for_another_core2.start()

sc  = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData=trainingData.map(lambda x: (x,x))
testData.pprint()
model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)
clust=model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()
    os.makedirs("results")
except:
    pass

output_file = open(RESULT_FILE, "w")

start = time.time()

#output_file.write("Measurement,Number_Partitions, Time\n")
#output_file.write("Spark Startup, %s, %.5f\n"%(NUMBER_PARTITIONS, time.time()-start))
#output_file.flush()
#######################################################################################

decayFactor = 1.0
timeUnit = "batches"
model = StreamingKMeans(k=10, decayFactor=decayFactor,
                        timeUnit=timeUnit).setRandomCenters(3, 1.0, 0)

#def printOffsetRanges(rdd):
#    for o in offsetRanges:
#        print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset)


def count_records(rdd):
    print(str(type(rdd)))
    if rdd is not None:
        return rdd.collect()

    return [0]


## OK
    sc = SparkContext(appName="StreamingKMeansClustering")
    ssc = StreamingContext(sc, 10)

    ssc.checkpoint("file:///tmp/spark")

    def parseTrainingData(line):
      cells = line.split(",")
      return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("file:///Users/jananiravi/spark/spark-2.1.0-bin-without-hadoop/tweets/training")\
      .map(parseTrainingData)

    trainingStream.pprint()

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    print("Initial centers: " + str(model.latestModel().centers))

    model.trainOn(trainingStream)

    ssc.start()

    s = sched.scheduler(time.time, time.sleep)
    def print_cluster_centers(sc, model): 
        print("Cluster centers: " + str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    s.run()
        print('Applying PCA on training data...')
        PCA_model = PCA(low_dim).fit(tfidf_training)
        tfidf_training = PCA_model.transform(tfidf_training)
        k = low_dim

    # pcArray = model.transform(tfidf_training.first()).toArray()

    #setting checkpoint
    # ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

    # CREATING DStream FROM TRAINING'S RDD
    trainingQueue = [tfidf_training]
    trainingStream = ssc.queueStream(trainingQueue)

    # CREATING A K-MEANS MODEL WITH RANDOM CLUSTERS SPECIFYING THE NUMBER OF CLUSTERS TO FIND
    model = StreamingKMeans(k=2, decayFactor=1.0,
                            timeUnit='batches').setRandomCenters(k, 1.0, 0)

    # print("K centers: " + str(model.latestModel().centers))

    # TRAINING THE MODEL ON THE TRAINING TWEET'S DATA
    print('Training K-means Model...')
    model.trainOn(trainingStream)
    print('done!')

    # CREATE DIRECT KAFKA STREAM WITH BROKERS AND TOPICS
    streamData = KafkaUtils.createDirectStream(
        ssc, [kafka_topic], {"metadata.broker.list": kafka_brokers})

    ######### FROM NOW ON, EACH ACTION OR TRANSFORMATION IS DONE ON A SINGLE INCOMING BATCH OF TWEETS #########

    # PRE-PROCESSING TWEETS DATA (TESTING)
Example #29
sc = SparkContext(appName="StreamingKMeans")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("/tmp/checkpoints/")

initialCenters = [[604328, 574379], [801908, 318382], [416383, 786204],
                  [822771, 732034], [850993, 157873], [338586, 563537],
                  [169274, 348574], [619259, 397671], [241071, 844424],
                  [321801, 165319], [139493, 557352], [508785, 174800],
                  [398934, 404142], [860858, 546059], [674365, 860464]]

initialWeights = [
    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
]

# Use the centers and weights defined above; k comes from the command line.
stkm = StreamingKMeans(k=int(sys.argv[1]), decayFactor=1.0).setInitialCenters(
    initialCenters, initialWeights)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto_offset_reset": 'earliest'
    })

parsed = directKafkaStream.map(lambda v: loads(v[1]))

parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))

stkm.trainOn(parsed)

sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/')
# Append the python/build to PYTHONPATH so that py4j could be found
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip')
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc  = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()
model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
Example #31
        return LabeledPoint(label, vec)

    trainingData = sc.textFile("/Users/tung/Documents/spark-2.4.3/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

    testingData = sc.textFile(
        "/Users/tung/Documents/spark-2.4.3/data/mllib/streaming_kmeans_data_test.txt"
    ).map(parse)

    trainingQueue = [trainingData]
    testingQueue = [testingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.queueStream(testingQueue)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)

    result = model.predictOnValues(
        testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

    print("Final centers: " + str(model.latestModel().centers))
Example #32
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two working thread and batch interval of 1 second
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# continuous training
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)
testData = ssc.textFileStream("/testing/data/dir").map(
    lambda s: LabeledPoint.parse(s))

model = StreamingKMeans()\
        .setK(3)\
        .setDecayFactor(1.0)\
        .setRandomCenters(dim=3, weight=0.0, seed=42)

model.trainOn(trainingData)
prediction = model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features)))
prediction.pprint()

ssc.start()
ssc.awaitTermination()
Example #33
    return LabeledPoint(label_, vec)


#testingData = sc.textFile("data/mllib/streaming_kmeans_data_test.txt").map(parse)
#testingData = sc.textFile("streaming_kmeans_data_test.txt").map(parse)
testingData = sc.parallelize(train_vec).map(parse_vec)

trainingQueue = [trainingData]
testingQueue = [testingData]

trainingStream = ssc.queueStream(trainingQueue)
testingStream = ssc.queueStream(testingQueue)

# We create a model with random clusters and specify the number of clusters to find
#model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model = StreamingKMeans(k=3, decayFactor=1.0).setRandomCenters(100, 1.0, 0)

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.

#model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
#model.trainOn(trainingStream)
#print("TEST HERE")
#result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
model.trainOn(trainingStream)
print("TEST HERE")
result = model.predictOnValues(
    testingStream.map(lambda lp: (lp.label, lp.features)))
#result.pprint(num = 20)

#if result.count() != 0:
Example #34
    sc = SparkContext(master="local[4]", appName="Streaming-KMeans", conf=conf)

    ssc = StreamingContext(sc, 5)

    # Kafka Stream
    ks = KafkaUtils.createDirectStream(
        ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

    trainingData = sc.textFile("data/datatraining.txt")\
        .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

    # Use the KMeans-trained centers as initial centers, since the random centers from StreamingKMeans were not giving good predictions
    init_centers = KMeans.train(trainingData, 2).centers

    model = StreamingKMeans(k=2, decayFactor=0.1)\
        .setInitialCenters(init_centers, [1.0, 1.0])  # one weight per cluster (k=2)

    model.trainOn(ssc.queueStream([trainingData]))

    def parse(lp):
        arr = lp.split(',')[2:-1]
        label = lp.split(',')[0]
        vec = Vectors.dense([float(x) for x in arr])
        return LabeledPoint(label, vec)

    test_stream = ks.map(lambda x: x[1]).map(parse)

    result = model.predictOnValues(
        test_stream.map(lambda lp: (lp.label, lp.features)))

    # Print predictions and cluster centers
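
The example is cut off right after this comment. A possible ending, reusing the patterns from the other examples here (an assumption, not the original code), would print each batch of (label, cluster) pairs together with the current centers and then start the streaming job:

    result.pprint()

    def print_centers(_time, _rdd):
        print("Cluster centers: " + str(model.latestModel().centers))

    result.foreachRDD(print_centers)

    ssc.start()
    ssc.awaitTermination()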
         .filter(lambda post: 'created_at' in post)\
         .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()

    trainingData = dstream_tweets.map(
        lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist())
    #trainingData.pprint()
    testdata = dstream_tweets.map(lambda tpl: (
        ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist()))
    #testdata.pprint()
    #
    model = StreamingKMeans(k=clusterNum,
                            decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust = model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    topic = clust.map(lambda x: (x[1], x[0][1]))
    #topic.pprint()
    topicAgg = topic.reduceByKey(lambda x, y: x + y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint()

    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()
Example #36
    def streaming(self, mnk, clusters, init_clusters):
        self.mnk = mnk
        self.clusters = clusters
        self.init_clusters = init_clusters
        self.streaming_kmeans = StreamingKMeans(self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(self.mnk.cluster_centers_, np.ones([self.init_clusters]))