def test_predictOn_model(self):
    """Check that predictOn maps each toy point to the index of its center."""
    centers = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm = StreamingKMeans()
    # The model is assigned directly; trainOn is never called in this test.
    stkm._model = StreamingKMeansModel(
        clusterCenters=centers,
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    # One single-point batch per center, in the same order as the centers.
    points = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    rdd_queue = [self.sc.parallelize(batch, 1) for batch in points]
    predictions = stkm.predictOn(self.ssc.queueStream(rdd_queue))

    result = []

    def update(rdd):
        # Only non-empty batches are recorded.
        collected = rdd.collect()
        if collected:
            result.append(collected)

    predictions.foreachRDD(update)
    self.ssc.start()

    def condition():
        self.assertEqual(result, [[0], [1], [2], [3]])
        return True

    self._eventually(condition, catch_assertions=True)
def test_trainOn_model(self):
    """Train on four toy batches and check the resulting centers and weights."""
    stkm = StreamingKMeans()
    initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm.setInitialCenters(
        centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

    # Build the toy dataset: every batch is the set of initial centers
    # shifted by one tiny offset.
    offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
    batches = [
        [[dx + cx, dy + cy] for cx, cy in initCenters]
        for dx, dy in offsets
    ]
    rdd_queue = [self.sc.parallelize(batch, 1) for batch in batches]
    stkm.trainOn(self.ssc.queueStream(rdd_queue))
    self.ssc.start()

    # Poll until the model has been trained on all batches.
    def condition():
        finalModel = stkm.latestModel()
        self.assertTrue(all(finalModel.centers == array(initCenters)))
        self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
        return True

    self._eventually(condition, catch_assertions=True)
def test_predictOn_model(self):
    """Verify that predictOn labels each toy point with its center's index."""
    quadrant_centers = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm = StreamingKMeans()
    stkm._model = StreamingKMeansModel(
        clusterCenters=quadrant_centers,
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    # Four batches of one point each, queued as single-partition RDDs.
    batches = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    predict_stream = self.ssc.queueStream(
        [self.sc.parallelize(batch, 1) for batch in batches])
    predict_val = stkm.predictOn(predict_stream)

    result = []

    def update(rdd):
        # Record the predictions of every non-empty batch.
        rows = rdd.collect()
        if rows:
            result.append(rows)

    predict_val.foreachRDD(update)
    self.ssc.start()

    def condition():
        self.assertEqual(result, [[0], [1], [2], [3]])
        return True

    eventually(condition, catch_assertions=True)
def main():
    """Cluster tweets read from a local socket and print samples per cluster.

    Each incoming line is parsed as JSON; records without a 'text' field are
    dropped. The text is hashed into a DIM-dimensional feature vector, a
    StreamingKMeans model with N clusters is trained on those vectors, and up
    to three predicted tweets per cluster are printed for every batch.
    Blocks until the streaming context terminates.
    """
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
        .map(lambda x: json.loads(x)) \
        .filter(lambda x: 'text' in x) \
        .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    # Keep the raw tweet next to its feature vector so it can be printed later.
    features = tweets.map(lambda x: (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of
    # clusters to find.
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # A separate function gives each lambda its own binding of i.
    def print_group(i):
        results.filter(lambda x: x[1] == i) \
            .map(lambda x: '%i: %s' % (x[1], x[0])) \
            .pprint(3)

    # range() instead of xrange(): xrange does not exist on Python 3.
    for i in range(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
def test_predictOn_model(self):
    """Test that the model predicts correctly on toy data."""
    stkm = StreamingKMeans()
    # The model is assigned directly; trainOn is never called in this test.
    stkm._model = StreamingKMeansModel(
        clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    # One single-point batch per center, in the same order as the centers.
    predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
    predict_stream = self.ssc.queueStream(predict_data)
    predict_val = stkm.predictOn(predict_stream)

    result = []

    def update(rdd):
        # Only non-empty batches are recorded.
        rdd_collect = rdd.collect()
        if rdd_collect:
            result.append(rdd_collect)

    predict_val.foreachRDD(update)
    t = time()
    self.ssc.start()
    self._ssc_wait(t, 6.0, 0.01)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(result, [[0], [1], [2], [3]])
def perform_training(sc: SparkContext, params_dict: dict):
    """Train a StreamingKMeans model on a Kafka topic for a limited time.

    Reads 'batch_duration' (default 1) and 'training_duration' (default 20)
    from ``params_dict``, consumes the 'normal-ekg-stream' topic, turns each
    message's 'signal_values' into a dense vector, trains on that stream, and
    stops the streaming context (but not ``sc``) before returning.

    Returns the latest model held by the StreamingKMeans estimator.
    """
    batch_duration = params_dict.get('batch_duration', 1)
    training_duration = params_dict.get('training_duration', 20)

    ssc = StreamingContext(sc, batch_duration)

    # Message values are UTF-8 encoded JSON documents.
    kvs = KafkaUtils.createDirectStream(
        ssc,
        ['normal-ekg-stream'],
        kafkaParams={'metadata.broker.list': 'localhost:9092'},
        valueDecoder=lambda val: json.loads(val.decode('utf-8')))

    # msg is a (key, value) pair; only the decoded value is used.
    windowed_signal = kvs.map(lambda msg: Vectors.dense(
        [float(value) for value in msg[1]['signal_values']]))
    # windowed_signal.foreachRDD(Plotter.plot_signal_window)

    model = StreamingKMeans(k=20, decayFactor=1.0)
    model = model.setRandomCenters(188, 1.0, 0)
    model.trainOn(windowed_signal)

    ssc.start()
    ssc.awaitTerminationOrTimeout(training_duration)
    ssc.stop(stopSparkContext=False, stopGraceFully=True)

    return model.latestModel()
def test_accuracy_for_single_center(self):
    """Check the learned weight and center when the data has a single center."""
    centers, batches = self.streamingKMeansDataGenerator(
        batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)

    stkm = StreamingKMeans(1)
    stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
    rdd_queue = [self.sc.parallelize(batch, 1) for batch in batches]
    stkm.trainOn(self.ssc.queueStream(rdd_queue))

    self.ssc.start()

    # 5 batches x 5 points: wait until the cluster weight reaches 25.
    def condition():
        self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
        return True

    eventually(condition, catch_assertions=True)

    realCenters = array_sum(array(centers), axis=0)
    for dim in range(5):
        learned = stkm.latestModel().centers[0][dim]
        self.assertAlmostEqual(centers[0][dim], learned, 1)
        self.assertAlmostEqual(realCenters[dim], learned, 1)
def test_trainOn_predictOn(self):
    """Check that predictions are made with the model as updated by training."""
    stkm = StreamingKMeans(decayFactor=0.0, k=2)
    stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

    # With a decay factor of zero, the first batch moves the cluster centers
    # to [-0.5, 0.7]. That makes 0.2 and 0.3 fall into cluster 1, whereas the
    # initial model would have put them in cluster 0 -- which shows that the
    # updated model is the one used for prediction.
    raw_batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
    input_stream = self.ssc.queueStream(
        [sc.parallelize(batch) for batch in raw_batches])

    predict_results = []

    def collect(rdd):
        # Only non-empty batches are recorded.
        rows = rdd.collect()
        if rows:
            predict_results.append(rows)

    stkm.trainOn(input_stream)
    stkm.predictOn(input_stream).foreachRDD(collect)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 6.0, 0.01)
    self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
def test_trainOn_model(self):
    """Test the model on toy data with four clusters."""
    stkm = StreamingKMeans()
    initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm.setInitialCenters(centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

    # Create a toy dataset by setting a tiny offset for each point.
    offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
    batches = [
        [[offset[0] + center[0], offset[1] + center[1]]
         for center in initCenters]
        for offset in offsets
    ]
    batches = [self.sc.parallelize(batch, 1) for batch in batches]
    input_stream = self.ssc.queueStream(batches)
    stkm.trainOn(input_stream)

    t = time()
    self.ssc.start()
    # Give enough time to train the model.
    self._ssc_wait(t, 6.0, 0.01)

    finalModel = stkm.latestModel()
    self.assertTrue(all(finalModel.centers == array(initCenters)))
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
def test_model_params(self):
    """Exercise the k / decay-factor setters and the initial-centers setter."""
    stkm = StreamingKMeans()
    stkm.setK(5).setDecayFactor(0.0)
    self.assertEqual(stkm._k, 5)
    self.assertEqual(stkm._decayFactor, 0.0)

    # Before any centers are supplied there is no model, and trainOn is
    # expected to raise ValueError.
    self.assertIsNone(stkm.latestModel())
    self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

    stkm.setInitialCenters(
        centers=[[0.0, 0.0], [1.0, 1.0]],
        weights=[1.0, 1.0])
    self.assertEqual(
        stkm.latestModel().centers,
        [[0.0, 0.0], [1.0, 1.0]])
    self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])
def test_model_params(self):
    """Test that the model params are set correctly."""
    # assertEqual is used throughout: the assertEquals alias is deprecated
    # and was removed from unittest in Python 3.12.
    stkm = StreamingKMeans()
    stkm.setK(5).setDecayFactor(0.0)
    self.assertEqual(stkm._k, 5)
    self.assertEqual(stkm._decayFactor, 0.0)

    # Model not set yet.
    self.assertIsNone(stkm.latestModel())
    self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

    stkm.setInitialCenters(
        centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
    self.assertEqual(
        stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
    self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])
def test_predictOn_model(self):
    """Test that the model predicts correctly on toy data."""
    stkm = StreamingKMeans()
    # The model is assigned directly; trainOn is never called in this test.
    stkm._model = StreamingKMeansModel(
        clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    # Four batches of one point each, queued as single-partition RDDs.
    predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
    predict_stream = self.ssc.queueStream(predict_data)
    predict_val = stkm.predictOn(predict_stream)

    result = []

    def update(rdd):
        # Record the predictions of every non-empty batch.
        rdd_collect = rdd.collect()
        if rdd_collect:
            result.append(rdd_collect)

    predict_val.foreachRDD(update)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 6.0, 0.01)
    # assertEquals is a deprecated alias (removed in Python 3.12); use
    # assertEqual.
    self.assertEqual(result, [[0], [1], [2], [3]])
class StreamingUpdate(object):
    """Streaming update of k-means clusters through a DStream.

    Holds the clustering configuration together with the SparkContext
    (``sc``) and StreamingContext (``ssc``) that the other methods use.
    """

    def __init__(self, init_clusters, decay_factor, time_unit, sc, ssc):
        """Store the StreamingKMeans settings and the Spark contexts."""
        self.init_clusters = init_clusters
        self.decay_factor = decay_factor
        self.time_unit = time_unit
        self.sc = sc
        self.ssc = ssc

    def streaming(self, mnk, clusters, init_clusters):
        """Create the StreamingKMeans estimator seeded from ``mnk``.

        ``init_clusters`` overrides the value given to the constructor and is
        used both as the estimator's k and as the number of unit weights.
        The initial centers are taken from ``mnk.cluster_centers_``.
        """
        self.mnk = mnk
        self.clusters = clusters
        self.init_clusters = init_clusters
        self.streaming_kmeans = StreamingKMeans(
            self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(
            self.mnk.cluster_centers_, np.ones([self.init_clusters]))

    # Author's note (translated from Vietnamese): the dictionary is updated
    # whenever new articles arrive, so the size of the centroids is updated
    # as well. Example: the initial dictionary has 10 words and a 5-word
    # sentence is represented by a 5x10 sparse vector; once the dictionary
    # grows to 15 words the same sentence has to be re-encoded as a 5x15
    # sparse vector. Open question: is there a representation that does not
    # require re-encoding the sentence after each dictionary update?
    def update_shape(self, docs, dictionary):
        """Re-initialise random centers sized to the current TF-IDF width."""
        dim = matrix_tfidf(docs, dictionary).shape[1]
        self.streaming_kmeans.setRandomCenters(dim, 1.0, 0)

    def save_matrix_update(self, docs, dictionary):
        """Write the TF-IDF matrix of ``docs`` to the streaming input file."""
        np.savetxt('/home/ducvu/input_streaming.txt',
                   matrix_tfidf(docs, dictionary))

    def load_dstream(self):
        """Load the saved matrix as a DStream, store it and return it.

        The file is read as an RDD of dense vectors and wrapped with
        ``queueStream`` because ``StreamingKMeans.trainOn`` only accepts a
        DStream. The original built a plain RDD and returned nothing.
        """
        rdd = self.sc.textFile("/home/ducvu/input_streaming.txt")\
            .map(lambda line: Vectors.dense(
                [float(x) for x in line.strip().split(' ')]))
        self.dstream = self.ssc.queueStream([rdd])
        return self.dstream

    def make_predict(self, docs, dictionary):
        """Predict a cluster per document and print top terms per cluster.

        NOTE(review): trainOn only registers the training stream; the model
        is presumably updated only once the StreamingContext is started by
        the caller, so predictions made before that use the current centers.
        """
        self.streaming_kmeans.trainOn(self.load_dstream())

        # Pass the dictionary, as every other matrix_tfidf call here does.
        matrix = matrix_tfidf(docs, dictionary)
        # latestModel() is the StreamingKMeans accessor; no `latest_model`
        # attribute exists on the estimator.
        model = self.streaming_kmeans.latestModel()
        self.pred_stream = np.array([model.predict(x) for x in matrix])

        # Average the TF-IDF rows per predicted cluster, then list the 15
        # highest-valued terms of each cluster mean.
        df = pd.DataFrame(matrix).groupby(self.pred_stream).mean()
        feature_names = get_tfidf(dictionary).get_feature_names()
        for i, r in df.iterrows():
            print('\nCluster {0}:'.format(i))
            print(','.join([feature_names[t] for t in np.argsort(r)[-15:]]))
def test_accuracy_for_single_center(self):
    """Test that parameters obtained are correct for a single center."""
    centers, batches = self.streamingKMeansDataGenerator(
        batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
    stkm = StreamingKMeans(1)
    stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
    input_stream = self.ssc.queueStream(
        [self.sc.parallelize(batch, 1) for batch in batches])
    stkm.trainOn(input_stream)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 10.0, 0.01)
    # 5 batches x 5 points are expected to give a cluster weight of 25.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(stkm.latestModel().clusterWeights, [25.0])

    realCenters = array_sum(array(centers), axis=0)
    for i in range(5):
        modelCenters = stkm.latestModel().centers[0][i]
        self.assertAlmostEqual(centers[0][i], modelCenters, 1)
        self.assertAlmostEqual(realCenters[i], modelCenters, 1)
def detect(self, k, t):
    """Return the rows of ``self.rawDF`` whose anomaly score exceeds ``t``.

    Steps: one-hot encode columns 0 and 1 via ``self.cat2Num``, attach a
    cluster prediction to every row, score the rows with ``self.addScore``
    and keep those with ``score > t``. A preview of each intermediate
    DataFrame is shown along the way.

    NOTE(review): ``k`` is not used; the clustering model is built with a
    fixed k=7 and random 4-dimensional centers and is never trained on the
    data. Confirm whether ``k`` and a training step are intended here.
    """
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans. StreamingKMeans itself has no
    # predict(); the prediction API lives on the model returned by
    # latestModel(), so that is what gets broadcast to the UDF.
    streaming_kmeans = StreamingKMeans(k=7, decayFactor=1.0)
    streaming_kmeans = streaming_kmeans.setRandomCenters(4, 1.0, 0)
    model = streaming_kmeans.latestModel()

    # Adding the prediction column to df1
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely
    # it is an anomaly.
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("/tmp/spark")

    def parseTrainingData(line):
        """Parse a 'x,y[,...]' line into a 2-dimensional dense vector."""
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("./training")\
        .map(parseTrainingData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    # print() calls with a single argument work on both Python 2 and 3; the
    # original print statements are a syntax error on Python 3.
    print("Intial centers: {0}".format(model.latestModel().centers))

    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centers(sc, model):
        """Print the current centers and reschedule itself in 10 seconds.

        Despite its name, ``sc`` receives the scheduler ``s`` (see the
        first ``s.enter`` call below), not the SparkContext.
        """
        print("Cluster centers: {0}".format(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (s, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    # Runs until interrupted: every callback schedules the next one.
    s.run()
# ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
# ptext[clus].set_color(colors[clus])
# plt.pause(0.0001)
#

# Queue used to hand each batch of predictions to the plotting process.
q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(
    target=data_plotting,
    args=(q, ),
)
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Local StreamingContext (master 'local[4]') with a batch interval of 10.
ssc = StreamingContext(sc, 10)

# Lines arriving on the socket are parsed into vectors for training.
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()

# Each vector is used as its own key for predictOnValues.
testData = trainingData.map(lambda x: (x, x))
testData.pprint()

model = StreamingKMeans(
    k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)

clust = model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

# Forward every batch of (vector, cluster) pairs to the plotting process.
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

ssc.start()
ssc.awaitTermination()
# Feature columns are everything between the first two and the last field.
trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1])\
    .map(lambda arr: Vectors.dense([float(x) for x in arr]))

# Seed the streaming model with centers found by batch KMeans.
centers = KMeans.train(trainingData, 2).centers

trainingQueue = [trainingData]
trainingStream = ssc.queueStream(trainingQueue)

# We create a model and specify the number of clusters to find.
model = StreamingKMeans(k=2, decayFactor=0.3)  # .setRandomCenters(5, 1.0, 0)
# One weight per center: the original passed five weights (the number of
# feature columns) for two centers.
model.setInitialCenters(centers, [1.0] * len(centers))

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they
# arrive.
model.trainOn(trainingStream)


def parse(lp):
    """Parse a comma-separated record into a LabeledPoint.

    The first field, with its first and last characters stripped, is the
    label; fields 2 up to (excluding) the last are the features. Also prints
    the model's current centers on every call.
    """
    #label = float(lp[lp.find('(') + 1: lp.find(')')])
    #vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    fields = lp.split(',')
    label = fields[0][1:-1]
    vec = Vectors.dense([float(x) for x in fields[2:-1]])
    print(model.latestModel().centers)
    return LabeledPoint(label, vec)
# Every line of the centers file is one whitespace-separated center.
initialCenters.extend(center.split() for center in file)
logging.info(initialCenters)

# One unit weight per initial center.
initialWeights = [1.0] * len(initialCenters)

# Job parameters, read back from a broadcast variable.
config = sc.broadcast(parameters)
numberClusters = config.value[0]
mongoIP = config.value[1]
mongoDataBase = config.value[2]
mongoCollection = config.value[3]

stkm = StreamingKMeans(k=numberClusters, decayFactor=1).setInitialCenters(
    initialCenters, initialWeights)
#stkm = StreamingKMeans(k=int(numberClusters),decayFactor=1.0).setRandomCenters(2,1.0,100)

# The Kafka 0.8 direct stream reads the "auto.offset.reset" key and only
# treats "smallest" as "start from the earliest offset". The original
# "auto_offset_reset": 'earliest' pair is not recognised, so the stream
# silently started from the latest offset.
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto.offset.reset": "smallest"
    })

# v is a (key, value) pair; the value is a JSON-encoded line of numbers.
parsed = directKafkaStream.map(lambda v: loads(v[1]))
parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))
stkm.trainOn(parsed)
def parse(lp):
    """Parse a '(label), [f1,f2,...]' line into a LabeledPoint."""
    label_text = lp[lp.find('(') + 1: lp.find(')')]
    feature_text = lp[lp.find('[') + 1: lp.find(']')]
    return LabeledPoint(float(label_text), Vectors.dense(feature_text.split(',')))


# Training data is read once from a file and queued as a single batch.
trainingData = sc.textFile("spark-2.0.1-bin-hadoop2.7/data/mllib/kmeans_data.txt")\
    .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

trainingQueue = [trainingData]
trainingStream = ssc.queueStream(trainingQueue)

# Test data comes from files appearing in the 'history' directory.
testingStream = ssc.textFileStream('history').map(parse)

# We create a model with random clusters and specify the number of clusters
# to find.
model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they
# arrive.
model.trainOn(trainingStream)

labelled_features = testingStream.map(lambda lp: (lp.label, lp.features))
result = model.predictOnValues(labelled_features)
result.pprint()

ssc.start()
ssc.awaitTermination()
#ssc.stop(stopSparkContext=True, stopGraceFully=True)
# $example off$

print("Final centers: " + str(model.latestModel().centers))
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("chkpfile")

    def parserData(line):
        """Turn the first two comma-separated fields into a dense vector."""
        first, second = line.split(",")[:2]
        return Vectors.dense([float(first), float(second)])

    trainingStream = ssc.textFileStream("/files").map(parserData)

    model = StreamingKMeans(k=2, decayFactor=1.0)
    model = model.setRandomCenters(2, 1.0, 0)
    print("Initial Centres" + str(model.latestModel().centers))

    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(sc, model):
        """Print the current centers, then reschedule this call in 10 s.

        ``sc`` receives the scheduler and is not used inside the function.
        """
        print(str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centres, (s, model))

    s.enter(10, 1, print_cluster_centres, (s, model))
    s.run()
    ssc.awaitTermination()  # to make this work
.filter(lambda post: 'created_at' in post)\ .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\ .filter(lambda tpl: tpl[0] != 0)\ .filter(lambda tpl: tpl[2] != '')\ .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\ .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2]))) #dstream_tweets.pprint() trainingData=dstream_tweets.map(lambda tpl: [tpl[0],tpl[1]]+tpl[3].tolist()) #trainingData.pprint() testdata=dstream_tweets.map(lambda tpl: (([tpl[0],tpl[1]],tpl[2]),[tpl[0],tpl[1]]+tpl[3].tolist())) #testdata.pprint() # model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3) model.trainOn(trainingData) clust=model.predictOnValues(testdata) #clust.pprint() #words = lines.flatMap(lambda line: line.split(" ")) topic=clust.map(lambda x: (x[1],x[0][1])) #topic.pprint() topicAgg = topic.reduceByKey(lambda x,y: x+y) #wordCollect.pprint() topicAgg.map(lambda x: (x[0],freqcount(x[1]))).pprint() clust.foreachRDD(lambda time, rdd: q.put(rdd.collect())) # Run! ssc.start() ssc.awaitTermination()
) from pyspark import SparkContext from pyspark.streaming import StreamingContext from pyspark.mllib.linalg import Vectors from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.clustering import StreamingKMeans def parse(lp): label = float(lp[lp.find('(') + 1:lp.find(',')]) vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(',')) return LabeledPoint(label, vec) sc = SparkContext('local[4]', 'Social Panic Analysis') # Create a local StreamingContext with two working thread and batch interval of 1 second ssc = StreamingContext(sc, 10) trainingData = ssc.textFileStream("./training/").map(Vectors.parse) trainingData.pprint() testData = ssc.textFileStream("./testing/").map(parse) testData.pprint() model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0) model.trainOn(trainingData) model.predictOnValues( testData.map(lambda lp: (lp.label, lp.features))).pprint() #print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features)))) ssc.start() ssc.awaitTermination()
# clus=row[0]
# #ptext[clus].set_text(str(clus)+ ':'+str([x[0] for x in row[1][1]]))
# ptext[clus].set_text(str(clus)+ ':'+str(row[1][1]))
# ptext[clus].set_color(colors[clus])
# plt.pause(0.0001)
#

# Predictions are pushed onto q for the separate plotting process.
q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(
    target=data_plotting, args=(q,))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Local StreamingContext (master 'local[4]') with a batch interval of 10.
ssc = StreamingContext(sc, 10)

# Socket lines become vectors; the same stream feeds training and testing.
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()

# predictOnValues is fed (vector, vector) pairs: each point is its own key.
testData = trainingData.map(lambda x: (x, x))
testData.pprint()

model = StreamingKMeans(k=clusterNum, decayFactor=0.1)\
    .setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)

clust = model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

# Ship each batch of predictions to the plotting process.
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

ssc.start()
ssc.awaitTermination()
os.makedirs("results") except: pass output_file = open(RESULT_FILE, "w") start = time.time() #output_file.write("Measurement,Number_Partitions, Time\n") #output_file.write("Spark Startup, %s, %.5f\n"%(NUMBER_PARTITIONS, time.time()-start)) #output_file.flush() ####################################################################################### decayFactor = 1.0 timeUnit = "batches" model = StreamingKMeans(k=10, decayFactor=decayFactor, timeUnit=timeUnit).setRandomCenters(3, 1.0, 0) #def printOffsetRanges(rdd): # for o in offsetRanges: # print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset) def count_records(rdd): print str(type(rdd)) if rdd != None: return rdd.collect() return [0] ## OK
sc = SparkContext(appName="StreamingKMeansClustering")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("file:///tmp/spark")


def parseTrainingData(line):
    """Build a dense vector from the first two comma-separated fields."""
    first, second = line.split(",")[:2]
    return Vectors.dense([float(first), float(second)])


trainingStream = ssc.textFileStream(
    "file:///Users/jananiravi/spark/spark-2.1.0-bin-without-hadoop/tweets/training"
).map(parseTrainingData)
trainingStream.pprint()

model = StreamingKMeans(k=2, decayFactor=1.0)
model = model.setRandomCenters(2, 1.0, 0)
print("Initial centers: " + str(model.latestModel().centers))

model.trainOn(trainingStream)
ssc.start()

s = sched.scheduler(time.time, time.sleep)


def print_cluster_centers(sc, model):
    """Print the current centers, then reschedule this call in 10 s.

    ``sc`` receives the scheduler on the first call and is passed along
    unchanged on every reschedule.
    """
    print("Cluster centers: " + str(model.latestModel().centers))
    s.enter(10, 1, print_cluster_centers, (sc, model))


s.enter(10, 1, print_cluster_centers, (s, model))
s.run()
# Reduce the training TF-IDF vectors to low_dim dimensions with PCA.
print('Applying PCA on training data...')
PCA_model = PCA(low_dim).fit(tfidf_training)
tfidf_training = PCA_model.transform(tfidf_training)

k = low_dim

# pcArray = model.transform(tfidf_training.first()).toArray()

# setting checkpoint
# ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

# Create a DStream from the training RDD (queued as a single batch).
trainingQueue = [tfidf_training]
trainingStream = ssc.queueStream(trainingQueue)

# Create a k-means model with random clusters, specifying the number of
# clusters to find. The centers have k (= low_dim) dimensions.
model = StreamingKMeans(
    k=2, decayFactor=1.0, timeUnit='batches'
).setRandomCenters(k, 1.0, 0)

# print("K centers: " + str(model.latestModel().centers))

# Train the model on the training tweets.
print('Training K-means Model...')
model.trainOn(trainingStream)
print('done!')

# Create a direct Kafka stream with brokers and topics.
streamData = KafkaUtils.createDirectStream(
    ssc,
    [kafka_topic],
    {"metadata.broker.list": kafka_brokers},
)

######### FROM NOW ON, EACH ACTION OR TRANSFORMATION IS DONE ON A SINGLE INCOMING BATCH OF TWEETS #########

# PRE-PROCESSING TWEETS DATA (TESTING)
sc = SparkContext(appName="StreamingKMeans")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("/tmp/checkpoints/")

# NOTE(review): initialCenters / initialWeights are defined but not passed
# to the model below, which is seeded with two hard-coded centers instead.
# Confirm which set of centers is intended.
initialCenters = [[604328, 574379], [801908, 318382], [416383, 786204],
                  [822771, 732034], [850993, 157873], [338586, 563537],
                  [169274, 348574], [619259, 397671], [241071, 844424],
                  [321801, 165319], [139493, 557352], [508785, 174800],
                  [398934, 404142], [860858, 546059], [674365, 860464]]
# One unit weight per initial center.
initialWeights = [1.0] * len(initialCenters)

# Command-line arguments are strings; k must be an integer.
stkm = StreamingKMeans(k=int(sys.argv[1]), decayFactor=1.0).setInitialCenters(
    [[500, 500], [600, 600]], [1.0, 1.0])

# The Kafka 0.8 direct stream reads the "auto.offset.reset" key and only
# treats "smallest" as "start from the earliest offset". The original
# "auto_offset_reset": 'earliest' pair is not recognised, so the stream
# silently started from the latest offset.
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto.offset.reset": "smallest"
    })

# v is a (key, value) pair; the value is a JSON-encoded line of numbers.
parsed = directKafkaStream.map(lambda v: loads(v[1]))
parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))
stkm.trainOn(parsed)
# Make the Spark Python sources importable.
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/')
# Append the python/build to PYTHONPATH so that py4j could be found
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip')

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(lp):
    """Parse a '(label, [f1,f2,...]' style line into a LabeledPoint.

    The label is the text between the first '(' and the first ','; the
    features are the comma-separated items between '[' and ']'.
    """
    label_text = lp[lp.find('(') + 1: lp.find(',')]
    feature_text = lp[lp.find('[') + 1: lp.find(']')]
    return LabeledPoint(float(label_text), Vectors.dense(feature_text.split(',')))


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Local StreamingContext (master 'local[4]') with a batch interval of 10.
ssc = StreamingContext(sc, 10)

# Training and test data are read from files dropped into two directories.
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()

testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()

model = StreamingKMeans(k=5, decayFactor=1.0)
model = model.setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)

# Predict on (label, features) pairs and print each batch of results.
labelled_features = testData.map(lambda lp: (lp.label, lp.features))
model.predictOnValues(labelled_features).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
return LabeledPoint(label, vec) trainingData = sc.textFile("/Users/tung/Documents/spark-2.4.3/data/mllib/kmeans_data.txt")\ .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')])) testingData = sc.textFile( "/Users/tung/Documents/spark-2.4.3/data/mllib/streaming_kmeans_data_test.txt" ).map(parse) trainingQueue = [trainingData] testingQueue = [testingData] trainingStream = ssc.queueStream(trainingQueue) testingStream = ssc.queueStream(testingQueue) # We create a model with random clusters and specify the number of clusters to find model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0) # Now register the streams for training and testing and start the job, # printing the predicted cluster assignments on new data points as they arrive. model.trainOn(trainingStream) result = model.predictOnValues( testingStream.map(lambda lp: (lp.label, lp.features))) result.pprint() ssc.start() ssc.stop(stopSparkContext=True, stopGraceFully=True) print("Final centers: " + str(model.latestModel().centers))
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two working threads and a batch
# interval of 1 second.
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# continuous training
# (The original bound this stream to the misspelled name `trainigData`, so
# the later `trainOn(trainingData)` raised NameError.)
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)

# The original first assigned testData from the training directory and then
# immediately overwrote it; only the effective assignment is kept.
# NOTE(review): confirm that LabeledPoint.parse exists in the PySpark
# version in use.
testData = ssc.textFileStream("/testing/data/dir").map(
    lambda s: LabeledPoint.parse(s))

model = StreamingKMeans()\
    .setK(3)\
    .setDecayFactor(1.0)\
    .setRandomCenters(dim=3, weight=0.0, seed=42)

model.trainOn(trainingData)

# predictOnValues expects (key, vector) pairs, not LabeledPoint objects.
prediction = model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features)))

# print(prediction) only showed the DStream object; pprint() registers an
# output operation that prints each batch of predictions.
prediction.pprint()

# Nothing is processed until the streaming context is started.
ssc.start()
ssc.awaitTermination()
return LabeledPoint(label_, vec) #testingData = sc.textFile("data/mllib/streaming_kmeans_data_test.txt").map(parse) #testingData = sc.textFile("streaming_kmeans_data_test.txt").map(parse) testingData = sc.parallelize(train_vec).map(parse_vec) trainingQueue = [trainingData] testingQueue = [testingData] trainingStream = ssc.queueStream(trainingQueue) testingStream = ssc.queueStream(testingQueue) # We create a model with random clusters and specify the number of clusters to find #model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0) model = StreamingKMeans(k=3, decayFactor=1.0).setRandomCenters(100, 1.0, 0) # Now register the streams for training and testing and start the job, # printing the predicted cluster assignments on new data points as they arrive. #model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() #model.trainOn(trainingStream) #print("TEST HERE") #result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features))) model.trainOn(trainingStream) print("TEST HERE") result = model.predictOnValues( testingStream.map(lambda lp: (lp.label, lp.features))) #result.pprint(num = 20) #if result.count() != 0:
sc = SparkContext(master="local[4]", appName="Streaming-KMeans", conf=conf)
ssc = StreamingContext(sc, 5)

# Kafka Stream
ks = KafkaUtils.createDirectStream(
    ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

# Feature columns are everything between the first two and the last field.
trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1])\
    .map(lambda arr: Vectors.dense([float(x) for x in arr]))

# Supplied to Streaming KMeans as the centers by StreamingKmeans are not
# giving good predictions
init_centers = KMeans.train(trainingData, 2).centers

# One weight per center: the original passed five weights (the number of
# feature columns) for two centers.
model = StreamingKMeans(k=2, decayFactor=0.1)\
    .setInitialCenters(init_centers, [1.0] * len(init_centers))

model.trainOn(ssc.queueStream([trainingData]))


def parse(lp):
    """Parse a comma-separated record into a LabeledPoint.

    The first field is the label; fields 2 up to (excluding) the last are
    the features.
    """
    fields = lp.split(',')
    label = fields[0]
    vec = Vectors.dense([float(x) for x in fields[2:-1]])
    return LabeledPoint(label, vec)


# x is a (key, value) Kafka record; only the value is parsed.
test_stream = ks.map(lambda x: x[1]).map(parse)
result = model.predictOnValues(
    test_stream.map(lambda lp: (lp.label, lp.features)))

# Prints predictions and cluster centers
.filter(lambda post: 'created_at' in post)\ .map(lambda post: (get_coord2(post)[0],get_coord2(post)[1],post["text"]))\ .filter(lambda tpl: tpl[0] != 0)\ .filter(lambda tpl: tpl[2] != '')\ .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\ .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2]))) #dstream_tweets.pprint() trainingData = dstream_tweets.map( lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist()) #trainingData.pprint() testdata = dstream_tweets.map(lambda tpl: ( ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist())) #testdata.pprint() # model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3) model.trainOn(trainingData) clust = model.predictOnValues(testdata) #clust.pprint() #words = lines.flatMap(lambda line: line.split(" ")) topic = clust.map(lambda x: (x[1], x[0][1])) #topic.pprint() topicAgg = topic.reduceByKey(lambda x, y: x + y) #wordCollect.pprint() topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint() clust.foreachRDD(lambda time, rdd: q.put(rdd.collect())) # Run! ssc.start() ssc.awaitTermination()
def streaming(self, mnk, clusters, init_clusters):
    """Create the StreamingKMeans estimator and seed it from ``mnk``.

    Stores the three arguments on the instance, builds the estimator from
    ``init_clusters`` and the instance's ``decay_factor`` / ``time_unit``,
    and uses ``mnk.cluster_centers_`` with unit weights as initial centers.
    """
    self.mnk = mnk
    self.clusters = clusters
    self.init_clusters = init_clusters

    estimator = StreamingKMeans(
        self.init_clusters, self.decay_factor, self.time_unit)
    self.streaming_kmeans = estimator

    # One unit weight per initial cluster.
    unit_weights = np.ones([self.init_clusters])
    self.streaming_kmeans.setInitialCenters(
        self.mnk.cluster_centers_, unit_weights)