def test_model_params(self):
    """Test that the model params are set correctly"""
    stkm = StreamingKMeans()
    stkm.setK(5).setDecayFactor(0.0)
    self.assertEqual(stkm._k, 5)
    self.assertEqual(stkm._decayFactor, 0.0)

    # Model not set yet.
    self.assertIsNone(stkm.latestModel())
    self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

    stkm.setInitialCenters(centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
    self.assertEqual(stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
    self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])
def test_trainOn_model(self):
    """Test the model on toy data with four clusters."""
    stkm = StreamingKMeans()
    initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm.setInitialCenters(centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

    # Create a toy dataset by setting a tiny offset for each point.
    offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
    batches = []
    for offset in offsets:
        batches.append([[offset[0] + center[0], offset[1] + center[1]]
                        for center in initCenters])

    batches = [self.sc.parallelize(batch, 1) for batch in batches]
    input_stream = self.ssc.queueStream(batches)
    stkm.trainOn(input_stream)
    t = time()
    self.ssc.start()

    # Give enough time to train the model.
    self._ssc_wait(t, 6.0, 0.01)
    finalModel = stkm.latestModel()
    self.assertTrue(all(finalModel.centers == array(initCenters)))
    self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
def test_accuracy_for_single_center(self):
    """Test that parameters obtained are correct for a single center."""
    centers, batches = self.streamingKMeansDataGenerator(
        batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
    stkm = StreamingKMeans(1)
    stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
    input_stream = self.ssc.queueStream(
        [self.sc.parallelize(batch, 1) for batch in batches])
    stkm.trainOn(input_stream)

    self.ssc.start()

    def condition():
        self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
        return True

    eventually(condition, catch_assertions=True)

    realCenters = array_sum(array(centers), axis=0)
    for i in range(5):
        modelCenters = stkm.latestModel().centers[0][i]
        self.assertAlmostEqual(centers[0][i], modelCenters, 1)
        self.assertAlmostEqual(realCenters[i], modelCenters, 1)
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans


def perform_training(sc: SparkContext, params_dict: dict):
    batch_duration = params_dict.get('batch_duration', 1)
    training_duration = params_dict.get('training_duration', 20)

    ssc = StreamingContext(sc, batch_duration)

    topics = ['normal-ekg-stream']
    kafka_params = {'metadata.broker.list': 'localhost:9092'}
    kvs = KafkaUtils.createDirectStream(
        ssc, topics, kafkaParams=kafka_params,
        valueDecoder=lambda val: json.loads(val.decode('utf-8')))

    windowed_signal = kvs.map(lambda msg: Vectors.dense(
        [float(value) for value in msg[1]['signal_values']]))
    # windowed_signal.foreachRDD(Plotter.plot_signal_window)

    model = StreamingKMeans(k=20, decayFactor=1.0).setRandomCenters(188, 1.0, 0)
    model.trainOn(windowed_signal)

    ssc.start()
    ssc.awaitTerminationOrTimeout(training_duration)
    ssc.stop(stopSparkContext=False, stopGraceFully=True)

    return model.latestModel()
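# Illustrative usage sketch for perform_training above; the app name and parameter
# values here are assumptions, not taken from the original code.
from pyspark import SparkContext

sc = SparkContext(appName="ekg-streaming-kmeans")
latest_model = perform_training(sc, {'batch_duration': 1, 'training_duration': 20})
print("Trained centers: " + str(latest_model.centers))
sc.stop()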
def test_accuracy_for_single_center(self):
    """Test that parameters obtained are correct for a single center."""
    centers, batches = self.streamingKMeansDataGenerator(
        batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
    stkm = StreamingKMeans(1)
    stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
    input_stream = self.ssc.queueStream(
        [self.sc.parallelize(batch, 1) for batch in batches])
    stkm.trainOn(input_stream)
    t = time()
    self.ssc.start()
    self._ssc_wait(t, 10.0, 0.01)
    self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
    realCenters = array_sum(array(centers), axis=0)
    for i in range(5):
        modelCenters = stkm.latestModel().centers[0][i]
        self.assertAlmostEqual(centers[0][i], modelCenters, 1)
        self.assertAlmostEqual(realCenters[i], modelCenters, 1)
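# A minimal sketch of the streamingKMeansDataGenerator helper the accuracy tests
# above assume (a method on the same test class): draw k random centers in d
# dimensions, then emit `batches` batches of `numPoints` points scattered around
# those centers with radius r. This is an illustrative reconstruction, not
# necessarily the exact helper from the original suite.
from numpy.random import RandomState
from pyspark.mllib.linalg import Vectors


def streamingKMeansDataGenerator(self, batches, numPoints, k, d, r, seed):
    rng = RandomState(seed)
    # Generate k random centers in d dimensions.
    centers = [rng.randn(d) for _ in range(k)]
    # Each batch holds numPoints points, each offset from a center by r * noise.
    return centers, [[Vectors.dense(centers[j % k] + r * rng.randn(d))
                      for j in range(numPoints)]
                     for _ in range(batches)]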
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("chkpfile")

    def parserData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("/files").map(parserData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    print("Initial Centres: " + str(model.latestModel().centers))

    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(sc, model):
        print(str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centres, (sc, model))

    s.enter(10, 1, print_cluster_centres, (sc, model))
    s.run()
    ssc.awaitTermination()

    # to make this work
    # spark-submit "C:\SaiStudy - LEarn It All - Version9\Saistudy - split-csv.py"
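# parserData above expects each line under /files to contain two comma-separated
# floats. Illustrative only: write a tiny sample file in that format (the values
# and filename are assumptions, not taken from the original script).
with open("sample_points.csv", "w") as f:
    f.write("0.5,1.2\n-3.1,0.7\n2.4,-0.9\n")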
# Leftover from a commented-out plotting callback:
#     ptext[clus].set_text(str(clus) + ':' + str(row[1][1]))
#     ptext[clus].set_color(colors[clus])
#     plt.pause(0.0001)

q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting, args=(q,))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with a 10-second batch interval
ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)

trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData = trainingData.map(lambda x: (x, x))
testData.pprint()

model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)

clust = model.predictOnValues(testData)
clust.pprint()
# print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

ssc.start()
ssc.awaitTermination()
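# Vectors.parse, used on the socket stream above, accepts MLlib vector literals.
# A quick illustrative check (the sample values are assumptions):
from pyspark.mllib.linalg import Vectors

print(Vectors.parse("[0.4,-1.2]"))            # dense vector
print(Vectors.parse("(3,[0,2],[1.0,3.0])"))   # sparse vector: size, indices, values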
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(')')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)

trainingData = sc.textFile("/Users/tung/Documents/spark-2.4.3/data/mllib/kmeans_data.txt")\
    .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
testingData = sc.textFile(
    "/Users/tung/Documents/spark-2.4.3/data/mllib/streaming_kmeans_data_test.txt"
).map(parse)

trainingQueue = [trainingData]
testingQueue = [testingData]

trainingStream = ssc.queueStream(trainingQueue)
testingStream = ssc.queueStream(testingQueue)

# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)
result = model.predictOnValues(
    testingStream.map(lambda lp: (lp.label, lp.features)))
result.pprint()

ssc.start()
ssc.stop(stopSparkContext=True, stopGraceFully=True)

print("Final centers: " + str(model.latestModel().centers))
ssc = StreamingContext(sc, 10)
ssc.checkpoint("file:///tmp/spark")

def parseTrainingData(line):
    cell = line.split(",")
    return Vectors.dense([float(cell[0]), float(cell[1])])

trainingStream = ssc.textFileStream(":/tweets/training")\
    .map(parseTrainingData)
trainingStream.pprint()

model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
print("Initial centers: " + str(model.latestModel().centers))

model.trainOn(trainingStream)
ssc.start()

s = sched.scheduler(time.time, time.sleep)

def print_cluster_centers(sc, model):
    print("Cluster centers: " + str(model.latestModel().centers))
    s.enter(10, 1, print_cluster_centers, (sc, model))

s.enter(10, 1, print_cluster_centers, (sc, model))
s.run()

ssc.awaitTermination()
ssc = StreamingContext(sc, 10)
ssc.checkpoint("file:///tmp/spark")

def parseTrainingData(line):
    cells = line.split(",")
    return Vectors.dense([float(cells[0]), float(cells[1])])

trainingStream = ssc.textFileStream("file:///Users/jananiravi/spark/spark-2.1.0-bin-without-hadoop/tweets/training")\
    .map(parseTrainingData)
trainingStream.pprint()

model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
print("Initial centers: " + str(model.latestModel().centers))

model.trainOn(trainingStream)
ssc.start()

s = sched.scheduler(time.time, time.sleep)

def print_cluster_centers(sc, model):
    print("Cluster centers: " + str(model.latestModel().centers))
    s.enter(10, 1, print_cluster_centers, (sc, model))

s.enter(10, 1, print_cluster_centers, (sc, model))
s.run()

ssc.awaitTermination()
training_stream = ssc.textFileStream(
    'file:///home/andrey/Documentos/Projects/streaming-k-means/training/')\
    .map(parse_training_data)

# Initialize streaming k-means to run over the data added to the streaming directory.
# k=2: number of clusters the dataset will be split into
# decayFactor=1.0: all data, from the beginning, remain relevant.
# 0.0: only the most recent data are used.
# k-means needs random cluster centers to start the process:
# 2: number of centers to set
# 1.0 and 0: weight and seed
model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

# Print the centers.
print('Initial centers: ' + str(model.latestModel().centers))

# Train the model.
model.trainOn(training_stream)

# Start the stream.
ssc.start()

# Schedule printing of the center values at periodic intervals.
s = sched.scheduler(time.time, time.sleep)

# Function that prints the centers recursively, every 10s.
def print_cluster_centers(sc, model):
    print('Cluster centers: ' + str(model.latestModel().centers))
    s.enter(10, 1, print_cluster_centers, (sc, model))
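# The comments above explain decayFactor (1.0 keeps all history relevant, 0.0 uses
# only the newest batch). An alternative, shown here as a brief illustrative sketch,
# is to express forgetting as a half-life measured in batches:
from pyspark.mllib.clustering import StreamingKMeans

# The weight of past data is halved every 5 batches.
half_life_model = StreamingKMeans(k=2).setHalfLife(5, "batches").setRandomCenters(2, 1.0, 0)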
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(')')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)

trainingData = sc.textFile("spark-2.0.1-bin-hadoop2.7/data/mllib/kmeans_data.txt")\
    .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

trainingQueue = [trainingData]
trainingStream = ssc.queueStream(trainingQueue)
testingStream = ssc.textFileStream('history').map(parse)

# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)
result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
result.pprint()

ssc.start()
ssc.awaitTermination()
# ssc.stop(stopSparkContext=True, stopGraceFully=True)

print("Final centers: " + str(model.latestModel().centers))
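# For reference, the two input formats consumed by this example and the similar one
# earlier. These sample lines are inferred from the parsing code above, not copied
# from the actual data files:
#
#   kmeans_data.txt (training)  -> space-separated features, e.g. "0.0 0.0 0.0"
#   parse()-style test data     -> "(label), [x1, x2, x3]", e.g. "(1.0), [1.7, 0.4, 0.9]"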
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("/tmp/spark")

    def parseTrainingData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("./training")\
        .map(parseTrainingData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    print("Initial centers: {0}".format(model.latestModel().centers))

    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centers(sc, model):
        print("Cluster centers: {0}".format(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (sc, model))
    s.run()
    ssc.awaitTermination()