def test_predictOn_model(self):
    """Test that the model predicts correctly on toy data."""
    stkm = StreamingKMeans()
    stkm._model = StreamingKMeansModel(
        clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
    predict_stream = self.ssc.queueStream(predict_data)
    predict_val = stkm.predictOn(predict_stream)

    result = []

    def update(rdd):
        rdd_collect = rdd.collect()
        if rdd_collect:
            result.append(rdd_collect)

    predict_val.foreachRDD(update)
    self.ssc.start()

    def condition():
        self.assertEqual(result, [[0], [1], [2], [3]])
        return True

    eventually(condition, catch_assertions=True)
def test_accuracy_for_single_center(self):
    """Test that parameters obtained are correct for a single center."""
    centers, batches = self.streamingKMeansDataGenerator(
        batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
    stkm = StreamingKMeans(1)
    stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
    input_stream = self.ssc.queueStream(
        [self.sc.parallelize(batch, 1) for batch in batches])
    stkm.trainOn(input_stream)

    self.ssc.start()

    def condition():
        self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
        return True

    eventually(condition, catch_assertions=True)

    realCenters = array_sum(array(centers), axis=0)
    for i in range(5):
        modelCenters = stkm.latestModel().centers[0][i]
        self.assertAlmostEqual(centers[0][i], modelCenters, 1)
        self.assertAlmostEqual(realCenters[i], modelCenters, 1)
def perform_training(sc: SparkContext, params_dict: dict):
    batch_duration = params_dict.get('batch_duration', 1)
    training_duration = params_dict.get('training_duration', 20)

    ssc = StreamingContext(sc, batch_duration)
    topics = ['normal-ekg-stream']
    kafka_params = {'metadata.broker.list': 'localhost:9092'}
    kvs = KafkaUtils.createDirectStream(
        ssc, topics, kafkaParams=kafka_params,
        valueDecoder=lambda val: json.loads(val.decode('utf-8')))
    windowed_signal = kvs.map(lambda msg: Vectors.dense(
        [float(value) for value in msg[1]['signal_values']]))
    # windowed_signal.foreachRDD(Plotter.plot_signal_window)

    model = StreamingKMeans(k=20, decayFactor=1.0).setRandomCenters(188, 1.0, 0)
    model.trainOn(windowed_signal)

    ssc.start()
    ssc.awaitTerminationOrTimeout(training_duration)
    ssc.stop(stopSparkContext=False, stopGraceFully=True)
    return model.latestModel()
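# Usage sketch for perform_training above (a hypothetical illustration; the
# app name and parameter values are assumed, not from the source):
from pyspark import SparkContext

sc = SparkContext(appName="ekg-streaming-kmeans")
final_model = perform_training(sc, {'batch_duration': 1, 'training_duration': 20})
print(final_model.centers)  # 20 centers of dimension 188, per the setRandomCenters call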
def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)
    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    features = tweets.map(lambda x: (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find.
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i) \
               .map(lambda x: '%i: %s' % (x[1], x[0])).pprint(3)

    for i in range(N):
        print_group(i)
    ssc.start()
    ssc.awaitTermination()
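# Side sketch of the featurization step above (illustrative values; DIM and
# featurize come from the surrounding script and are assumed there):
from pyspark.mllib.feature import HashingTF

hasher = HashingTF(numFeatures=1000)
vec = hasher.transform(["spark", "streaming", "spark"])
print(vec)  # a SparseVector with a term frequency of 2.0 in the bucket for "spark"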
def test_trainOn_predictOn(self):
    """Test that prediction happens on the updated model."""
    stkm = StreamingKMeans(decayFactor=0.0, k=2)
    stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

    # Since the decay factor is set to zero, once the first batch
    # is passed the clusterCenters are updated to [-0.5, 0.7],
    # which causes 0.2 and 0.3 to be classified as 1, even though the
    # classification based on the initial model would have been 0,
    # proving that the model is updated.
    batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
    batches = [self.sc.parallelize(batch) for batch in batches]
    input_stream = self.ssc.queueStream(batches)
    predict_results = []

    def collect(rdd):
        rdd_collect = rdd.collect()
        if rdd_collect:
            predict_results.append(rdd_collect)

    stkm.trainOn(input_stream)
    predict_stream = stkm.predictOn(input_stream)
    predict_stream.foreachRDD(collect)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 6.0, 0.01)
    self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
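# The decay behaviour asserted above can also be checked without a
# StreamingContext by driving StreamingKMeansModel.update directly. A minimal
# sketch, assuming a live SparkContext named sc: with decayFactor=0.0 the
# updated centers depend only on the latest batch, so the initial centers
# [0.0] and [1.0] are forgotten entirely.
from pyspark.mllib.clustering import StreamingKMeansModel

model = StreamingKMeansModel(clusterCenters=[[0.0], [1.0]],
                             clusterWeights=[1.0, 1.0])
batch = sc.parallelize([[-0.5], [0.6], [0.8]])
model = model.update(batch, decayFactor=0.0, timeUnit="batches")
print(model.centers)  # approximately [[-0.5], [0.7]], matching the comment above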
def test_trainOn_model(self):
    """Test the model on toy data with four clusters."""
    stkm = StreamingKMeans()
    initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm.setInitialCenters(centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

    # Create a toy dataset by setting a tiny offset for each point.
    offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
    batches = []
    for offset in offsets:
        batches.append([[offset[0] + center[0], offset[1] + center[1]]
                        for center in initCenters])

    batches = [self.sc.parallelize(batch, 1) for batch in batches]
    input_stream = self.ssc.queueStream(batches)
    stkm.trainOn(input_stream)
    t = time()
    self.ssc.start()

    # Give enough time to train the model.
    self._ssc_wait(t, 6.0, 0.01)
    finalModel = stkm.latestModel()
    self.assertTrue(all(finalModel.centers == array(initCenters)))
    self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
def test_model_params(self):
    """Test that the model params are set correctly."""
    stkm = StreamingKMeans()
    stkm.setK(5).setDecayFactor(0.0)
    self.assertEqual(stkm._k, 5)
    self.assertEqual(stkm._decayFactor, 0.0)

    # Model not set yet.
    self.assertIsNone(stkm.latestModel())
    self.assertRaises(ValueError, stkm.trainOn, [0.0, 1.0])

    stkm.setInitialCenters(centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
    self.assertEqual(stkm.latestModel().centers, [[0.0, 0.0], [1.0, 1.0]])
    self.assertEqual(stkm.latestModel().clusterWeights, [1.0, 1.0])
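# A related configuration sketch (not part of the original test): the decay
# can also be expressed as a half-life, which StreamingKMeans converts into an
# equivalent decay factor internally. Values here are illustrative.
from pyspark.mllib.clustering import StreamingKMeans

stkm = StreamingKMeans(k=5)
stkm.setHalfLife(halfLife=2.0, timeUnit="batches")  # half the past weight remains after 2 batches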
def detect(self, k, t):
    # Encode categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Cluster points using KMeans.
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = StreamingKMeans(k=7, decayFactor=1.0).setRandomCenters(4, 1.0, 0)
    # model = KMeans.train(features, k, maxIterations=40, runs=10,
    #                      initializationMode="random", seed=20)

    # Add the prediction column to df1. Broadcast the current model snapshot,
    # since the StreamingKMeans wrapper itself has no predict method.
    modelBC = sc.broadcast(model.latestModel())
    predictUDF = udf(lambda x: str(modelBC.value.predict(x)), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Add the score column to df2; the higher the score, the more likely the
    # point is an anomaly.
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1])\
    .map(lambda arr: Vectors.dense([float(x) for x in arr]))

centers = KMeans.train(trainingData, 2).centers
trainingQueue = [trainingData]
trainingStream = ssc.queueStream(trainingQueue)

# We create a model seeded with offline-trained centers and specify the number of clusters to find.
model = StreamingKMeans(k=2, decayFactor=0.3)  # .setRandomCenters(5, 1.0, 0)
# One initial weight per cluster center (k=2).
model.setInitialCenters(centers, [1.0, 1.0])

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)

def parse(lp):
    arr = lp.split(',')[2:-1]
    label = lp.split(',')[0]
    label = label[1:-1]  # strip the surrounding quotes
    vec = Vectors.dense([float(x) for x in arr])
    print(model.latestModel().centers)
    return LabeledPoint(label, vec)
sc = SparkContext(appName="StreamingKMeans")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("/tmp/checkpoints/")

initialCenters = [[604328, 574379], [801908, 318382], [416383, 786204],
                  [822771, 732034], [850993, 157873], [338586, 563537],
                  [169274, 348574], [619259, 397671], [241071, 844424],
                  [321801, 165319], [139493, 557352], [508785, 174800],
                  [398934, 404142], [860858, 546059], [674365, 860464]]
initialWeights = [1.0] * len(initialCenters)

# k arrives as a command-line string and must be cast to int.
stkm = StreamingKMeans(k=int(sys.argv[1]), decayFactor=1.0).setInitialCenters(
    initialCenters, initialWeights)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'],
    {"metadata.broker.list": "localhost:9092",
     "auto_offset_reset": 'earliest'})
parsed = directKafkaStream.map(lambda v: loads(v[1]))
parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))
stkm.trainOn(parsed)
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("chkpfile")

    def parserData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("/files").map(parserData)
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    print("Initial centres: " + str(model.latestModel().centers))
    model.trainOn(trainingStream)
    ssc.start()

    # Periodically print the cluster centres while the stream runs.
    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(s, model):
        print(str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centres, (s, model))

    s.enter(10, 1, print_cluster_centres, (s, model))
    s.run()
    ssc.awaitTermination()
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two worker threads and a batch interval of 1 second.
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# Continuous training and testing streams.
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)
testData = ssc.textFileStream("/testing/data/dir").map(
    lambda s: LabeledPoint.parse(s))

model = StreamingKMeans()\
    .setK(3)\
    .setDecayFactor(1.0)\
    .setRandomCenters(dim=3, weight=0.0, seed=42)

model.trainOn(trainingData)
prediction = model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features)))
prediction.pprint()

ssc.start()
ssc.awaitTermination()
for center in file:
    # Parse each line of the centers file into a list of floats.
    initialCenters.append([float(x) for x in center.split()])
logging.info(initialCenters)

initialWeights = [1.0 for _ in initialCenters]

config = sc.broadcast(parameters)
numberClusters = config.value[0]
mongoIP = config.value[1]
mongoDataBase = config.value[2]
mongoCollection = config.value[3]

stkm = StreamingKMeans(k=int(numberClusters), decayFactor=1.0).setInitialCenters(
    initialCenters, initialWeights)
# stkm = StreamingKMeans(k=int(numberClusters), decayFactor=1.0).setRandomCenters(2, 1.0, 100)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'],
    {"metadata.broker.list": "localhost:9092",
     "auto_offset_reset": 'earliest'})
parsed = directKafkaStream.map(lambda v: loads(v[1]))
parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))
stkm.trainOn(parsed)
print('Applying PCA on training data...')
PCA_model = PCA(low_dim).fit(tfidf_training)
tfidf_training = PCA_model.transform(tfidf_training)
k = low_dim
# pcArray = model.transform(tfidf_training.first()).toArray()

# setting checkpoint
# ssc.checkpoint("/Users/davidenardone/Desktop/checkpoint")

# CREATING DStream FROM TRAINING'S RDD
trainingQueue = [tfidf_training]
trainingStream = ssc.queueStream(trainingQueue)

# CREATING A K-MEANS MODEL WITH RANDOM CLUSTERS, SPECIFYING THE NUMBER OF CLUSTERS TO FIND
model = StreamingKMeans(k=2, decayFactor=1.0,
                        timeUnit='batches').setRandomCenters(k, 1.0, 0)
# print("K centers: " + str(model.latestModel().centers))

# TRAINING THE MODEL ON THE TRAINING TWEETS' DATA
print('Training K-means Model...')
model.trainOn(trainingStream)
print('done!')

# CREATE DIRECT KAFKA STREAM WITH BROKERS AND TOPICS
streamData = KafkaUtils.createDirectStream(
    ssc, [kafka_topic], {"metadata.broker.list": kafka_brokers})

######### FROM NOW ON, EACH ACTION OR TRANSFORMATION IS DONE ON A SINGLE INCOMING BATCH OF TWEETS #########

# PRE-PROCESSING TWEETS DATA (TESTING)
os.makedirs("results") except: pass output_file = open(RESULT_FILE, "w") start = time.time() #output_file.write("Measurement,Number_Partitions, Time\n") #output_file.write("Spark Startup, %s, %.5f\n"%(NUMBER_PARTITIONS, time.time()-start)) #output_file.flush() ####################################################################################### decayFactor = 1.0 timeUnit = "batches" model = StreamingKMeans(k=10, decayFactor=decayFactor, timeUnit=timeUnit).setRandomCenters(3, 1.0, 0) #def printOffsetRanges(rdd): # for o in offsetRanges: # print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset) def count_records(rdd): print str(type(rdd)) if rdd != None: return rdd.collect() return [0] ## OK
sc = SparkContext(master="local[4]", appName="Streaming-KMeans", conf=conf) ssc = StreamingContext(sc, 5) # Kafka Stream ks = KafkaUtils.createDirectStream( ssc, ["test"], {"metadata.broker.list": "localhost:9092"}) trainingData = sc.textFile("data/datatraining.txt")\ .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr])) # Supplied to Streaming KMeans as the centers by StreamingKmeans are not giving good predictions init_centers = KMeans.train(trainingData, 2).centers model = StreamingKMeans(k=2, decayFactor=0.1)\ .setInitialCenters(init_centers, [1.0, 1.0, 1.0, 1.0, 1.0]) model.trainOn(ssc.queueStream([trainingData])) def parse(lp): arr = lp.split(',')[2:-1] label = lp.split(',')[0] vec = Vectors.dense([float(x) for x in arr]) return LabeledPoint(label, vec) test_stream = ks.map(lambda x: x[1]).map(parse) result = model.predictOnValues( test_stream.map(lambda lp: (lp.label, lp.features))) # Prints Prediction Prediction and Cluster Centers
    .filter(lambda post: 'created_at' in post)\
    .map(lambda post: (get_coord2(post)[0], get_coord2(post)[1], post["text"]))\
    .filter(lambda tpl: tpl[0] != 0)\
    .filter(lambda tpl: tpl[2] != '')\
    .map(lambda tpl: (tpl[0], tpl[1], tokenize(tpl[2])))\
    .map(lambda tpl: (tpl[0], tpl[1], tpl[2], doc2vec(tpl[2])))
# dstream_tweets.pprint()

trainingData = dstream_tweets.map(
    lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist())
# trainingData.pprint()
testdata = dstream_tweets.map(lambda tpl: (
    ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist()))
# testdata.pprint()

model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3)
model.trainOn(trainingData)
clust = model.predictOnValues(testdata)
# clust.pprint()

topic = clust.map(lambda x: (x[1], x[0][1]))
# topic.pprint()
topicAgg = topic.reduceByKey(lambda x, y: x + y)
topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint()

clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

# Run!
ssc.start()
ssc.awaitTermination()
def streaming(self, mnk, clusters, init_clusters):
    self.mnk = mnk
    self.clusters = clusters
    self.init_clusters = init_clusters
    # Seed the streaming model with the centers of the pre-fitted model
    # (mnk exposes cluster_centers_, e.g. a scikit-learn k-means estimator),
    # giving each center an initial unit weight.
    self.streaming_kmeans = StreamingKMeans(
        self.init_clusters, self.decay_factor, self.time_unit)
    self.streaming_kmeans.setInitialCenters(
        self.mnk.cluster_centers_, np.ones([self.init_clusters]))