def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
        .map(lambda x: json.loads(x)) \
        .filter(lambda x: 'text' in x) \
        .map(lambda x: x['text'].encode('utf-8'))

    hasher = HashingTF(DIM)
    features = tweets.map(lambda x: (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i) \
               .map(lambda x: '%i: %s' % (x[1], x[0])) \
               .pprint(3)

    for i in xrange(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
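# A minimal, hypothetical sketch of the decayFactor knob used above (the values
# here are illustrative, not from the original): decayFactor=1.0 weights all
# history equally ("total memory"), decayFactor=0.0 keeps only the latest batch
# ("no memory"), and setHalfLife expresses the same trade-off as the number of
# batches (or points) after which old data contributes half its original weight.
from pyspark.mllib.clustering import StreamingKMeans

full_memory = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(10, 1.0, 0)
no_memory = StreamingKMeans(k=5, decayFactor=0.0).setRandomCenters(10, 1.0, 0)
half_life = StreamingKMeans(k=5).setHalfLife(5, "batches").setRandomCenters(10, 1.0, 0)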
    .filter(lambda tpl: tpl[0] != 0) \
    .filter(lambda tpl: tpl[2] != '') \
    .map(lambda tpl: (tpl[0], tpl[1], tokenize(tpl[2]))) \
    .map(lambda tpl: (tpl[0], tpl[1], tpl[2], doc2vec(tpl[2])))
# dstream_tweets.pprint()

trainingData = dstream_tweets.map(
    lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist())
# trainingData.pprint()

testdata = dstream_tweets.map(lambda tpl: (
    ([tpl[0], tpl[1]], tpl[2]),
    [tpl[0], tpl[1]] + tpl[3].tolist()))
# testdata.pprint()

model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3)
model.trainOn(trainingData)
clust = model.predictOnValues(testdata)
# clust.pprint()

# words = lines.flatMap(lambda line: line.split(" "))
topic = clust.map(lambda x: (x[1], x[0][1]))
# topic.pprint()
topicAgg = topic.reduceByKey(lambda x, y: x + y)
# wordCollect.pprint()
topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint()

clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

# Run!
ssc.start()
ssc.awaitTermination()
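# freqcount is not shown in this snippet; a plausible stand-in, assuming it ranks
# the tokens aggregated per cluster by frequency (the top_n parameter and the
# cutoff of 10 are hypothetical).
from collections import Counter

def freqcount(words, top_n=10):
    # Return the top_n (token, count) pairs for one cluster's aggregated word list.
    return Counter(words).most_common(top_n)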
# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=0.3)  # .setRandomCenters(5, 1.0, 0)
model.setInitialCenters(centers, [1.0, 1.0, 1.0, 1.0, 1.0])

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)


def parse(lp):
    # label = float(lp[lp.find('(') + 1: lp.find(')')])
    # vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    arr = lp.split(',')[2:-1]
    label = lp.split(',')[0]
    label = label[1:-1]
    vec = Vectors.dense([float(x) for x in arr])
    print(model.latestModel().centers)
    return LabeledPoint(label, vec)


testingData = sc.textFile("data/dataset.txt").map(parse)
testingQueue = [testingData]
testingStream = ssc.queueStream(testingQueue)

result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
result.pprint()

ssc.start()
# ssc.stop(stopSparkContext=True, stopGraceFully=True)
ssc.awaitTermination()
# clus = row[0]
# # ptext[clus].set_text(str(clus) + ':' + str([x[0] for x in row[1][1]]))
# ptext[clus].set_text(str(clus) + ':' + str(row[1][1]))
# ptext[clus].set_color(colors[clus])
# plt.pause(0.0001)
#

q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting, args=(q,))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four working threads and a batch interval of 10 seconds
ssc = StreamingContext(sc, 10)

dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData = trainingData.map(lambda x: (x, x))
testData.pprint()

model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)

clust = model.predictOnValues(testData)
clust.pprint()
# print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

ssc.start()
ssc.awaitTermination()
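# data_plotting is defined elsewhere; judging from the commented fragments at the
# top of this snippet it consumes the collected (point, cluster) pairs pushed onto
# the multiprocessing queue and redraws a matplotlib view. A minimal hypothetical
# consumer loop, assuming 2-D points, might look like this:
import matplotlib.pyplot as plt

def data_plotting(queue):
    plt.ion()
    fig, ax = plt.subplots()
    while True:
        batch = queue.get()  # list of (vector, cluster) tuples for one micro-batch
        if not batch:
            continue
        ax.clear()
        xs = [vec[0] for vec, _ in batch]
        ys = [vec[1] for vec, _ in batch]
        cs = [cluster for _, cluster in batch]
        ax.scatter(xs, ys, c=cs)
        plt.pause(0.0001)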
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two working threads and a batch interval of 1 second
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# continuous training
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)
testData = ssc.textFileStream("/testing/data/dir").map(
    lambda s: LabeledPoint.parse(s))

model = StreamingKMeans()\
    .setK(3)\
    .setDecayFactor(1.0)\
    .setRandomCenters(dim=3, weight=0.0, seed=42)

model.trainOn(trainingData)
prediction = model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features)))
prediction.pprint()

ssc.start()
ssc.awaitTermination()
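# Hypothetical usage sketch: once the streaming job is stopped (or from a separate
# thread while it runs), the most recent model state can be inspected.
latest = model.latestModel()
print(latest.clusterCenters)   # one center per cluster
print(latest.clusterWeights)   # running weight of each cluster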
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(lp):
    label = float(lp[lp.find('(') + 1:lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four working threads and a batch interval of 10 seconds
ssc = StreamingContext(sc, 10)

trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()

model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features))).pprint()
# print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
import sys

sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/')
# Append the python/build to PYTHONPATH so that py4j can be found
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip')

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four working threads and a batch interval of 10 seconds
ssc = StreamingContext(sc, 10)

trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()

model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()
# print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(')')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)


trainingData = sc.textFile("spark-2.0.1-bin-hadoop2.7/data/mllib/kmeans_data.txt")\
    .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
trainingQueue = [trainingData]
trainingStream = ssc.queueStream(trainingQueue)
testingStream = ssc.textFileStream('history').map(parse)

# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)

result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
result.pprint()

ssc.start()
ssc.awaitTermination()
# ssc.stop(stopSparkContext=True, stopGraceFully=True)

print("Final centers: " + str(model.latestModel().centers))
tfidf_testing = tfidf_testing.transform(lambda tup: PCA_model.transform(tup))

# ZIPPING EACH INCOMING TWEET WITH A UNIQUE ID
feature_testing_zipped = tfidf_testing.transform(lambda x: x.zipWithUniqueId()) \
    .map(lambda line: (line[1], line[0]))
# .pprint()

# JOINING FEATURE AND LABEL FOR EACH INCOMING TWEET
# (0, (DenseVector([0.9679, 0.6229]), '4'))
feature_and_label = feature_testing_zipped.join(label)

# CREATING LABELED DATA
labeled_data = feature_and_label.map(lambda k: LabeledPoint(k[1][1], k[1][0]))

# PREDICTING THE CLUSTER THE TWEET BELONGS TO
result = model.predictOnValues(
    labeled_data.map(lambda lp: (lp.label, lp.features)))

# ZIPPING EACH RESULT
result_zipped = result.transform(lambda x: x.zipWithUniqueId()) \
    .map(lambda line: (line[1], line[0]))

# PREPARING OUTPUT DATA
# structure ((gt, predicted cluster), (features))
# i.e. (DenseVector([0.9679, 0.6229]), (4.0, 0))
rr = feature_and_label.join(result_zipped) \
    .map(lambda x: (x[1][0][0], x[1][1]) if (pca_mode.value == 1) else (x[1][1])) \
    .foreachRDD(jdbcInsert)

ssc.start()
ssc.awaitTermination()
# ssc.stop()
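# jdbcInsert is defined elsewhere; a minimal, hypothetical sketch of the usual
# foreachRDD sink pattern it presumably follows (driver, table and connection
# details are placeholders, not from the original).
def jdbcInsert(rdd):
    def insert_partition(rows):
        import psycopg2  # assumption: any DB-API driver would do here
        conn = psycopg2.connect("dbname=tweets user=spark")
        cur = conn.cursor()
        for row in rows:
            cur.execute("INSERT INTO predictions VALUES (%s)", (str(row),))
        conn.commit()
        conn.close()
    rdd.foreachPartition(insert_partition)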
# Training uses the offline-built model, i.e. word2vec_model
trainingData = processed_tweets.map(lambda processed_tweet:
    [processed_tweet[0], processed_tweet[1]] + processed_tweet[3].tolist())

# Test data: includes lat, long, the actual tokenized text and the word2vec vector
testdata = processed_tweets.map(lambda processed_tweet:
    (([processed_tweet[0], processed_tweet[1]], processed_tweet[2]),
     [processed_tweet[0], processed_tweet[1]] + processed_tweet[3].tolist()))

print("Training model using streaming k-means")
# Streaming k-means model.
# decayFactor of 0.6 (tunable) controls forgetfulness, i.e. how much weight recent
# data in the stream gets relative to older data.
# For more info see https://databricks.com/blog/2015/01/28/introducing-streaming-k-means-in-spark-1-2.html
model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3)
model.trainOn(trainingData)

print("Clustering feeds based on geo and word/topic similarities....")
clusters = model.predictOnValues(testdata)

print("===========Identified clusters============")
clusters.pprint()

topic = clusters.map(lambda x: (x[1], x[0][1]))
# Aggregate the words used within each cluster, which together form a topic
topicAgg = topic.reduceByKey(lambda x, y: x + y)
popular_words_for_clusters = topicAgg.map(lambda x: (x[0], get_most_popular_words(x[1])))

print("==================Most frequent words/tokens with frequencies================")
popular_words_for_clusters.pprint()

clusters.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
processed_tweets.repartition(1).saveAsTextFiles("clusters.txt")

stream.start()
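# The word2vec document vector in processed_tweet[3] is produced elsewhere from the
# offline word2vec_model; a plausible sketch, assuming it averages per-token vectors
# into one fixed-length document vector (the helper name, the dict-like w2v argument
# and the 100-dim size are assumptions, the 100 inferred from the 102-dimensional
# random centers above: lat + long + 100).
import numpy as np

def doc2vec(tokens, w2v, dim=100):
    # w2v: dict-like mapping token -> numpy vector (e.g. loaded from word2vec_model)
    vecs = [w2v[t] for t in tokens if t in w2v]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)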