Example #1
import json

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.clustering import StreamingKMeans

# PORT, DIM, N and the featurize() tokenizer are assumed to be defined elsewhere in this module.
def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    features = tweets.map(lambda x:
                          (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i).map(lambda x: '%i: %s' %
                                                (x[1], x[0])).pprint(3)

    for i in xrange(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
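Example #1 leaves PORT, DIM, N and the featurize() helper to the rest of its module. A minimal, purely illustrative sketch of what they might look like (the names are kept, but the values and the tokenization are assumptions, not taken from the original project):

import re

PORT = 9999   # assumed socket port for the tweet stream
DIM = 1000    # assumed HashingTF feature dimension
N = 5         # assumed number of clusters

def featurize(text):
    # naive tokenizer: lowercase word-like tokens, hashtags and mentions
    return re.findall(r"[#@]?\w+", text.lower())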
Example #2
    # The beginning of this snippet is not shown: dstream_tweets (apparently a DStream of
    # (lat, long, text) tuples), clusterNum, tokenize(), doc2vec(), freqcount() and the
    # multiprocessing queue q are defined earlier in the original file.
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()

    trainingData = dstream_tweets.map(
        lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist())
    #trainingData.pprint()
    testdata = dstream_tweets.map(lambda tpl: (
        ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist()))
    #testdata.pprint()
    #
    model = StreamingKMeans(k=clusterNum,
                            decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust = model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    topic = clust.map(lambda x: (x[1], x[0][1]))
    #topic.pprint()
    topicAgg = topic.reduceByKey(lambda x, y: x + y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint()

    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()
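Example #2 relies on project helpers that are not shown. As an illustration only, the freqcount() used to summarize each cluster's aggregated tokens could be a simple frequency counter:

from collections import Counter

def freqcount(tokens, top_n=10):
    # Illustrative sketch, not from the original project: return the top_n
    # (token, count) pairs for one cluster's aggregated token list.
    return Counter(tokens).most_common(top_n)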
Example #3
File: example.py Project: AAB94/RP
# sc, ssc, centers and trainingStream are assumed to be defined earlier in this file.
# We create a model, seed it with explicit initial centers, and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=0.3)  # .setRandomCenters(5, 1.0, 0)
model.setInitialCenters(centers, [1.0, 1.0, 1.0, 1.0, 1.0])
# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
model.trainOn(trainingStream)

def parse(lp):
    #label = float(lp[lp.find('(') + 1: lp.find(')')])
    #vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    arr = lp.split(',')[2:-1]
    label = lp.split(',')[0]
    label = label[1:-1]
    vec = Vectors.dense([float(x) for x in arr])
    print(model.latestModel().centers)
    return LabeledPoint(label, vec)


testingData = sc.textFile("data/dataset.txt").map(parse)
testingQueue = [testingData]

testingStream = ssc.queueStream(testingQueue)

result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
result.pprint()

ssc.start()
#ssc.stop(stopSparkContext=True, stopGraceFully=True)
ssc.awaitTermination()
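Note that setInitialCenters(centers, weights) expects one center vector and one weight per cluster, so for k=2 with 5-dimensional features (as the commented-out setRandomCenters(5, 1.0, 0) suggests) the call would look something like the following; the values are illustrative only:

# Illustrative only: two 5-dimensional starting centers and one weight per center.
centers = [[0.0, 0.0, 0.0, 0.0, 0.0],
           [1.0, 1.0, 1.0, 1.0, 1.0]]
model.setInitialCenters(centers, [1.0, 1.0])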
Example #4
import multiprocessing

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

# data_plotting() and clusterNum are assumed to be defined elsewhere in this file.
q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting, args=(q,))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData=trainingData.map(lambda x: (x,x))
testData.pprint()
model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)
clust=model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()
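The data_plotting() worker that consumes the collected cluster assignments from q is not shown. A minimal placeholder that simply drains and prints the queue might look like this (illustrative only; the original project appears to plot the points instead):

def data_plotting(queue):
    # Illustrative placeholder, not the original implementation: each item on the
    # queue is the list produced by rdd.collect() in the foreachRDD call above.
    while True:
        batch = queue.get()
        if batch:
            print("received %d clustered points, e.g. %s" % (len(batch), batch[0]))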
Example #5
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two worker threads and a batch interval of 1 second
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

def parse(lp):
    # test lines look like "(label,[f1,f2,f3])"
    label = float(lp[lp.find('(') + 1: lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)


# continuous training
trainingData = ssc.textFileStream("/training/data/dir").map(Vectors.parse)
testData = ssc.textFileStream("/testing/data/dir").map(parse)

model = StreamingKMeans()\
        .setK(3)\
        .setDecayFactor(1.0)\
        .setRandomCenters(dim=3, weight=0.0, seed=42)

model.trainOn(trainingData)
prediction = model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features)))
prediction.pprint()

ssc.start()
ssc.awaitTermination()
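For reference, the two file streams above expect plain-text lines in the MLlib string formats; the sample lines below are illustrative, not taken from any project data (parse is the helper defined just above):

from pyspark.mllib.linalg import Vectors

print(Vectors.parse("[0.0,1.0,2.0]"))    # parses a training line into a DenseVector
print(parse("(1.0,[0.5,1.5,2.5])"))      # parses a testing line into a LabeledPoint with label 1.0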
Example #6
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(lp):
    label = float(lp[lp.find('(') + 1:lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()
model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(
    testData.map(lambda lp: (lp.label, lp.features))).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
Example #7
import sys

# Append the Spark Python directory to PYTHONPATH so that pyspark can be found
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/')
# Append the python/build to PYTHONPATH so that py4j could be found
sys.path.append('/usr/local/Cellar/apache-spark/spark-1.5.2-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip')
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans
def parse(lp):
    label = float(lp[lp.find('(') + 1: lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
    return LabeledPoint(label, vec)


sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
trainingData = ssc.textFileStream("./training/").map(Vectors.parse)
trainingData.pprint()
testData = ssc.textFileStream("./testing/").map(parse)
testData.pprint()
model = StreamingKMeans(k=5, decayFactor=1.0).setRandomCenters(3, 1.0, 0)
model.trainOn(trainingData)
model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))).pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))

ssc.start()
ssc.awaitTermination()
Example #8
    def parse(lp):
        label = float(lp[lp.find('(') + 1: lp.find(')')])
        vec = Vectors.dense(lp[lp.find('[') + 1: lp.find(']')].split(','))
        return LabeledPoint(label, vec)

    trainingData = sc.textFile("spark-2.0.1-bin-hadoop2.7/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))


    trainingQueue = [trainingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.textFileStream('history').map(parse)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
    # printing the predicted cluster assignments on new data points as they arrive.
    model.trainOn(trainingStream)

    result = model.predictOnValues(testingStream.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    ssc.awaitTermination()
    #ssc.stop(stopSparkContext=True, stopGraceFully=True)
    # $example off$

    print("Final centers: " + str(model.latestModel().centers))
    # (tfidf_testing, PCA_model, label, model, pca_mode, jdbcInsert and ssc are
    # defined earlier in the original file and are not shown here.)
    tfidf_testing = tfidf_testing.transform(lambda tup: PCA_model.transform(tup))

    # ZIPPING EACH INCOMING TWEET WITH UNIQUE ID
    feature_testing_zipped = tfidf_testing.transform(lambda x: x.zipWithUniqueId()) \
                    .map(lambda line: (line[1], line[0]))
                    # .pprint()

    # JOINING FEATURE AND LABEL FOR EACH INCOMING TWEET
    # (0, (DenseVector([0.9679, 0.6229]), '4'))
    feature_and_label = feature_testing_zipped.join(label)

    # CREATING LABELING DATA
    labeled_data = feature_and_label.map(lambda k: LabeledPoint(k[1][1], k[1][0]))

    # PREDICTING THE CLUSTER WHERE THE TWEET BELONG TO
    result = model.predictOnValues(
        labeled_data.map(lambda lp: (lp.label, lp.features)))

    # ZIPPING EACH RESULT
    result_zipped = result.transform(lambda x: x.zipWithUniqueId()) \
                        .map(lambda line: (line[1], line[0]))

    # PREPARING OUTPUT DATA
    # structure ( (gt, cluster predicted),(features))
    # i.e. (DenseVector([0.9679, 0.6229]), (4.0, 0))
    rr = feature_and_label.join(result_zipped)\
                            .map(lambda x: (x[1][0][0], x[1][1]) if (pca_mode.value == 1) else (x[1][1])) \
                            .foreachRDD(jdbcInsert)

    ssc.start()
    ssc.awaitTermination()
    # ssc.stop()
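The jdbcInsert sink registered with foreachRDD above is not shown. A generic sketch of such a sink follows; the two-argument foreachRDD signature, the get_connection() helper, and the table layout are all assumptions, not the project's actual code:

def jdbcInsert(time, rdd):
    # Hypothetical sketch: write each partition of the RDD through a DB-API connection.
    def insert_partition(rows):
        conn = get_connection()   # placeholder connection factory
        cur = conn.cursor()
        for row in rows:
            cur.execute("INSERT INTO tweet_clusters (batch_time, payload) VALUES (%s, %s)",
                        (str(time), str(row)))
        conn.commit()
        conn.close()
    rdd.foreachPartition(insert_partition)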
    # processed_tweets (a DStream of (lat, long, tokens, word2vec vector) tuples),
    # clusterNum, q and get_most_popular_words are defined earlier in the original file.
    # Training uses the offline-built model, i.e. word2vec_model
    trainingData = processed_tweets.map(
        lambda processed_tweet: [processed_tweet[0], processed_tweet[1]]
        + processed_tweet[3].tolist())

    # Test data: includes lat, long, the actual tokenized text and the word2vec vector
    testdata = processed_tweets.map(
        lambda processed_tweet: (([processed_tweet[0], processed_tweet[1]], processed_tweet[2]),
                                 [processed_tweet[0], processed_tweet[1]]
                                 + processed_tweet[3].tolist()))

    print("Training model using streaming-k means")
    # Streaming k means model
    # Using decayFactor of 0.6 - can be changed. This is for forgetfullness of data, ie importance to data in stram based on recency
    # refer to this blog for mode info - https://databricks.com/blog/2015/01/28/introducing-streaming-k-means-in-spark-1-2.html
    model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)

    print("Clustering feeds based on geo and word/topic simliarities....")
    clusters=model.predictOnValues(testdata)
    print("===========Identified clusters============")
    print(clusters.pprint())
    topic=clusters.map(lambda x: (x[1],x[0][1]))
    # Aggregate based on words used in clusters which forms a topic
    topicAgg = topic.reduceByKey(lambda x,y: x+y)
    popular_words_for_clusters=topicAgg.map(lambda x: (x[0],get_most_popular_words(x[1])))
    print("==================Most frequent words/tokens with frequencies================")
    print(popular_words_for_clusters.pprint())

    
    clusters.foreachRDD(lambda time, rdd: q.put(rdd.collect()))


    processed_tweets.repartition(1).saveAsTextFiles("clusters.txt")
    stream.start()
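Finally, the doc2vec()-style step that produces the 100-dimensional tweet vector appended to (lat, long) above is not shown. A common approach, sketched here purely as an illustration (word2vec_model is assumed to map tokens to 100-dimensional vectors, e.g. gensim word vectors), is to average the word vectors of a tweet's tokens:

import numpy as np

def doc2vec(tokens):
    # Illustrative sketch, not the project's implementation: average the word2vec
    # vectors of the known tokens; fall back to a zero vector if none are known.
    vecs = [word2vec_model[w] for w in tokens if w in word2vec_model]
    if not vecs:
        return np.zeros(100)   # assumed vector size of the offline model
    return np.mean(vecs, axis=0)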