Example #1
    def test_trainOn_model(self):
        """Test the model on toy data with four clusters."""
        stkm = StreamingKMeans()
        initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
        stkm.setInitialCenters(
            centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

        # Create a toy dataset by setting a tiny offset for each point.
        offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
        batches = []
        for offset in offsets:
            batches.append([[offset[0] + center[0], offset[1] + center[1]]
                            for center in initCenters])

        batches = [self.sc.parallelize(batch, 1) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        stkm.trainOn(input_stream)
        t = time()
        self.ssc.start()

        # Give enough time to train the model.
        self._ssc_wait(t, 6.0, 0.01)
        finalModel = stkm.latestModel()
        self.assertTrue(all(finalModel.centers == array(initCenters)))
        self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
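The assertions above follow from the weighted running-mean update that streaming k-means applies per batch. A minimal NumPy sketch of that arithmetic (an illustration under the default decayFactor=1.0, not Spark's implementation) shows why the four symmetric offsets leave the centers unchanged while every cluster weight grows from 1.0 to 5.0:

import numpy as np

# Sketch only: one point lands near each center per batch, so each cluster's
# running mean absorbs offsets that cancel out over the four batches.
init_centers = np.array([[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]])
centers = init_centers.copy()
weights = np.ones(4)

for offset in [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]:
    batch = init_centers + np.array(offset)
    # assign every point to its nearest current center
    dists = ((batch[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    for point, j in zip(batch, dists.argmin(axis=1)):
        centers[j] = (centers[j] * weights[j] + point) / (weights[j] + 1.0)
        weights[j] += 1.0

print(weights)   # [5. 5. 5. 5.]
print(centers)   # back at the initial centers (up to float rounding)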
Example #2
    def test_trainOn_predictOn(self):
        """Test that prediction happens on the updated model."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # Since the decay factor is set to zero, once the first batch
        # is passed the clusterCenters are updated to [-0.5, 0.7],
        # which causes 0.2 & 0.3 to be classified as 1, even though the
        # classification based on the initial model would have been 0,
        # proving that the model is updated.
        batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        batches = [self.sc.parallelize(batch) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        predict_results = []

        def collect(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                predict_results.append(rdd_collect)

        stkm.trainOn(input_stream)
        predict_stream = stkm.predictOn(input_stream)
        predict_stream.foreachRDD(collect)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
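The arithmetic behind the comment above is easy to verify by hand: with decayFactor=0.0 the previous centers get zero weight, so after the first batch each center is just the mean of the points assigned to it, and the second batch is then classified against those new centers. A plain-Python sketch, independent of Spark:

# First batch, classified against the initial centers [0.0] and [1.0]:
# -0.5 -> cluster 0, and 0.6, 0.8 -> cluster 1 (hence the first [0, 1, 1]).
# With decayFactor=0.0 the old centers are forgotten, so the new centers are
# simply the per-cluster means of that batch.
first_batch = [-0.5, 0.6, 0.8]
cluster0 = [x for x in first_batch if abs(x - 0.0) < abs(x - 1.0)]
cluster1 = [x for x in first_batch if abs(x - 0.0) >= abs(x - 1.0)]
new_centers = [sum(cluster0) / len(cluster0), sum(cluster1) / len(cluster1)]
print(new_centers)   # [-0.5, 0.7]

# Against the updated centers, 0.2 and 0.3 are now closer to 0.7 than to -0.5,
# which yields the second expected batch [1, 0, 1].
second_batch = [0.2, -0.1, 0.3]
print([0 if abs(x - new_centers[0]) < abs(x - new_centers[1]) else 1
       for x in second_batch])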
Example #5
def main():
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
                .map(lambda x: json.loads(x)) \
                .filter(lambda x: 'text' in x) \
                .map(lambda x: x['text'].encode('utf-8'))
    hasher = HashingTF(DIM)
    features = tweets.map(lambda x:
                          (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of clusters to find
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here.
    def print_group(i):
        results.filter(lambda x: x[1] == i).map(lambda x: '%i: %s' %
                                                (x[1], x[0])).pprint(3)

    for i in xrange(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
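featurize() is not shown in this excerpt. Since HashingTF.transform() accepts an iterable of terms, a plausible minimal stand-in (an assumption, the original may do real tokenization) is simply:

# Hypothetical stand-in for the featurize() helper used above: HashingTF
# hashes whatever terms it is given, so lower-casing and splitting on
# whitespace is enough for a rough sketch.
def featurize(text):
    return text.lower().split()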
Example #6
    def test_accuracy_for_single_center(self):
        """Test that parameters obtained are correct for a single center."""
        centers, batches = self.streamingKMeansDataGenerator(batches=5,
                                                             numPoints=5,
                                                             k=1,
                                                             d=5,
                                                             r=0.1,
                                                             seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        input_stream = self.ssc.queueStream(
            [self.sc.parallelize(batch, 1) for batch in batches])
        stkm.trainOn(input_stream)

        self.ssc.start()

        def condition():
            self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
            return True

        eventually(condition, catch_assertions=True)

        realCenters = array_sum(array(centers), axis=0)
        for i in range(5):
            modelCenters = stkm.latestModel().centers[0][i]
            self.assertAlmostEqual(centers[0][i], modelCenters, 1)
            self.assertAlmostEqual(realCenters[i], modelCenters, 1)
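streamingKMeansDataGenerator() is defined elsewhere in the test class. A standalone sketch of what such a generator looks like, assuming numpy and pyspark.mllib.linalg are available (k random centers, each point jittered by Gaussian noise of scale r):

from numpy import random
from pyspark.mllib.linalg import Vectors

def streaming_kmeans_data_generator(batches, numPoints, k, d, r, seed):
    # Sketch only: draw k random centers in d dimensions, then build `batches`
    # batches of `numPoints` points, each centered on one of the k centers
    # plus Gaussian noise of scale r.
    rng = random.RandomState(seed)
    centers = [rng.randn(d) for _ in range(k)]
    data = [[Vectors.dense(centers[j % k] + r * rng.randn(d))
             for j in range(numPoints)]
            for _ in range(batches)]
    return centers, data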
Example #7
    def perform_training(sc: SparkContext, params_dict: dict):
        batch_duration = params_dict.get('batch_duration', 1)
        training_duration = params_dict.get('training_duration', 20)
        ssc = StreamingContext(sc, batch_duration)
        topics = ['normal-ekg-stream']
        kafka_params = {'metadata.broker.list': 'localhost:9092'}
        kvs = KafkaUtils.createDirectStream(
            ssc,
            topics,
            kafkaParams=kafka_params,
            valueDecoder=lambda val: json.loads(val.decode('utf-8')))

        windowed_signal = kvs.map(lambda msg: Vectors.dense(
            [float(value) for value in msg[1]['signal_values']]))

        # windowed_signal.foreachRDD(Plotter.plot_signal_window)
        model = StreamingKMeans(k=20,
                                decayFactor=1.0).setRandomCenters(188, 1.0, 0)
        model.trainOn(windowed_signal)

        ssc.start()
        ssc.awaitTerminationOrTimeout(training_duration)
        ssc.stop(stopSparkContext=False, stopGraceFully=True)

        return model.latestModel()
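The direct stream above assumes each Kafka message value is a JSON object carrying a 'signal_values' list of 188 floats (matching setRandomCenters(188, ...)). A hedged sketch of a matching producer, assuming the kafka-python package:

import json
import random

from kafka import KafkaProducer

# Sketch only: push one synthetic window onto the topic the trainer reads from.
producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))
producer.send('normal-ekg-stream',
              {'signal_values': [random.random() for _ in range(188)]})
producer.flush()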
Example #8
class StreamingUpdate(object):
    """
    Streaming Update: DStream
    """
    def __init__(self, init_clusters, decay_factor, time_unit, sc, ssc):
        self.init_clusters = init_clusters
        self.decay_factor = decay_factor
        self.time_unit = time_unit
        self.sc = sc
        self.ssc = ssc

    # implement
    def streaming(self, mnk, clusters, init_clusters):
        self.mnk = mnk
        self.clusters = clusters
        self.init_clusters = init_clusters
        self.streaming_kmeans = StreamingKMeans(self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(self.mnk.cluster_centers_, np.ones([self.init_clusters]))

    # update shape for centers in StreamingContext
    """
    Từ điển được cập nhật khi có tin tức mới đến thì em cập nhật lại kích thước của các centroid
    VD: Từ điển ban đầu có kích thước 10 từ
    Em biểu diễn một câu có 5 từ bằng sparse vector kích thước 5x10
    Từ điển sau khi cập nhật có 15 từ thì câu trên phải biểu diễn lại bằng sparse vector có kích thước 5x15
    Có cách biểu diễn khác mà không phải cập nhật lại biểu diễn của câu không ạ
    """
    def update_shape(self, docs, dictionary):
        self.streaming_kmeans.setRandomCenters(matrix_tfidf(docs, dictionary).shape[1], 1.0, 0)

    # save matrix update
    def save_matrix_update(self, docs, dictionary):
        np.savetxt('/home/ducvu/input_streaming.txt', matrix_tfidf(docs, dictionary))

    # load a DStream for training
    def load_dstream(self):
        rdd = self.sc.textFile("/home/ducvu/input_streaming.txt")\
            .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
        # trainOn() expects a DStream, so wrap the single RDD in a one-batch queue stream
        self.dstream = self.ssc.queueStream([rdd])
        return self.dstream
        
    # make predictions with the latest model
    def make_predict(self, docs, dictionary):
        self.streaming_kmeans.trainOn(self.load_dstream())
        self.pred_stream = []
        matrix = matrix_tfidf(docs, dictionary)
        for x in matrix:
            self.pred_stream.append(self.streaming_kmeans.latestModel().predict(x))
        self.pred_stream = np.array(self.pred_stream)
        
        df = pd.DataFrame(matrix).groupby(self.pred_stream).mean()
        for i, r in df.iterrows():
            print('\nCluster {0}:'.format(i))
            print(','.join([get_tfidf(dictionary).get_feature_names()[t] for t in np.argsort(r)[-15:]]))
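One common answer to the question raised in the docstring above is feature hashing: HashingTF maps terms into a fixed number of buckets, so documents never have to be re-encoded when the dictionary grows. A small sketch, independent of the class above:

from pyspark.mllib.feature import HashingTF

# Hash terms into a fixed 4096-dimensional space: new vocabulary does not
# change the vector size, so neither the documents nor the centroids need to
# be resized as the dictionary grows.
hasher = HashingTF(1 << 12)
v1 = hasher.transform("old words only".split())
v2 = hasher.transform("old words plus brand new ones".split())
print(v1.size == v2.size)   # True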
Example #9
    def test_accuracy_for_single_center(self):
        """Test that parameters obtained are correct for a single center."""
        centers, batches = self.streamingKMeansDataGenerator(
            batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
        stkm = StreamingKMeans(1)
        stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
        input_stream = self.ssc.queueStream(
            [self.sc.parallelize(batch, 1) for batch in batches])
        stkm.trainOn(input_stream)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 10.0, 0.01)
        self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
        realCenters = array_sum(array(centers), axis=0)
        for i in range(5):
            modelCenters = stkm.latestModel().centers[0][i]
            self.assertAlmostEqual(centers[0][i], modelCenters, 1)
            self.assertAlmostEqual(realCenters[i], modelCenters, 1)
Example #10
         .filter(lambda tpl: tpl[0] != 0)\
         .filter(lambda tpl: tpl[2] != '')\
         .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\
         .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2])))
    #dstream_tweets.pprint()

    trainingData = dstream_tweets.map(
        lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist())
    #trainingData.pprint()
    testdata = dstream_tweets.map(lambda tpl: (
        ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist()))
    #testdata.pprint()
    #
    model = StreamingKMeans(k=clusterNum,
                            decayFactor=0.6).setRandomCenters(102, 1.0, 3)
    model.trainOn(trainingData)
    clust = model.predictOnValues(testdata)
    #clust.pprint()
    #words = lines.flatMap(lambda line: line.split(" "))
    topic = clust.map(lambda x: (x[1], x[0][1]))
    #topic.pprint()
    topicAgg = topic.reduceByKey(lambda x, y: x + y)
    #wordCollect.pprint()
    topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint()

    clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))

    # Run!
    ssc.start()
    ssc.awaitTermination()
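tokenize(), doc2vec() and freqcount() are helpers defined elsewhere in this script. As an illustration only, freqcount() presumably returns the most frequent tokens of a cluster's aggregated word list; a hypothetical stand-in:

from collections import Counter

# Hypothetical stand-in for the freqcount() helper used above.
def freqcount(words, n=10):
    return Counter(words).most_common(n)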
Example #11
import sched
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)

    ssc.checkpoint("chkpfile")

    def parserData(line):
        cells = line.split(",")
        return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("/files").map(parserData)

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)
    print("Initial Centres" + str(model.latestModel().centers))
    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(sc, model):
        print(str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centres, (s, model))

    s.enter(10, 1, print_cluster_centres, (s, model))
    s.run()

    ssc.awaitTermination()
# To run this example:
# spark-submit "C:\SaiStudy - LEarn It All - Version9\Saistudy - split-csv.py"
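parserData() expects every line arriving in /files to hold two comma-separated numbers. Purely as an illustration (file name and values are made up), a matching input file could be produced like this:

# Illustration only: drop a small CSV with two comma-separated floats per line
# into the directory monitored by ssc.textFileStream("/files"); note that
# textFileStream only picks up files that appear after the stream has started.
with open("/files/sample_points.csv", "w") as f:
    f.write("0.7,1.2\n-0.3,0.9\n5.1,4.8\n")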
Example #12
    ssc = StreamingContext(sc, 5)

    # Kafka Stream
    ks = KafkaUtils.createDirectStream(
        ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

    trainingData = sc.textFile("data/datatraining.txt")\
        .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

    # The batch KMeans centers are supplied as initial centers because
    # StreamingKMeans' random centers were not giving good predictions.
    init_centers = KMeans.train(trainingData, 2).centers

    model = StreamingKMeans(k=2, decayFactor=0.1)\
        .setInitialCenters(init_centers, [1.0, 1.0])

    model.trainOn(ssc.queueStream([trainingData]))

    def parse(lp):
        arr = lp.split(',')[2:-1]
        label = lp.split(',')[0]
        vec = Vectors.dense([float(x) for x in arr])
        return LabeledPoint(label, vec)

    test_stream = ks.map(lambda x: x[1]).map(parse)

    result = model.predictOnValues(
        test_stream.map(lambda lp: (lp.label, lp.features)))

    # Prints the predictions and the cluster centers
    def current_centers(time, rdd):
        print("\n--------------------- %s --------------------------" %
Example #13
    ssc.checkpoint("file:///tmp/spark")

    def parseTrainingData(line):
      cells = line.split(",")
      return Vectors.dense([float(cells[0]), float(cells[1])])

    trainingStream = ssc.textFileStream("file:///Users/jananiravi/spark/spark-2.1.0-bin-without-hadoop/tweets/training")\
      .map(parseTrainingData)

    trainingStream.pprint()

    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    print("Initial centers: " + str(model.latestModel().centers))

    model.trainOn(trainingStream)

    ssc.start()

    s = sched.scheduler(time.time, time.sleep)
    def print_cluster_centers(sc, model): 
        print("Cluster centers: " + str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (sc, model))

    s.enter(10, 1, print_cluster_centers, (s, model))
    s.run()

    ssc.awaitTermination()
Example #14
    # Initializes streaming k-means to run over the data added to the
    # streaming directory.
    # k=2: number of clusters into which the dataset will be split
    # decayFactor=1.0: all data since the beginning is relevant.
    #             0.0: only the most recent data is used.
    # k-means needs random initial cluster centers to start the
    # process:
    # 2: number of centers to set
    # 1.0 and 0: weight and seed
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(2, 1.0, 0)

    # Print the centers.
    print('Initial centers: ' + str(model.latestModel().centers))

    # Train the model
    model.trainOn(training_stream)

    # Start the stream
    ssc.start()

    # Schedule periodic printing of the center values
    s = sched.scheduler(time.time, time.sleep)

    # Function that prints the centers recursively, every 10s.
    def print_cluster_centers(sc, model):
        print('Cluster centers: ' + str(model.latestModel().centers))
        s.enter(10, 1, print_cluster_centers, (sc, model))

    # The function that prints the clusters (print_cluster_centers) is
    # scheduled to run every 10s with priority 1. It takes two
    # arguments: the scheduler s and the model held in the variable
Example #15
q = multiprocessing.Queue()
f = multiprocessing.Queue()
job_for_another_core2 = multiprocessing.Process(target=data_plotting,args=(q,))
job_for_another_core2.start()

sc = SparkContext('local[4]', 'Social Panic Analysis')
# Create a local StreamingContext with four worker threads and a batch interval of 10 seconds

ssc = StreamingContext(sc, 10)
dstream = ssc.socketTextStream("localhost", 9998)
trainingData = dstream.map(Vectors.parse)
trainingData.pprint()
testData = trainingData.map(lambda x: (x, x))
testData.pprint()
model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0)
model.trainOn(trainingData)
print(model.latestModel().clusterCenters)
clust = model.predictOnValues(testData)
clust.pprint()
#print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features))))
clust.foreachRDD(lambda time, rdd: q.put(rdd.collect()))
ssc.start()
ssc.awaitTermination()
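data_plotting() is not shown in this excerpt; it is the multiprocessing target that consumes the (point, cluster) batches pushed onto q by foreachRDD. A minimal stand-in (an assumption, the original apparently plots the batches instead of printing them):

# Hypothetical stand-in for the data_plotting() target used above: drain the
# queue of collected (point, clusterId) pairs as the stream produces them.
def data_plotting(queue):
    while True:
        batch = queue.get()          # blocks until foreachRDD puts a batch
        for point, cluster in batch:
            print(cluster, point)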
Example #16
    initialCenters, initialWeights)

#stkm = StreamingKMeans(k=int(numberClusters),decayFactor=1.0).setRandomCenters(2,1.0,100)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['StreamingKMeansTFG'], {
        "metadata.broker.list": "localhost:9092",
        "auto_offset_reset": 'earliest'
    })

parsed = directKafkaStream.map(lambda v: loads(v[1]))

parsed = parsed.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split()]))

stkm.trainOn(parsed)


def sendPartition(rdd):
    connection = MongoClient(mongoIP)
    test_db = connection.get_database(mongoDataBase)
    collection = test_db.get_collection(mongoCollection)
    model = stkm.latestModel()
    centers = model.centers
    weights = model.clusterWeights
    myquery = {"name": mongoCollection}
    newvalues = {
        "$set": {
            "clusterCenters": centers.tolist(),
            "clusterWeights": weights
        }