Example #1
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        self._eventually(condition, catch_assertions=True)
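The expected output follows from plain nearest-center assignment. As a cross-check, a minimal sketch (assuming only pyspark is installed; no SparkContext is needed, since predict on a single vector runs locally) that reproduces the same four labels:

from pyspark.mllib.clustering import StreamingKMeansModel

# The same toy model as above: four unit-weight centers, one per quadrant.
model = StreamingKMeansModel(
    clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
    clusterWeights=[1.0, 1.0, 1.0, 1.0])

# predict() returns the index of the nearest center, so each point maps
# to the cluster of its own quadrant: 0, 1, 2, 3.
for point in [[1.5, 1.5], [-1.5, 1.5], [-1.5, -1.5], [1.5, -1.5]]:
    print(model.predict(point))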
Example #2
    def test_trainOn_predictOn(self):
        """Test that prediction happens on the updated model."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # Since the decay factor is zero, the cluster centers are updated
        # to [-0.5] and [0.7] once the first batch has been processed.
        # This causes 0.2 and 0.3 to be classified as cluster 1, whereas
        # classification based on the initial model would have been 0,
        # proving that the model has been updated.
        batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        batches = [self.sc.parallelize(batch) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        predict_results = []

        def collect(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                predict_results.append(rdd_collect)

        stkm.trainOn(input_stream)
        predict_stream = stkm.predictOn(input_stream)
        predict_stream.foreachRDD(collect)

        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
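The centers claimed in the comment can be verified by hand: with decayFactor=0.0 the old cluster weights are discounted to zero, so each updated center is simply the mean of the new batch points assigned to it. A plain-Python sketch of that arithmetic (no Spark required):

# First batch: [-0.5], [0.6], [0.8]; initial centers [0.0] and [1.0].
# -0.5 is nearest to center 0; 0.6 and 0.8 are nearest to center 1.
# With a zero decay factor each new center is the mean of its batch points:
center_0 = -0.5 / 1.0          # -0.5
center_1 = (0.6 + 0.8) / 2.0   # 0.7
# Under the updated centers, 0.2 and 0.3 are now closer to center 1:
print(abs(0.2 - center_1) < abs(0.2 - center_0))  # True
print(abs(0.3 - center_1) < abs(0.3 - center_0))  # True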
Example #3
    def test_trainOn_predictOn(self):
        """Test that prediction happens on the updated model."""
        stkm = StreamingKMeans(decayFactor=0.0, k=2)
        stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

        # Since the decay factor is zero, the cluster centers are updated
        # to [-0.5] and [0.7] once the first batch has been processed.
        # This causes 0.2 and 0.3 to be classified as cluster 1, whereas
        # classification based on the initial model would have been 0,
        # proving that the model has been updated.
        batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
        batches = [self.sc.parallelize(batch) for batch in batches]
        input_stream = self.ssc.queueStream(batches)
        predict_results = []

        def collect(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                predict_results.append(rdd_collect)

        stkm.trainOn(input_stream)
        predict_stream = stkm.predictOn(input_stream)
        predict_stream.foreachRDD(collect)

        self.ssc.start()

        def condition():
            self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
            return True

        eventually(condition, catch_assertions=True)
Example #4
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [
            self.sc.parallelize(batch, 1) for batch in predict_data
        ]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        eventually(condition, catch_assertions=True)
Example #5
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(
            clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
            clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(result, [[0], [1], [2], [3]])
Example #6
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans


def parse(line):
    # Assumed record format: "label,x1,x2,x3,x4" -- a label followed by
    # the feature values. Only the return statement below appeared in the
    # original snippet; the rest of this function is a reconstruction.
    cells = line.strip().split(',')
    label = float(cells[0])
    vec = Vectors.dense([float(x) for x in cells[1:]])
    return LabeledPoint(label, vec)


if __name__ == "__main__":
    sc = SparkContext(appName="StreamingKmeansKafka")
    ssc = StreamingContext(sc, 20)

    zkQuorum, trainTopic, testTopic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {trainTopic: 1})
    lines = kvs.map(lambda x: x[1])
    trainingData = lines.map(lambda line: Vectors.dense(
        [float(x) for x in line.strip().split(',')]))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {testTopic: 1})
    lines = kvs.map(lambda x: x[1])
    testingData = lines.map(parse)

    model = StreamingKMeans(k=3, decayFactor=1.0).setRandomCenters(4, 1.0, 0)

    model.trainOn(trainingData)

    result = model.predictOn(trainingData)
    result.pprint()

    result = model.predictOnValues(
        testingData.map(lambda lp: (lp.label, lp.features)))
    result.pprint()

    ssc.start()
    ssc.awaitTermination()
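To exercise this job, records have to be flowing on both topics. A minimal producer sketch using the kafka-python package (the package choice, broker address, and topic names are assumptions; the topic names must match whatever is passed on the command line, and the record formats match the two parsers above):

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers="localhost:9092")

# Training records: four comma-separated feature values.
producer.send("train", b"0.1,0.2,0.3,0.4")
# Test records: a label followed by the four feature values.
producer.send("test", b"1.0,0.1,0.2,0.3,0.4")
producer.flush()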
Example #7
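This snippet starts after the stream has been set up. A minimal sketch of the assumed preamble (the socket source, host/port, feature dimension, and StreamingKMeans parameters are all assumptions; sc, ssc, parsed, and stkm are the names the snippet relies on):

from pymongo import MongoClient
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeans

sc = SparkContext(appName="StreamingKMeansMongo")
ssc = StreamingContext(sc, 10)

# Each input line is assumed to be comma-separated feature values.
lines = ssc.socketTextStream("localhost", 9999)
parsed = lines.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(',')]))

stkm = StreamingKMeans(k=2, decayFactor=0.5).setRandomCenters(2, 1.0, 0)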
stkm.trainOn(parsed)


def sendPartition(rdd):
    # Despite the name, this runs once per batch on the driver (it is
    # passed to foreachRDD below), so opening a plain MongoDB connection
    # and reading the latest model here is safe.
    connection = MongoClient("mongodb://localhost:27017/")
    test_db = connection.get_database('sparkDB')
    collection = test_db.get_collection('sset1')
    model = stkm.latestModel()
    centers = model.centers
    weights = model.clusterWeights
    # Store the current centers and weights on the 'sset1' document.
    myquery = {"name": "sset1"}
    newvalues = {
        "$set": {
            "clusterCenters": centers.tolist(),
            "clusterWeights": weights
        }
    }
    collection.update_one(myquery, newvalues)
    connection.close()


parsed.count().foreachRDD(sendPartition)

stkm.predictOn(parsed).pprint()

parsed.count().pprint()

ssc.start()
ssc.awaitTermination()
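Since the centers and weights are persisted as plain lists, the model can be rebuilt outside the streaming job. A sketch of reading it back for offline prediction (database, collection, and document name carry over from the snippet; the probe point is an assumption and its dimension must match the training data):

from pymongo import MongoClient
from pyspark.mllib.clustering import StreamingKMeansModel

doc = MongoClient("mongodb://localhost:27017/")["sparkDB"]["sset1"].find_one(
    {"name": "sset1"})
model = StreamingKMeansModel(doc["clusterCenters"], doc["clusterWeights"])
print(model.predict([0.0, 0.0]))  # index of the nearest persisted center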