import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import StreamingKMeansModel


def test_streaming_kmeans():
    # `get_data_from_db` is assumed to be a helper defined elsewhere that
    # loads the training records (lists of three floats) from the database.
    records = get_data_from_db()

    conf = SparkConf().setAppName("testingClusters").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    # Start from three centers with equal weights, then fold in one batch of
    # records (decayFactor=1.0 means past data is never forgotten).
    initCenters = [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.5, 0.5, 0.5]]
    initWeights = [1.0, 1.0, 1.0]
    stkm = StreamingKMeansModel(initCenters, initWeights)
    recordRDD = sc.parallelize(records)
    stkm = stkm.update(data=recordRDD, decayFactor=1.0, timeUnit=u"batches")

    for center in stkm.centers:
        print(center)

    # `scaler` is assumed to be a feature scaler fitted offline (see the
    # sketch after this example).
    sample_data = np.array([46.5, 23.0, 1034.0]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        # Convert Euclidean distance into a (0, 1] similarity score.
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)

    # Fold the new point into the model (timeUnit="points" weights the decay
    # by number of points rather than by batches).
    stkm = stkm.update(sc.parallelize(record_scaled), 1.0, u"points")

    sample_data = np.array([46.2, 23.5, 1034.32]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)
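In the test above, `scaler` is assumed to be a feature scaler fitted offline on historical data. A minimal sketch of how one might be produced, using scikit-learn's MinMaxScaler with placeholder training data (both the scaler choice and the data are assumptions, not from the original):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Hypothetical training data with the same three features as the test records.
training_records = np.array([[40.0, 20.0, 1000.0],
                             [50.0, 25.0, 1100.0],
                             [45.0, 22.0, 1050.0]])
scaler = MinMaxScaler()
scaler.fit(training_records)  # learns per-feature min/max for transform()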
Example #2
    # Assumed earlier in the snippet: `truePredictions = []`, `points = []`,
    # `import sys`, `import logging`, and
    # `from sklearn.metrics import recall_score, precision_score`.
    # The snippet begins mid-loop: an earlier loop (not shown) reads the true
    # labels, with `c` holding one split line.
            truePredictions.append(int(c[0]))

    with open("datasets/" + sys.argv[1] + ".txt", "r") as fichero:
        for linea in fichero:
            # Convert each coordinate to float so predict() can consume it.
            points.append([float(x) for x in linea.strip("\n").split()])

    # `cursor` is assumed to be a database query cursor over stored models;
    # the last document's centers and weights are kept.
    for document in cursor:
        centers = document["clusterCenters"]
        weights = document["clusterWeights"]

    stkm = StreamingKMeansModel(centers, weights)

    predictions = []
    for point in points:
        # predict() accepts a single vector (or an RDD of vectors).
        predictions.append(stkm.predict(point))

    recall = recall_score(truePredictions, predictions, average='weighted')
    precision = precision_score(truePredictions, predictions, average='weighted')
    f1Score = 2 * (precision * recall) / (precision + recall)

    logging.info("Recall = " + str(recall))
    logging.info("Precision = " + str(precision))
    logging.info("F1-Score = " + str(f1Score))

    # Write the results; the with-block ensures the file is closed.
    with open(sys.argv[1] + "Resultados.txt", "w") as f:
        f.write("Recall = " + str(recall))
        f.write("\n")
        f.write("Precision = " + str(precision))
Example #3
# Imports for this snippet; `offlineModel`, `initWeights`, and `ssc` are
# assumed to be defined earlier (an offline KMeansModel, initial cluster
# weights, and a StreamingContext).
from json import loads
from kafka import KafkaConsumer
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import StreamingKMeansModel
import matplotlib.pyplot as plt

# Seed the streaming model with the offline model's centers.
stkm = StreamingKMeansModel(offlineModel.clusterCenters, initWeights)

# Alternative: consume the topic via Spark Streaming instead of kafka-python
# (see the sketch after this example):
#kafkaStream = KafkaUtils.createDirectStream(ssc, ['test'], {'metadata.broker.list': 'localhost:9092'})
#lines = kafkaStream.map(lambda line: array([float(x) for x in line.split('\t')]))

# Consume from the 'test' topic; iteration over the consumer stops after
# 10 seconds without new messages.
consumer = KafkaConsumer('test',
                         bootstrap_servers=['localhost:9092'],
                         value_deserializer=lambda x: loads(x.decode('utf-8')),
                         consumer_timeout_ms=10000)

colors = [
    'r', 'k', 'b', 'grey', 'darkorange', 'm', 'y', 'c', 'gold', 'slateblue',
    'beige', 'coral', 'g', 'peru', 'pink'
]

print("Collecting data")
# Plot each incoming point, colored by its predicted cluster index.
for message in consumer:
    testData = Vectors.dense([float(x) for x in message.value.strip().split()])
    plt.plot(testData[0],
             testData[1],
             'o',
             color=colors[stkm.predict(testData)])
print("Vamos a dibujar")
plt.show()

# Only needed if the commented-out Spark Streaming path above is used.
ssc.start()
ssc.awaitTermination()
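The commented-out KafkaUtils lines suggest a pure Spark Streaming variant, where the model keeps updating as batches arrive instead of using a one-shot KafkaConsumer. A minimal sketch, assuming a running broker on localhost:9092, the spark-streaming-kafka package, and the same `offlineModel`, `initWeights`, and `ssc` as above (k=3 is an assumption here):

from numpy import array
from pyspark.streaming.kafka import KafkaUtils
from pyspark.mllib.clustering import StreamingKMeans

# The direct stream yields (key, value) pairs; parse each value into a point.
kafkaStream = KafkaUtils.createDirectStream(
    ssc, ['test'], {'metadata.broker.list': 'localhost:9092'})
points = kafkaStream.map(lambda kv: array([float(x) for x in kv[1].split('\t')]))

# Seed a StreamingKMeans trainer with the offline centers, then train and
# predict on the live stream.
model = StreamingKMeans(k=3, decayFactor=1.0).setInitialCenters(
    offlineModel.clusterCenters, initWeights)
model.trainOn(points)
model.predictOn(points).pprint()  # print the cluster index of each point

ssc.start()
ssc.awaitTermination()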