Example #1
    def perform_training(sc: SparkContext, params_dict: dict):
        normal_ekg_data_path = params_dict.get('normal_ekg_data_path')
        min_num_of_clusters = int(params_dict.get('min_num_of_clusters', 5))
        max_num_of_clusters = int(params_dict.get('max_num_of_clusters', 20))
        # the ratio is a fraction below 1.0, so it must be parsed as float, not int
        boundary_ratio = float(params_dict.get('boundary_ratio', 0.8))

        ekg_rdd_data = sc.textFile(normal_ekg_data_path).map(
            lambda line: np.array([float(val) for val in line.split(',')]))

        # ekg_rdd_data.foreach(Plotter.plot_signal_window)
        k_range = range(min_num_of_clusters, max_num_of_clusters, 1)
        prev_cost = float(np.inf)
        # fallback model: a single randomly sampled window as the only centre
        final_km = KMeansModel(ekg_rdd_data.takeSample(False, 1))
        cost_ratios = []
        found_best = False
        for k in k_range:
            km = KMeans.train(ekg_rdd_data, k)
            # cost equals the sum of squared distances from the samples to their nearest cluster centre
            cost = km.computeCost(ekg_rdd_data)
            ratio = cost / prev_cost
            prev_cost = cost
            cost_ratios.append(ratio)
            if ratio > boundary_ratio and not found_best:
                final_km = km
                found_best = True

        Plotter.plot_elbow(cost_ratios, k_range)
        return final_km
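A hypothetical way to invoke this routine, purely as a usage sketch: the HDFS path, the app name, and the assumption that perform_training is reachable as a module-level function or static method are mine, not part of the snippet.

# Usage sketch only; path and parameter values are assumptions.
from pyspark import SparkContext

sc = SparkContext(appName="EkgKMeansTraining")
params = {
    'normal_ekg_data_path': 'hdfs:///data/normal_ekg_windows.csv',  # assumed path
    'min_num_of_clusters': 5,
    'max_num_of_clusters': 20,
    'boundary_ratio': 0.8,
}
final_model = perform_training(sc, params)
print(final_model.clusterCenters)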
Example #2
def kmeansInitialClusters(dataset):
    # seed KMeans with the predefined CENTER_VECTORS as the initial model
    model = KMeansModel(CENTER_VECTORS)
    # round-trip through stringify/parse to get plain mllib Vectors from the 'features' column
    vectorsRdd = dataset.rdd.map(lambda data: Vectors.parse(Vectors.stringify(data['features'])))
    trainedModel = KMeans.train(vectorsRdd, 4, maxIterations=1000, initialModel=model)
    result = []
    for d in dataset.collect():
        entry = {
            "features": d["features"],
            "prediction": trainedModel.predict(Vectors.parse(Vectors.stringify(d["features"]))),
            "label": d["label"],
        }
        result.append(entry)

    plotDiversitySizeClustering(result, CENTERS, "Size", "Diversity", "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(result, CENTERS)
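This snippet relies on module-level globals that it does not show; CENTERS (consumed by the plotting helpers) is also expected but its shape is unknown. A minimal sketch of what CENTER_VECTORS might look like, assuming four 2-D (size, diversity) centers as the plot title suggests:

# Hypothetical global assumed by kmeansInitialClusters; the values and the
# 2-D (size, diversity) layout are guesses based on the plot title.
from pyspark.mllib.linalg import Vectors

CENTER_VECTORS = [Vectors.dense([0.2, 0.1]),
                  Vectors.dense([0.8, 0.1]),
                  Vectors.dense([0.2, 0.9]),
                  Vectors.dense([0.8, 0.9])]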
Example #3
def assign_pooling(data):

    image_name, feature_matrix = data[0]
    clusterCenters = data[1]

    feature_matrix = np.array(feature_matrix)

    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    # max-pool: keep the largest distance seen to each cluster centre
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        bow[k] = max(bow[k], dist)

    # assign the image to the cluster with the smallest pooled distance (1-indexed)
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
Example #4
def assign_pooling(row, clusterCenters, pooling):
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    clusterCenters = clusterCenters.value
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    #print(image_name + " in group: " + str(group))
    return [(image_name, group)]
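This variant expects the cluster centres as a Spark broadcast variable. A hypothetical driver sketch follows; sc, kmeans_model, and features_df (a DataFrame with 'fileName' and 'features' columns) are assumptions, not part of the snippet.

# Hypothetical driver code: broadcast the trained centres and pool every image row.
centers_bc = sc.broadcast(kmeans_model.clusterCenters)   # kmeans_model: assumed trained KMeansModel
groups = features_df.rdd.flatMap(                        # features_df: assumed 'fileName'/'features' columns
    lambda row: assign_pooling(row, centers_bc, pooling="max"))
print(groups.take(5))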
Example #5
def assign_pooling(data):

    row = data[0]
    clusterCenters = data[1]
    pooling = data[2]

    image_name = row['fileName']
    feature_matrix = np.array(row['features'])

    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))

    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
Example #6
def train_rotations(sc, split_vecs, M, Cs):
    """
    Compute a rotation for each split of the data using the given coarse quantizers.
    """

    Rs = []
    mus = []
    counts = []
    for split in range(2):

        print('Starting rotation fitting for split %d' % split)

        # Get the data for this split
        data = split_vecs.map(lambda x: x[split])

        # Get kmeans model
        model = KMeansModel(Cs[split])

        # M // 2 subquantizers per split (integer division, as in the original Python 2 code)
        R, mu, count = compute_local_rotations(sc, data, model, M // 2)
        Rs.append(R)
        mus.append(mu)
        counts.append(count)

    return Rs, mus, counts
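A rough sketch of how the inputs to train_rotations might be shaped, assuming each vector is split into two halves and one coarse codebook is trained per half; the dimensionality, the number of coarse clusters, and the data are all made up for illustration.

# Hypothetical input preparation; all sizes and data here are assumptions.
# sc: an existing SparkContext (assumed).
import numpy as np
from pyspark.mllib.clustering import KMeans

D = 128  # assumed full vector dimensionality
vecs = sc.parallelize(np.random.rand(1000, D).tolist()).map(lambda v: np.array(v))
split_vecs = vecs.map(lambda v: (v[:D // 2], v[D // 2:]))

Cs = []
for s in range(2):
    half = split_vecs.map(lambda x, s=s: x[s])
    Cs.append(np.array(KMeans.train(half, 16).clusterCenters))

# Rs, mus, counts = train_rotations(sc, split_vecs, M=8, Cs=Cs)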
Example #7
    currTime = strftime("%Y-%m-%d-%H-%M-%S")
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
    dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
    predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")

    average_per_year = average_year(lines) # 2014 and 2015
    average_per_month = average_month(average_per_year)
    data = parseDataset(dataset)
    k = int(sys.argv[1])
    initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

    # KMeans
    start = time()
    kmeans_model = KMeans.train(data, k, maxIterations = 100, initialModel = KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]

    # Predicting
    points = parseDataset(predict_data)
    count_lines = float(len(points.collect()))
    probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
    print("Prob: ", probabilities)
        "hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv"
    )

    average_per_year = average_year(lines)  # 2014 and 2015
    average_per_month = average_month(average_per_year)
    data = parseDataset(dataset)
    k = int(sys.argv[1])
    initial_centroids = generate_initial_centroids(average_per_month.collect(),
                                                   k)

    # KMeans
    start = time()
    kmeans_model = KMeans.train(data,
                                k,
                                maxIterations=100,
                                initialModel=KMeansModel(initial_centroids))
    end = time()
    elapsed_time = end - start
    kmeans_output = [
        "====================== KMeans ====================\n",
        "Final centers: " + str(kmeans_model.clusterCenters),
        "Total Cost: " + str(kmeans_model.computeCost(data)),
        "Value of K: " + str(k),
        "Elapsed time: %0.10f seconds." % elapsed_time
    ]

    # Predicting
    points = parseDataset(predict_data)
    count_lines = float(len(points.collect()))
    probabilities = generate_probabilities(points, k, kmeans_model,
                                           count_lines)
Example #9
from pyspark.mllib.clustering import KMeansModel
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from itertools import permutations
import csv
import operator

centers = []
with open('/home/ronald/kmeansModel', 'r') as f:
    line = f.readline()
    while line:
        # strip the surrounding brackets and trailing newline before splitting
        points = line[1:len(line) - 2].split(",")
        centers.append([float(i) for i in points])
        line = f.readline()

model = KMeansModel(centers)

modelCenters = model.clusterCenters
realCenters = []
with open('/home/ronald/centers.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        center = []
        for i in row:
            center.append(float(row[i]))  # CSV values are strings; convert explicitly
        realCenters.append(Vectors.dense(center))

perm = list(permutations(range(8)))

totalDist = []
for i in perm:
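The listing cuts off at the permutation loop. One way the matching might continue, assuming the goal is to find the permutation that minimises the total distance between the trained centres and the reference centres; this completion is an assumption, not the original code.

# Hypothetical completion: score each permutation by the summed distance between
# matched centres, then keep the cheapest matching.
import numpy as np

for i in perm:
    dist = sum(np.linalg.norm(np.array(modelCenters[j]) - np.array(realCenters[i[j]]))
               for j in range(8))
    totalDist.append(dist)

bestIndex, bestDist = min(enumerate(totalDist), key=operator.itemgetter(1))
print("Best matching permutation:", perm[bestIndex], "with total distance", bestDist)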
Example #10
# Take the latitude and longitude columns for clustering.
rddLoc = rdd.map(lambda line: (float(line[20]), float(line[21])))
arr = []

# To find the optimum K value for the clusters, we use the KMeansModel library.
for k in range(10, 160, 10):

    # Take a sample of k rows to build the initial centroids
    samplelist = sc.parallelize(rdd.take(k))
    sample_points = samplelist.map(lambda line: (line[20], line[21])).collect()
    sample_centroidlist = np.array(sample_points).astype('float')
    model = KMeans.train(rddLoc,
                         k,
                         maxIterations=30,
                         initialModel=KMeansModel(sample_centroidlist))

    # The trained model gives the final centroids, which are used to evaluate
    # the clustering by computing the Within Set Sum of Squared Errors (WSSSE).
    def error(point):
        center = model.centers[model.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = rddLoc.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    arr.append(WSSSE)
print("Within Set Sum of Squared Error = ")
print(arr)
# Plot the graph of the returned array against the corresponding K values. The optimum value of K is where there is an elbow in the curve.
# The K value we got is 60.
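As the comments describe, the elbow is read off a plot of WSSSE against K. A minimal plotting sketch, assuming matplotlib is available on the driver:

# Minimal elbow-plot sketch; assumes matplotlib is installed on the driver.
import matplotlib.pyplot as plt

k_values = list(range(10, 160, 10))
plt.plot(k_values, arr, marker='o')
plt.xlabel("K (number of clusters)")
plt.ylabel("Within Set Sum of Squared Errors")
plt.title("Elbow curve for KMeans on (latitude, longitude)")
plt.savefig("kmeans_elbow.png")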