Example #1
    def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10):
        cluster_data = self.sc.parallelize(Matrix)
        trains = KMeans().train(cluster_data, Kcluster, MaxIterations, runs)
        results = trains.predict(cluster_data).collect()
        return results
spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()
spark_df = sc.parallelize(spark.read.json("Data/yelp_academic_dataset_business.json").select("stars","review_count","is_open").take(1700))
scaler = MinMaxScaler(inputCol="_1",\
         outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(lambda x:(x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(lambda x:Vectors.dense(x))
num_clusters = 3

# Input into the algorithm
km = KMeans()
kme = km.train(vector_df, k = num_clusters, maxIterations = 10, seed=2018)
centers = kme.clusterCenters

err = vector_df.map(lambda x:(x[0], findCenter(x[0], centers))).collect()

# Silhouette value comparison
ag = 0
agi = 0
for er in err:
    # Sums of squared distances from er to every point, bucketed by the other point's cluster
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    # a(er): mean squared distance from er to the points in its own cluster
    a = avg[er[1]] / avgi[er[1]]
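
The code above calls a findCenter helper that is not shown in the example. A minimal sketch, assuming it simply returns the index of the cluster center closest to the point by squared Euclidean distance:

def findCenter(point, centers):
    # Hypothetical helper (not part of the original snippet): index of the
    # nearest cluster center by squared Euclidean distance.
    best, best_dist = 0, float("inf")
    for idx, center in enumerate(centers):
        dist = Vectors.squared_distance(point, Vectors.dense(center))
        if dist < best_dist:
            best, best_dist = idx, dist
    return best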
Example #3
#
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()
done in 5.232165s
n_samples: 7540, n_features: 6638

#
# KMeans Clustering
# Initial clusters = 7
# Maximum iteration = 100
#

In [47]:

km = KMeans(n_clusters=7, init='k-means++', max_iter=100, n_init=1,
            verbose=1)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=7, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=1)
Initialization complete
Iteration  0, inertia 13635.141
Iteration  1, inertia 6943.485
Iteration  2, inertia 6924.093
Iteration  3, inertia 6915.004
Iteration  4, inertia 6909.212
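
A quick inspection step one could add after the fit, assuming X is the same sparse document-term matrix used above (not part of the original transcript):

import numpy as np

print("final inertia: %.3f" % km.inertia_)        # within-cluster sum of squares
print("cluster sizes:", np.bincount(km.labels_))  # number of samples per cluster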
Example #4
    def fit(self, data, n_components, n_iter, ct):
        """
        Estimate model parameters with the expectation-maximization
        algorithm.

        Parameters
        ----------
        data : RDD of data points
        n_components : Number of mixture components
        n_iter : Number of EM iterations. Defaults to 100
        ct : Convergence threshold used to stop the EM loop. Defaults to 1e-3

        Attributes
        ----------

        covariance_type : Type of covariance matrix.
            Supports only diagonal covariance matrix.

        min_covar : Floor on the diagonal of the covariance matrix to prevent
            overfitting.  Defaults to 1e-3.

        converged : True once converged, False otherwise.

        Weights : array of shape (1,  n_components)
            weights for each mixture component.

        Means : array of shape (n_components, n_dim)
            Mean parameters for each mixture component.

        Covars : array of shape (n_components, n_dim)
            Covariance parameters for each mixture component

        """
        sc = data.context
        covariance_type = 'diag'
        converged = False
        self.min_covar = 1e-3

        #  observation statistics
        self.s0 = 0
        self.s1 = 0
        #  To get the no of data points
        n_points = data.count()
        #  To get the no of dimensions
        n_dim = data.first().size

        if (n_points == 0):
            raise ValueError('Dataset cannot be empty')
        if (n_points < n_components):
            raise ValueError(
                'Not possible to make (%s) components from (%s) datapoints' %
                (n_components, n_points))

        # Initialize Covars(diagonal covariance matrix)
        if hasattr(data.first(), 'indices'):
            self.isSparse = 1

            def convert_to_kvPair(eachV):
                g = []
                for i in range(eachV.indices.size):
                    g.append(
                        (eachV.indices[i],
                         (eachV.values[i], eachV.values[i] * eachV.values[i])))
                return g

            def computeVariance(x):
                mean = x[1][0] / n_points
                sumSq = x[1][1] / n_points
                return x[0], sumSq - mean * mean

            cov = []
            kvPair = data.flatMap(convert_to_kvPair)
            res = kvPair.reduceByKey(np.add).map(computeVariance)
            cov = Vectors.sparse(n_dim, res.collectAsMap()).toArray() + 1e-3
            self.Covars = np.tile(cov, (n_components, 1))

        else:
            self.isSparse = 0
            cov = []
            for i in range(n_dim):
                cov.append(
                    data.map(lambda m: m[i]).variance() + self.min_covar)
            self.Covars = np.tile(cov, (n_components, 1))

        # Initialize Means using MLlib KMeans
        self.Means = np.array(KMeans().train(data,
                                             n_components).clusterCenters)
        # Initialize Weights with the value 1/n_components for each component
        self.Weights = np.tile(1.0 / n_components, n_components)
        #  EM algorithm
        # loop until number of iterations  or convergence criteria is satisfied
        for i in range(n_iter):

            logging.info("GMM running iteration %s " % i)
            # broadcasting means,covars and weights
            self.meansBc = sc.broadcast(self.Means)
            self.covarBc = sc.broadcast(self.Covars)
            self.weightBc = sc.broadcast(self.Weights)
            # Expectation Step
            EstepOut = data.map(self.scoreOnePoint)
            # Maximization step
            MstepIn = EstepOut.reduce(
                lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]))
            self.s0 = self.s1
            self.mStep(MstepIn[0], MstepIn[1], MstepIn[2], MstepIn[3])

            #  Check for convergence.
            if i > 0 and abs(self.s1 - self.s0) < ct:
                converged = True
                logging.info("Converged at iteration %s" % i)
                break

        return self
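
A hypothetical usage sketch for the fit method above. The class name GMM, the SparkContext sc, and the tiny RDD of points are illustrative assumptions, not part of the original code:

from pyspark.mllib.linalg import Vectors

# Illustrative only: a tiny dense dataset as an RDD of mllib vectors.
points = sc.parallelize([Vectors.dense([1.0, 2.0]),
                         Vectors.dense([1.5, 1.8]),
                         Vectors.dense([8.0, 8.0]),
                         Vectors.dense([8.2, 7.9])])

# GMM is a placeholder for whatever class defines fit() above.
model = GMM().fit(points, n_components=2, n_iter=100, ct=1e-3)
print(model.Weights)
print(model.Means)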
Example #5
def costs_movies(cluster, train, test):
    for c in cluster:
        m = KMeans().train(train, k=c, maxIterations=10, runs=3)
        wssse = m.computeCost(test)
        print("WSSSE for k=" + str(c) + ": " + str(wssse))
Example #6
# Fit the scaler on the assembled feature vectors and apply the scaling transformation
scaler_model = scaler.fit(final_df)
final_df = scaler_model.transform(final_df)
final_df.show(6)

# In[28]:

import numpy as np
import matplotlib.pyplot as plt
from time import time

cost = np.zeros(20)
for k in range(2, 20):
    start = time()
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(final_df.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(final_df)
    end = time()
    print("K means from spark took {:.4f} seconds(k = {:.4f})".format(
        end - start, k))

# In[8]:

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')

# In[39]:
Example #7
row_ratings.cache()

#
als_model = ALS.train(row_ratings, 50, 10, 0.1)
movie_factors = als_model.productFeatures().map(
    lambda id_factor: (id_factor[0], Vectors.dense(id_factor[1])))
movie_vectors = movie_factors.map(lambda id_vector: id_vector[1])
#print(movie_vectors.first())
user_factors = als_model.userFeatures().map(
    lambda id_factor: (id_factor[0], Vectors.dense(id_factor[1])))
user_vectors = user_factors.map(lambda id_vector: id_vector[1])
#print(user_vectors.first())

# train
movie_cluster_model = KMeans().train(movie_vectors,
                                     k=5,
                                     maxIterations=10,
                                     runs=3)
print("movie cluster model kmeans :")
print(movie_cluster_model)
user_cluster_model = KMeans().train(user_vectors,
                                    k=5,
                                    maxIterations=10,
                                    runs=3)
print("user cluster model kmeans :")
print(user_cluster_model)

# predict
movie_1 = movie_vectors.first()
movie_cluster = movie_cluster_model.predict(movie_1)
print(movie_cluster)
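
A possible follow-up using the same RDDs: mllib's KMeansModel.predict also accepts an RDD, so cluster sizes can be inspected in one pass (this step is not in the original example):

movie_assignments = movie_cluster_model.predict(movie_vectors)
print(movie_assignments.countByValue())
user_assignments = user_cluster_model.predict(user_vectors)
print(user_assignments.countByValue())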
Example #8
    def K_means(self, data):
        cluster_data = self.sc.parallelize(data)
        trains = KMeans().train(cluster_data, self.k, self.iteration,
                                self.runs)
        results = trains.predict(cluster_data).collect()
        return results
Example #9
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# pyspark.ml KMeans (this import shadows the pyspark.mllib KMeans imported above)
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2, seed=1)


def mapper(line):
    # Keep the first 14 fields so they line up with the _1.._14 aliases below.
    return tuple(line[:14])


weather_features = latlongagain.map(mapper)

weather_features_df = weather_features.toDF()
weather_df = weather_features_df.selectExpr(
    "_1 as datetime1", "_2 as day", "_3 as month", "_4 as lat", "_5 as lng",
    "_6 as base", "_7 as humidity", "_8 as wind", "_9 as temp", "_10 as desc",
    "_11 as rain", "_12 as latlng", "_13 as borough", "_14 as features")

test1 = weather_df.withColumn("features", udf_foo("features"))
test1.printSchema()

model = kmeans.fit(test1.select('features'))
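
udf_foo is used above but never defined. One plausible definition, assuming the raw features field is a sequence of numbers that must be converted into an ml Vector before kmeans.fit can consume it:

from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

# Hypothetical: turn the raw "features" field into a dense ml Vector.
udf_foo = udf(lambda feats: Vectors.dense([float(f) for f in feats]), VectorUDT())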
Example #10
def kmeans_label(mat, scoring=False):
    kmeans_model = KMeans(n_clusters=NUMBER_CLUSTERS, random_state=1).fit(mat)
    labels = kmeans_model.labels_
    if scoring:
        # Score the same matrix the labels were computed from
        print("kmeans score:", silhouette_score(mat, labels, metric='euclidean'))
    return labels
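
The function relies on names defined elsewhere in its module. A minimal assumed setup plus an example call (the constant value and the random matrix are illustrative only):

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

NUMBER_CLUSTERS = 5                       # assumed module-level constant
feature_mat = np.random.rand(100, 20)     # stand-in for the real feature matrix
labels = kmeans_label(feature_mat, scoring=True)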
Example #11
# print(first_part[:10])
first_part = sc.parallelize(first_part).repartition(5).persist(
    pyspark.StorageLevel.DISK_ONLY)

# Add the IDs from the first sample to data_used and remove those points from originalRDD
data_used = first_part.map(lambda line: int(line[0])).collect()
data_used = set(data_used)
originalRDD = originalRDD.filter(lambda line: int(line[0]) not in data_used)

# Train a k-means model with 5 * the requested number of clusters,
# then predict a cluster for every point in the first part.
train_data = first_part.map(lambda line: array(line[2:]))
train_data = np.array(train_data.collect())
kmeans = KMeans(n_clusters=input_clusters * 5, random_state=0).fit(train_data)
print(kmeans.labels_)
results = first_part.map(lambda line: (kmeans.predict([line[
    2:]]), [int(line[0])])).map(lambda line: (line[0].tolist()[0], line[1])
                                ).reduceByKey(lambda a, b: a + b).persist(
                                    pyspark.StorageLevel.DISK_ONLY)

# Separate the clusters that contain only one point and add them to the retained set
RetainedSetRDD = results.filter(lambda line: len(line[1]) == 1).map(
    lambda line: line[1][0])
retained_set.update(set(RetainedSetRDD.collect()))
# print(retained_set)

# Run k-means on the candidates for the Discard Set
remaining = results.filter(lambda line: len(line[1]) > 1).flatMap(
    lambda line: line[1])