Example #1
def create_kmeans_dendogram(input_csv, num_clusters):
    spark = SparkSession.builder.appName(
        'HotelsPriceDataGeneratorSession').getOrCreate()

    # Lazy operation - load the data by reading the CSV
    # Note: if the CSV does not match the expected schema, the result is a fully null DataFrame
    print('Reading CSV from ' + input_csv)
    generated_hotels_df = spark.read.csv(input_csv,
                                         header=True,
                                         inferSchema=True)

    # Limit the clusters to num cols
    num_clusters = min(num_clusters, len(generated_hotels_df.columns[1:]))

    # Assemble the features vector column
    vecAssembler = VectorAssembler(inputCols=generated_hotels_df.columns[1:],
                                   outputCol="features")
    vector_df = vecAssembler.transform(generated_hotels_df)

    # Run BisectingKMeans to find hierarchical clusters
    kmeans = BisectingKMeans().setK(num_clusters).setSeed(42)
    model = kmeans.fit(vector_df)

    # Link it to find relations between the clusters
    z = hc.linkage(model.clusterCenters(),
                   method='average',
                   metric='correlation')

    # Plot the dendrogram
    hc.dendrogram(z)
    plt.show()
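A minimal sketch of the imports the function above relies on, plus a hypothetical invocation; the CSV path and cluster count are placeholders, not taken from the original source:

# Imports assumed by create_kmeans_dendogram (not shown in the snippet above)
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hc
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import BisectingKMeans

# Hypothetical call: input path and cluster count are illustrative only
create_kmeans_dendogram('generated_hotels.csv', num_clusters=5)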
Example #2
 def __find_cluster_split_kmeans_sparkdf(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
     from pyspark.ml.clustering import KMeans
     start_time = time.time()
     #convert to spark df
     sqlContext = SQLContext(sc)
     spark_df = sqlContext.createDataFrame(df_norm)
     #assemble vector
     vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
     spark_df_clustering = vecAssembler.transform(spark_df).select('features')
     n_components_list = []
     n_range = np.arange(2, 20)
     for iteration in np.arange(n_iterations):
         cost = []
         for k in n_range:
             if kmeans_method == 'kmeans':
                 print("Kmeans Elbow Method K = ", k)
                 kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
                 model = kmeans.fit(spark_df_clustering)
             elif kmeans_method == 'bisecting_kmeans':
                 print("Bisecting Kmeans Elbow Method K = ", k)
                 bkm = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
                 model = bkm.fit(spark_df_clustering)
             cost.append(model.computeCost(spark_df_clustering)) # requires Spark 2.0 or later
         print('Cluster List: ', n_range)
         print('Within Set Sum of Squared Errors: ', cost)
         n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing', 'sum_of_square_error')
         print("Recommended no. of components by knee locator: " + str(n_split_knee))
         n_components_list.append(n_split_knee)
     n_components = int(np.median(n_components_list).round(0))
     print('Recommended median number of splits: ', n_components)
     print("elbow method time: ", time.time()-start_time, "(sec)")
     return n_components
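The private `__knee_locator` helper is not shown above. As an assumption only (not the original implementation), it could be a thin wrapper around the kneed package's KneeLocator, whose curve and direction arguments match the strings passed in:

# Hypothetical sketch of the missing __knee_locator helper, assuming the third-party kneed package
from kneed import KneeLocator

def knee_locator(x, y, curve, direction, label):
    # Locate the elbow/knee of the cost curve and return the k at that point
    kl = KneeLocator(list(x), list(y), curve=curve, direction=direction)
    print(label + ' knee located at k = ' + str(kl.knee))
    return kl.knee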
Example #3
def getTopClusters(startDate, endDate, startTime, endTime, category):
    filteredDF = applyFilter(startDate, endDate, startTime, endTime, category).cache()

    # Extract X, Y into feature vector
    vectorizer = VectorAssembler()
    vectorizer.setInputCols(["X", "Y"])
    vectorizer.setOutputCol("features")
    pointsDF = vectorizer.transform(filteredDF).cache()

    # Hierarchical K means
    bkm = BisectingKMeans().setK(10).setSeed(7).setMaxIter(7)
    model = bkm.fit(pointsDF)

    # RDD of (clusterIndex, size)
    clustersRDD = (model.transform(pointsDF)
                   .select("prediction").rdd
                   .map(lambda row: (row["prediction"], 1))
                   .reduceByKey(lambda a, c: a + c))

    clusters = model.clusterCenters()
    clusterRV = clustersRDD.collect()

    rv = []
    for ind, num in clusterRV:
        val = {"c": (clusters[ind][0], clusters[ind][1]), "o": num}
        rv.append(val)

    return rv
Example #4
 def get_clusters(self, parameters: dict, urls_and_vectors: DataFrame) -> DataFrame:
     urls_and_vectors = urls_and_vectors.cache()
     bisecting_kmeans = BisectingKMeans().setK(parameters['k']).setDistanceMeasure(
         parameters['distance_measure']).setFeaturesCol("vector").setPredictionCol("cluster_id")
     model = bisecting_kmeans.fit(urls_and_vectors)
     clustered_url_vectors = model.transform(urls_and_vectors)
     urls_and_vectors.unpersist()
     return clustered_url_vectors
Example #5
def clustering(df_kmeans, n):
    kmeans = BisectingKMeans().setK(n).setSeed(1).setFeaturesCol("features")
    print('kmeans ', kmeans)
    model = kmeans.fit(df_kmeans)

    centers = model.clusterCenters()

    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example #6
    def bisecting_k_means(self, k):
        print('\nBisecting K-Means - ' + str(k))
        kmeans = BisectingKMeans().setK(k).setSeed(1)
        model = kmeans.fit(self.df.select('features'))

        transformed = model.transform(self.df)
        transformed.groupBy("prediction").count().show()

        centers = model.clusterCenters()
        self.print_centers(centers)
Example #7
def bisect_model(data):
    # TODO: grid-search the best parameters (see the sketch after this example)
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(data)
    cost = model.computeCost(data)
    log.info("Within Set Sum of Squared Errors = " + str(cost))
    log.info("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        log.info(center)
    predictions_bi = model.transform(data)

    return predictions_bi
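A hedged sketch of what the TODO above might look like: sweep k and keep the BisectingKMeans model with the best silhouette score (the k range and evaluator choice are assumptions, not from the original source):

# Hypothetical grid search over k using the silhouette score
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def grid_search_k(data, k_values=range(2, 10)):
    evaluator = ClusteringEvaluator()  # silhouette with squared euclidean distance by default
    best_k, best_score, best_model = None, float('-inf'), None
    for k in k_values:
        model = BisectingKMeans().setK(k).setSeed(1).fit(data)
        score = evaluator.evaluate(model.transform(data))
        if score > best_score:
            best_k, best_score, best_model = k, score, model
    return best_k, best_model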
Example #8
def model_list():
    clist = []
    df2 = df1.select('features')
    df2.cache()
    df1.cache()
    for i in range(2,20):
        kmeans = BisectingKMeans(k=i, minDivisibleClusterSize=1.0)
        model = kmeans.fit(df2)
        WSSSE = model.computeCost(df1)
        #print("Within Set Sum of Squared Error, k = " + str(i) + ": " +str(WSSSE))
        clist.append({i: WSSSE, 'model': model})
    df1.unpersist()
    df2.unpersist()
    return clist
Example #9
 def test_bisecting_kmeans_summary(self):
     data = [(Vectors.dense(1.0), ), (Vectors.dense(5.0), ),
             (Vectors.dense(10.0), ), (Vectors.sparse(1, [], []), )]
     df = self.spark.createDataFrame(data, ["features"])
     bkm = BisectingKMeans(k=2)
     model = bkm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 20)
Example #10
 def __bisecting_k_mean(cls, k_clusters, xnorm, feature_col, sc):
     #k_clusters = elbow point
     start_time = time.time()
     #convert to spark df
     sqlContext = SQLContext(sc)
     df_norm = pd.DataFrame(data = xnorm, columns = feature_col)
     spark_df = sqlContext.createDataFrame(df_norm)
     #assemble vector
     vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
     spark_df_clustering = vecAssembler.transform(spark_df).select('features')
     bkm = BisectingKMeans().setK(k_clusters).setSeed(1).setFeaturesCol("features")
     model = bkm.fit(spark_df_clustering)
     prediction = model.transform(spark_df_clustering).select('prediction').collect()
     labels = [p.prediction for p in prediction]
     return labels
Example #11
 def test_bisecting_kmeans_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     bkm = BisectingKMeans(k=2)
     model = bkm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 20)
Example #12
def main(sc, spark):
    # Load the Corpus
    corpus = load_corpus(sc, spark)

    # Create the vector/cluster pipeline
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens",
                 outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])

    # Fit the model
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)

    # Evaluate clustering.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes

    # TODO: compute cost of each cluster individually

    # Get the text representation of each cluster.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for ci, c in enumerate(bkm.clusterCenters()):
        ct = wvec.findSynonyms(c, 7)
        size = sizes[ci]
        terms = " ".join([row.word for row in ct.take(7)])
        table.append([ci, size, terms])

    # Print Results
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
Example #13
def main():
    parser = argparse.ArgumentParser(description='Clustering with pyspark.')

    parser.add_argument('--data-file', type=str, default='enwiki.json')
    parser.add_argument('--num-clusters', type=int, default=4)
    parser.add_argument('--seed', type=int, default=23)
    parser.add_argument('--algorithm',
                        default='kmeans',
                        choices=['kmeans', 'hier', 'gmm'])
    parser.add_argument('--output-groundtruth',
                        type=str,
                        default='groundtruth.csv')
    parser.add_argument('--output-cluster', type=str, default='cluster.csv')

    args = parser.parse_args()

    spark_session = SparkSession.builder.appName('clustering').getOrCreate()

    data = preprocess(spark_session, args.data_file)

    if args.algorithm == 'kmeans':
        alg = KMeans()
    elif args.algorithm == 'hier':
        alg = BisectingKMeans()
    elif args.algorithm == 'gmm':
        alg = GaussianMixture()

    model = train(alg, data, args.num_clusters, seed=args.seed)
    evaluate(data, model, args.algorithm, args.num_clusters,
             args.output_groundtruth, args.output_cluster)
Example #14
def search_opt_k(df_kmeans):
    # Trains a k-means model.
    df_kmeans.show()
    # find the optimal k using the elbow method
    cost = np.zeros(20)
    for k in range(2, 20):
        kmeans = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
        print('kmeans ', kmeans)
        model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
        cost[k] = model.computeCost(df_kmeans)  # requires Spark 2.0 or later
    # print(cost)
    # visualize the elbow
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.show()
Example #15
def train(df, hiperparameter):
    '''
    Bisecting k-means training, returning the fitted model.
    input: - DataFrame
           - config (hyperparameter configuration)

    return: bisecting k-means model
    '''
    bs_kmeans = BisectingKMeans(
        featuresCol=hiperparameter['featuresCol'],
        predictionCol=hiperparameter['predictionCol'],
        maxIter=hiperparameter['maxIter'],
        seed=hiperparameter['seed'],
        k=hiperparameter['k'],
        minDivisibleClusterSize=hiperparameter['minDivisibleClusterSize'])
    model = bs_kmeans.fit(df)
    return model
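A hypothetical configuration dict and call matching the keys the function reads; the values are illustrative only:

# Hypothetical hyperparameter configuration; values are placeholders, not from the original source
hiperparameter = {
    'featuresCol': 'features',
    'predictionCol': 'prediction',
    'maxIter': 20,
    'seed': 1,
    'k': 4,
    'minDivisibleClusterSize': 1.0,
}
model = train(df, hiperparameter)  # df is an assumed DataFrame with a 'features' vector column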
Example #16
def main(argv):

    spark = SparkSession.builder \
        .appName('VIDEO_CLUSTERING') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')
    spark.conf.set('spark.executor.cores', '4')

    video_type_code = handle_params(argv)

    video_df = spark.read.format('jdbc')\
        .option('url', 'jdbc:mysql://192.168.174.133:3306/big_data')\
        .option('driver', 'com.mysql.cj.jdbc.Driver')\
        .option('dbtable', 'VIDEO_STATISTIC')\
        .option('user', 'root').option('password', 'root').load()

    assembler = VectorAssembler()\
        .setInputCols(['play_count',
                        'favorite_count',
                        'comment_count',
                        'barrage_count'])\
        .setOutputCol('features')

    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ).limit(1000))

    bkm = BisectingKMeans(k=8, minDivisibleClusterSize=1.0)
    model = bkm.fit(video_vector)
    centers = model.clusterCenters()

    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ))

    transformed = model.transform(video_vector).select('features', 'prediction')


    transformed.show()
Example #17
def bisecting_kmeans(features, num_clusters):
    """Does clustering on the features dataset using Bisecting KMeans clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted clusters in a 'cluster' column
    """
    kmeans = BisectingKMeans(k=num_clusters,
                             featuresCol='features',
                             predictionCol='cluster')
    kmeans_model = kmeans.fit(features)
    clustered = kmeans_model.transform(features)
    clustered.show()
    print("=====Clustering Results=====")
    print("Clustering cost = ", kmeans_model.computeCost(features))
    print("Cluster sizes = ", kmeans_model.summary.clusterSizes)
    return clustered
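A hedged usage sketch for the function above; the input DataFrame, column names, and cluster count are assumptions:

# Hypothetical usage: assemble a 'features' vector column first, then cluster it
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['x', 'y'], outputCol='features')  # column names are placeholders
features_df = assembler.transform(raw_df)                                # raw_df is an assumed input DataFrame
clustered_df = bisecting_kmeans(features_df, num_clusters=5)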
Example #18
def bisecting_k_means():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
            (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
    df = spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
    model = bkm.fit(df)
    centers = model.clusterCenters()
    len(centers)
    model.computeCost(df)
    model.hasSummary
    summary = model.summary
    summary.k
    summary.clusterSizes
    # predict
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction
    rows[2].prediction == rows[3].prediction
Example #19
def main(args):
    spark=SparkSession\
            .builder\
            .master(args[2])\
            .appName(args[1])\
            .getOrCreate()

    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])

    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    appendTime(sys.argv, start_computing_time)

    spark.stop()
Example #20
def compute_clusters(addons_df, num_clusters, random_seed):
    """Performs user clustering by using add-on ids as features."""

    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids",
                              outputCol="hashed_features")
    idf_stage = IDF(inputCol="hashed_features",
                    outputCol="features",
                    minDocFreq=1)
    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    kmeans_kwargs = {"seed": random_seed} if random_seed else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return model.transform(addons_df).select(["client_id", "prediction"])
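A hedged usage sketch; the client ids and add-on lists below are made-up placeholders, and `spark` is an assumed active SparkSession:

# Hypothetical input: 'addon_ids' must be an array-of-strings column, 'client_id' a plain identifier
addons_df = spark.createDataFrame(
    [("client-1", ["addon-a", "addon-b"]),
     ("client-2", ["addon-a"]),
     ("client-3", ["addon-c"])],
    ["client_id", "addon_ids"])
clusters = compute_clusters(addons_df, num_clusters=2, random_seed=42)
clusters.show()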
Example #21
def return_correct_clustering_algorithm(_type, _cluster_number, _max_iter):
    """
	This method returns an instance of the clustering algorithm
	selected by the user.
	:param _type: the name of the algorithm we want to use.
	:param _cluster_number: the number of clusters.
	:param _max_iter: the maximum number of iterations.
	:return: a _type instance, or raises an exception if _type is not valid.
	"""

    cluster_number = int(_cluster_number) if _cluster_number else 10
    max_iter = int(_max_iter) if _max_iter else 20

    if _type == "kmeans":
        return KMeans().setK(cluster_number).setMaxIter(max_iter).setSeed(1)
    elif _type == "b-kmeans":
        return BisectingKMeans().setK(cluster_number).setMaxIter(
            max_iter).setSeed(1)
    else:
        raise Exception(
            "The clustering algorithm requested {} is not available".format(
                _type))
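A hedged usage sketch for the factory above; `features_df` is an assumed DataFrame with a 'features' vector column:

# Hypothetical usage: build the estimator, then fit it like any pyspark.ml clustering estimator
algorithm = return_correct_clustering_algorithm("b-kmeans", "5", "30")
model = algorithm.fit(features_df)
predictions = model.transform(features_df)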
Example #22
cluster_centers = kmeans_model.clusterCenters()
print(cluster_centers)

# COMMAND ----------

# MAGIC %md ##### Hierarchical Clustering via Bisecting K-means

# COMMAND ----------

from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

retail_features = spark.read.table("retail_features")
train_df = retail_features.selectExpr("selected_features as features")

bkmeans = BisectingKMeans(k=3, featuresCol='features')
bkmeans_model = bkmeans.fit(train_df)

predictions = bkmeans_model.transform(train_df)

evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette measure using squared euclidean distance = " +
      str(silhouette))

cluster_centers = bkmeans_model.clusterCenters()
print(cluster_centers)

# COMMAND ----------
  bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load(
        "data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
"""
A simple example demonstrating a bisecting k-means clustering.
"""

if __name__ == "__main__":

    sc = SparkContext(appName="PythonBisectingKMeansExample")
    sqlContext = SQLContext(sc)

    # $example on$
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsed = data.map(lambda l: Row(features=Vectors.dense([float(x) for x in l.split(' ')])))
    training = sqlContext.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")

    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    sc.stop()
from pyspark.ml.clustering import BisectingKMeans
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
Example #26
          ('HYP_TENS_GEST', typ.IntegerType()),
          ('PREV_BIRTH_PRETERM', typ.IntegerType())]

births_transformed = "file:///home/yuty/yangzz/births_transformed.csv"
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
births = spark.read.csv(births_transformed, header=True, schema=schema)
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[1:]],
    outputCol='features').transform(births).select('features').collect()

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import BisectingKMeans

data = [(Vectors.dense([10, 10]), ), (Vectors.dense([3.0, 5.0]), ),
        (Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
        (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
df = spark.createDataFrame(data, ["features"])
bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
model = bkm.fit(df)
centers = model.clusterCenters()
len(centers)
model.computeCost(df)
model.hasSummary
summary = model.summary
summary.k
summary.clusterSizes

transformed = model.transform(df).select("features", "prediction")
rows = transformed.collect()
rows[0].prediction
Example #27
A simple example demonstrating a bisecting k-means clustering.
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
    parsed = data\
        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
    training = spark.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")

    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------

from pyspark.ml.clustering import BisectingKMeans
bkm = BisectingKMeans().setK(5).setMaxIter(5)
bkmModel = bkm.fit(sales)


# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points
bkmModel.computeCost(sales)
centers = bkmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------
Example #29
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

cluster_df = spark.read.csv('clustering_dataset.csv',
                            header=True,
                            inferSchema=True)
cluster_df.show()

vectorAssembler = VectorAssembler(inputCols=['col1', 'col2', 'col3'],
                                  outputCol='features')
vcluster_df = vectorAssembler.transform(cluster_df)

vcluster_df.show()

kmeans = KMeans().setK(3)
kmeans = kmeans.setSeed(1)
kmodel = kmeans.fit(vcluster_df)

centers = kmodel.clusterCenters()

# hierarchical clustering
vcluster_df.show()

from pyspark.ml.clustering import BisectingKMeans
bkmeans = BisectingKMeans().setK(3)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(vcluster_df)

bkcenters = bkmodel.clusterCenters()
times = []
for i in range(1, 5):
    start = time.time()
    kmeans = KMeans(k=8,
                    seed=int(np.random.randint(100, size=1)),
                    initMode="k-means||")
    modelKmeans = kmeans.fit(tsneDataFrame.select("features"))
    predictions = modelKmeans.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
kmeansTime = average(times)

###########      BISECTING K-MEANS       ################
from pyspark.ml.clustering import BisectingKMeans
times = []
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)

##############       GMM      #################
from pyspark.ml.clustering import GaussianMixture
times = []
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
kmModel = km.fit(sales)

# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import BisectingKMeans
bkm = BisectingKMeans().setK(5).setMaxIter(5)
bkmModel = bkm.fit(sales)

# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points
bkmModel.computeCost(sales)
centers = bkmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
kmeansScores = []
for k in range(4, 8):
    kmeans = KMeans().setK(k).setSeed(216)
    model = kmeans.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    kmeansScores.append(score)

plt.plot(range(4, 8), kmeansScores, 'ro')
plt.savefig('kmeansScores.pdf')

bisectScores = []
for k in range(4, 8):
    bisection = BisectingKMeans().setK(k).setSeed(216)
    model = bisection.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    bisectScores.append(score)

plt.plot(range(4, 8), bisectScores, 'g^')
plt.savefig('bisectScores.pdf')
plt.clf()

kmeansK = np.argmax(kmeansScores) + 4
bisectK = np.argmax(bisectScores) + 4

evaluator = ClusteringEvaluator()
kmeans = KMeans().setK(kmeansK).setSeed(216)
Example #33
sqlContext = SQLContext(sc)

# Loading required packages
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Reading the data set
cluster_df = spark.read.csv("./Exercise_Files/Ch03/03_02/clustering_dataset.csv", header = True, inferSchema = True)

# Examining the data frame
print(cluster_df.schema)
cluster_df.printSchema()
print(cluster_df.columns)

# VectorAssembler for transformation
vectorAssembler = VectorAssembler(inputCols = ["col1", "col2", "col3"], outputCol = "features")
vectorized_cluster_df = vectorAssembler.transform(cluster_df)
print(vectorized_cluster_df.select(["features"]).show())
print(vectorized_cluster_df.take(1))

# K-means clustering - not working for some reason!
# kmeans = KMeans().setK(3).setSeed(1)
# kmeans_model = KMeans.fit(vectorized_cluster_df.select("features"), 3, maxIterations = 10, initializationMode = "random")
# km_centers = kmeans_model.clusterCenters()

# Hierarchical clustering (Bisecting K-means) - not working for same reason as KMeans
from pyspark.ml.clustering import BisectingKMeans
bkmeans = BisectingKMeans().setK(3).setSeed(1)
# bk_model = bkmeans.fit(vectorized_cluster_df)
# bk_centers = bk_model.fit(vectorized_cluster_df)
dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
wssse = []
for k in kValues:
    kmeans = KMeans().setK(k).setSeed(122)
    model = kmeans.fit(dataset)
    wssse.append(model.computeCost(dataset))
for i in wssse:
    print(i)

# In[29]:

from pyspark.ml.clustering import BisectingKMeans
# Trains a bisecting k-means model.
bkm = BisectingKMeans().setK(2).setSeed(1222)
model = bkm.fit(outputFeatureDf)
# Evaluate clustering.
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))
# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

# In[30]:

from sklearn.metrics.cluster import completeness_score
transformed = model.transform(dataset)
labels = labeldf.collect()