Example 1
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import BisectingKMeans
import scipy.cluster.hierarchy as hc
import matplotlib.pyplot as plt


def create_kmeans_dendrogram(input_csv, num_clusters):
    spark = SparkSession.builder.appName(
        'HotelsPriceDataGeneratorSession').getOrCreate()

    # Lazy op: read the CSV (schema inferred from the data).
    # Note: if a schema were predefined instead, a CSV that does not match it
    # would yield a fully null DataFrame.
    print('Reading CSV from ' + input_csv)
    generated_hotels_df = spark.read.csv(input_csv,
                                         header=True,
                                         inferSchema=True)

    # Cap the number of clusters at the number of feature columns
    num_clusters = min(num_clusters, len(generated_hotels_df.columns[1:]))

    # Assemble the features vector column
    vecAssembler = VectorAssembler(inputCols=generated_hotels_df.columns[1:],
                                   outputCol="features")
    vector_df = vecAssembler.transform(generated_hotels_df)

    # Run BisectingKMeans to find hierarchical clusters
    kmeans = BisectingKMeans().setK(num_clusters).setSeed(42)
    model = kmeans.fit(vector_df)

    # Link it to find relations between the clusters
    z = hc.linkage(model.clusterCenters(),
                   method='average',
                   metric='correlation')

    # Plot the dendrogram
    hc.dendrogram(z)
    plt.show()
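A minimal usage sketch for the function above; the CSV path and cluster count are hypothetical:

# Hypothetical input file and cluster count, for illustration only.
create_kmeans_dendrogram('hotel_prices.csv', num_clusters=8)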
Example 2
def getTopClusters(startDate, endDate, startTime, endTime, category):
    filteredDF = applyFilter(startDate, endDate, startTime, endTime, category).cache()

    # Extract X, Y into feature vector
    vectorizer = VectorAssembler()
    vectorizer.setInputCols(["X", "Y"])
    vectorizer.setOutputCol("features")
    pointsDF = vectorizer.transform(filteredDF).cache()

    # Hierarchical K means
    bkm = BisectingKMeans().setK(10).setSeed(7).setMaxIter(7)
    model = bkm.fit(pointsDF)

    # RDD of (clusterIndex, size)
    clustersRDD = (model.transform(pointsDF)
                   .select("prediction").rdd
                   .map(lambda row: (row["prediction"], 1))
                   .reduceByKey(lambda a, c: a + c))

    clusters = model.clusterCenters()
    clusterRV = clustersRDD.collect()

    rv = []
    for ind, num in clusterRV:
        val = {"c": (clusters[ind][0], clusters[ind][1]), "o": num}
        rv.append(val)

    return rv
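The per-cluster sizes can also be read without dropping to the RDD API; a short sketch assuming the same model and pointsDF as above:

# Cluster sizes via the DataFrame API...
model.transform(pointsDF).groupBy("prediction").count().show()
# ...or straight from the training summary.
print(model.summary.clusterSizes)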
Example 3
 def __find_cluster_split_kmeans_sparkdf(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
     from pyspark.ml.clustering import KMeans
     start_time = time.time()
     #convert to spark df
     sqlContext = SQLContext(sc)
     spark_df = sqlContext.createDataFrame(df_norm)
     #assemble vector
     vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
     spark_df_clustering = vecAssembler.transform(spark_df).select('features')
     n_components_list = []
     n_range = np.arange(2, 20)
     for iteration in np.arange(n_iterations):
         cost = []
         for k in n_range:
             if kmeans_method == 'kmeans':
                 print("Kmeans Elbow Method K = ", k)
                 kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
                 model = kmeans.fit(spark_df_clustering)
             elif kmeans_method == 'bisecting_kmeans':
                 print("Bisecting Kmeans Elbow Method K = ", k)
                 bkm = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
                 model = bkm.fit(spark_df_clustering)
             cost.append(model.computeCost(spark_df_clustering)) # requires Spark 2.0 or later
         print('Cluster List: ', n_range)
         print('Within Set Sum of Squared Errors: ', cost)
         n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing', 'sum_of_square_error')
         print("Recommended no. of components by knee locator: " + str(n_split_knee))
         n_components_list.append(n_split_knee)
     n_components = int(np.median(n_components_list).round(0))
     print('Recommended median number of splits: ', n_components)
     print("elbow method time: ", time.time()-start_time, "(sec)")
     return n_components
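The private __knee_locator helper is not shown; it presumably wraps an off-the-shelf knee detector such as the kneed package's KneeLocator. A sketch under that assumption (the final argument is treated as a label only):

from kneed import KneeLocator

def knee_locator(x, y, curve, direction, name):
    # For a falling WSSSE curve the arguments are curve='convex',
    # direction='decreasing'; 'name' is only a label for logging.
    kl = KneeLocator(list(x), list(y), curve=curve, direction=direction)
    return kl.knee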
Example 4
 def get_clusters(self, parameters: dict, urls_and_vectors: DataFrame) -> DataFrame:
     urls_and_vectors = urls_and_vectors.cache()
     bisecting_kmeans = BisectingKMeans().setK(parameters['k']).setDistanceMeasure(
         parameters['distance_measure']).setFeaturesCol("vector").setPredictionCol("cluster_id")
     model = bisecting_kmeans.fit(urls_and_vectors)
     clustered_url_vectors = model.transform(urls_and_vectors)
     urls_and_vectors.unpersist()
     return clustered_url_vectors
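A usage sketch; the dict keys mirror what get_clusters reads, and Spark accepts 'euclidean' or 'cosine' as the distance measure (the values and the clusterer instance are illustrative):

parameters = {'k': 10, 'distance_measure': 'euclidean'}
# 'clusterer' is an instance of the class defining get_clusters (hypothetical).
clustered = clusterer.get_clusters(parameters, urls_and_vectors)
clustered.groupBy('cluster_id').count().show()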
Example 5
def clustering(df_kmeans, n):
    kmeans = BisectingKMeans().setK(n).setSeed(1).setFeaturesCol("features")
    print('kmeans ', kmeans)
    model = kmeans.fit(df_kmeans)

    centers = model.clusterCenters()

    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example 6
    def bisecting_k_means(self, k):
        print('\nBisecting K-Means - ' + str(k))
        kmeans = BisectingKMeans().setK(k).setSeed(1)
        model = kmeans.fit(self.df.select('features'))

        transformed = model.transform(self.df)
        transformed.groupBy("prediction").count().show()

        centers = model.clusterCenters()
        self.print_centers(centers)
Example 7
def bisect_model(data):
    # TODO: grid-search the best parameters
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(data)
    cost = model.computeCost(data)
    log.info("Within Set Sum of Squared Errors = " + str(cost))
    log.info("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        log.info(center)
    predictions_bi = model.transform(data)

    return predictions_bi
Example 8
def model_list():
    clist = []
    df2 = df1.select('features')
    df2.cache()
    df1.cache()
    for i in range(2,20):
        kmeans = BisectingKMeans(k=i, minDivisibleClusterSize=1.0)
        model = kmeans.fit(df2)
        WSSSE = model.computeCost(df1)
        #print("Within Set Sum of Squared Error, k = " + str(i) + ": " +str(WSSSE))
        clist.append({i: WSSSE, 'model': model})
    df1.unpersist()
    df2.unpersist()
    return clist
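A sketch of consuming model_list()'s return value; each entry mixes a k -> WSSSE pair with a 'model' key:

for entry in model_list():
    model = entry.pop('model')
    (k, wssse), = entry.items()  # the single remaining pair is k -> WSSSE
    print(k, wssse)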
Example 9
 def test_bisecting_kmeans_summary(self):
     data = [(Vectors.dense(1.0), ), (Vectors.dense(5.0), ),
             (Vectors.dense(10.0), ), (Vectors.sparse(1, [], []), )]
     df = self.spark.createDataFrame(data, ["features"])
     bkm = BisectingKMeans(k=2)
     model = bkm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 20)
Example 10
 def __bisecting_k_mean(cls, k_clusters, xnorm, feature_col, sc):
     #k_clusters = elbow point
     start_time = time.time()
     #convert to spark df
     sqlContext = SQLContext(sc)
     df_norm = pd.DataFrame(data = xnorm, columns = feature_col)
     spark_df = sqlContext.createDataFrame(df_norm)
     #assemble vector
     vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
     spark_df_clustering = vecAssembler.transform(spark_df).select('features')
     bkm = BisectingKMeans().setK(k_clusters).setSeed(1).setFeaturesCol("features")
     model = bkm.fit(spark_df_clustering)
     prediction = model.transform(spark_df_clustering).select('prediction').collect()
     labels = [p.prediction for p in prediction]
     return labels
Example 12
def search_opt_k(df_kmeans):
    # Trains a k-means model.
    df_kmeans.show()
    # find the optimal k using the elbow method
    cost = np.zeros(20)
    for k in range(2, 20):
        kmeans = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
        print('kmeans ', kmeans)
        model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
        cost[k] = model.computeCost(df_kmeans)  # requires Spark 2.0 or later
    # print(cost)
    # visualize the elbow
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.show()
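model.computeCost is deprecated in recent Spark releases; ClusteringEvaluator (silhouette score) is the supported replacement. A sketch of the same sweep with it, reusing df_kmeans:

from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator()  # silhouette with squared Euclidean distance by default
silhouette = {}
for k in range(2, 20):
    model = BisectingKMeans(k=k, seed=1).fit(df_kmeans)
    silhouette[k] = evaluator.evaluate(model.transform(df_kmeans))
# Note: higher silhouette is better, unlike WSSSE where lower is better.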
Example 13
def train(df, hiperparameter):
    '''
    KMeans training, returning a KMeans model.
    input: - DataFrame
           - config (hyperparameter configuration)

    return: kmeans model
    '''
    bs_kmeans = BisectingKMeans(
        featuresCol=hiperparameter['featuresCol'],
        predictionCol=hiperparameter['predictionCol'],
        maxIter=hiperparameter['maxIter'],
        seed=hiperparameter['seed'],
        k=hiperparameter['k'],
        minDivisibleClusterSize=hiperparameter['minDivisibleClusterSize'])
    model = bs_kmeans.fit(df)
    return model
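A usage sketch; the dict keys mirror exactly what train() reads, and the values here are illustrative defaults:

hiperparameter = {
    'featuresCol': 'features',
    'predictionCol': 'prediction',
    'maxIter': 20,
    'seed': 1,
    'k': 4,
    'minDivisibleClusterSize': 1.0,
}
model = train(df, hiperparameter)  # df must already contain the features column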
Example 14
def main(argv):

    spark = SparkSession.builder \
        .appName('VIDEO_CLUSTERING') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    # Note: driver/executor resource settings generally take effect only if set
    # before the session is launched; setting them here is likely a no-op.
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')
    spark.conf.set('spark.executor.cores', '4')

    video_type_code = handle_params(argv)

    video_df = spark.read.format('jdbc')\
        .option('url', 'jdbc:mysql://192.168.174.133:3306/big_data')\
        .option('driver', 'com.mysql.cj.jdbc.Driver')\
        .option('dbtable', 'VIDEO_STATISTIC')\
        .option('user', 'root').option('password', 'root').load()

    assembler = VectorAssembler()\
        .setInputCols(['play_count',
                        'favorite_count',
                        'comment_count',
                        'barrage_count'])\
        .setOutputCol('features')

    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ).limit(1000))

    bkm = BisectingKMeans(k=8, minDivisibleClusterSize=1.0)
    model = bkm.fit(video_vector)
    centers = model.clusterCenters()

    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ))

    transformed = model.transform(video_vector).select('features', 'prediction')


    transformed.show()
Example 15
def bisecting_kmeans(features, num_clusters):
    """Does clustering on the features dataset using Bisecting KMeans clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted clusters in a 'cluster' column
    """
    kmeans = BisectingKMeans(k=num_clusters,
                             featuresCol='features',
                             predictionCol='cluster')
    kmeans_model = kmeans.fit(features)
    clustered = kmeans_model.transform(features)
    clustered.show()
    print("=====Clustering Results=====")
    print("Clustering cost = ", kmeans_model.computeCost(features))
    print("Cluster sizes = ", kmeans_model.summary.clusterSizes)
    return clustered
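A usage sketch, assuming features_df is a DataFrame with an assembled vector column named 'features' (hypothetical name):

clustered = bisecting_kmeans(features_df, num_clusters=5)
clustered.select('features', 'cluster').show(5)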
Example 16
def main(args):
    spark=SparkSession\
            .builder\
            .master(args[2])\
            .appName(args[1])\
            .getOrCreate()

    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])

    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    appendTime(sys.argv, start_computing_time)

    spark.stop()
Example 17
def bisecting_k_means():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
            (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
    df = spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
    model = bkm.fit(df)
    centers = model.clusterCenters()
    len(centers)
    model.computeCost(df)
    model.hasSummary
    summary = model.summary
    summary.k
    summary.clusterSizes
    # Predict
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction
    rows[2].prediction == rows[3].prediction
Example 18
"""
A simple example demonstrating a bisecting k-means clustering.
"""

if __name__ == "__main__":

    sc = SparkContext(appName="PythonBisectingKMeansExample")
    sqlContext = SQLContext(sc)

    # $example on$
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsed = data.map(lambda l: Row(features=Vectors.dense([float(x) for x in l.split(' ')])))
    training = sqlContext.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")

    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    sc.stop()
Example 19
from pyspark.ml.clustering import BisectingKMeans
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
Example 20
    # Cast every column to double, then assemble them into one vector column.
    for i in col_name:
        data = data.withColumn(i, data[i].cast(DoubleType()))
    dataWithFeatures = VectorAssembler(inputCols=col_name, outputCol="features")

    data = dataWithFeatures.transform(data)

    trainingData = data
    testData = data
    '''RFC = RandomForestClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction",
                                maxDepth=14, maxBins=32, minInstancesPerNode=2, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=True, checkpointInterval=10, 
                                impurity="gini", numTrees=2000, featureSubsetStrategy="auto", seed=None, subsamplingRate=0.8)'''
    BKM = BisectingKMeans(featuresCol="features", predictionCol="prediction", maxIter=20, seed=1, k=4,
                          minDivisibleClusterSize=1.0)
    # Train model.
    # paramMap1 = {gbt.stepSize: 0.05,gbt.minInstancesPerNode:2,gbt.maxDepth:6,gbt.cacheNodeIds:True,gbt.subsamplingRate:1,gbt.maxIter:200}
    BKM_model = BKM.fit(trainingData)
    cost = BKM_model.computeCost(trainingData)
    print("Within Set Sum of Squared Errors = " + str(cost))
    centers = BKM_model.clusterCenters()
    summary = BKM_model.summary
    print(summary.k)
    print(summary.clusterSizes)

    # Make predictions. #"features", "label","mobile","prediction"
    prediction_result = BKM_model.transform(testData)
    # prediction_result.take(5)
    prediction_result.select('mobile', 'label', 'prediction').repartition(1).write.csv(
        '/user/wangkang/qiche/word2vec/Kmeans', mode='overwrite')
'''conf = SparkConf()
conf.setAppName('a')
sc = SparkContext(conf=conf)
Example 21
          ('HYP_TENS_GEST', typ.IntegerType()),
          ('PREV_BIRTH_PRETERM', typ.IntegerType())]

births_transformed = "file:///home/yuty/yangzz/births_transformed.csv"
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
births = spark.read.csv(births_transformed, header=True, schema=schema)
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[1:]],
    outputCol='features').transform(births).select('features').collect()

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import BisectingKMeans

data = [(Vectors.dense([10, 10]), ), (Vectors.dense([3.0, 5.0]), ),
        (Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
        (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
df = spark.createDataFrame(data, ["features"])
bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
model = bkm.fit(df)
centers = model.clusterCenters()
len(centers)
model.computeCost(df)
model.hasSummary
summary = model.summary
summary.k
summary.clusterSizes

transformed = model.transform(df).select("features", "prediction")
rows = transformed.collect()
rows[0].prediction
Example 22
"""
A simple example demonstrating a bisecting k-means clustering.
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
    parsed = data\
        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
    training = spark.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")

    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
Example 23
# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------

from pyspark.ml.clustering import BisectingKMeans
bkm = BisectingKMeans().setK(5).setMaxIter(5)
bkmModel = bkm.fit(sales)


# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points
bkmModel.computeCost(sales)
centers = bkmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


# COMMAND ----------
Example 24
#Apply the above expression
df_vector = df_limit.select(*expr)

#Transform the dataFrame based on the vector assembler
df_trans = vectorAssembler.transform(df_vector)

#Create an id that can be used to correlate each observation with its feature vector
df_trans = df_trans.withColumn("id", monotonically_increasing_id())
df_limit = df_limit.withColumn("id", monotonically_increasing_id()).drop("Latitude").drop("Longitude")

#Drop one of the id columns after joining
df_joined = df_limit.join(df_trans, "id", "inner").drop("id")
df_joined.cache()

bkm = BisectingKMeans(k=num_of_Clusters, minDivisibleClusterSize=minDivisSize, featuresCol="Features")
model = bkm.fit(df_joined)

log("Model was trained using following parameters:")
log("")
log(model.extractParamMap())
log("")

centers = model.clusterCenters()
log("The coordinates to each cluster center:")
log(centers)

summary = model.summary
log("Size of each identified cluster:")
log(summary.clusterSizes)

Example 25
from pyspark.ml.clustering import KMeans
times = []
for i in range(1, 5):
    start = time.time()
    kmeans = KMeans(k=8,
                    seed=int(np.random.randint(100, size=1)),
                    initMode="k-means||")
    modelKmeans = kmeans.fit(tsneDataFrame.select("features"))
    # DataFrame of predicted cluster centers for each training data point
    predictions = modelKmeans.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
kmeansTime = average(times)

###########      BISECTING K-MEANS       ################
from pyspark.ml.clustering import BisectingKMeans
times = []
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)

##############       GMM      #################
from pyspark.ml.clustering import GaussianMixture
times = []
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
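The average() helper used for the timings is not shown in the snippet; a minimal stand-in:

def average(xs):
    # Arithmetic mean of the collected wall-clock timings.
    return sum(xs) / len(xs)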
Example 26
kmeansScores = []
for k in range(4, 8):
    kmeans = KMeans().setK(k).setSeed(216)
    model = kmeans.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    kmeansScores.append(score)

plt.plot(range(4, 8), kmeansScores, 'ro')
plt.savefig('kmeansScores.pdf')

bisectScores = []
for k in range(4, 8):
    bisection = BisectingKMeans().setK(k).setSeed(216)
    model = bisection.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    bisectScores.append(score)

plt.plot(range(4, 8), bisectScores, 'g^')
plt.savefig('bisectScores.pdf')
plt.clf()

kmeansK = np.argmax(kmeansScores) + 4
bisectK = np.argmax(bisectScores) + 4

evaluator = ClusteringEvaluator()
kmeans = KMeans().setK(kmeansK).setSeed(216)
kmModel = kmeans.fit(trainingData)
Example 27
dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
wssse = []
for k in kValues:
    kmeans = KMeans().setK(k).setSeed(122)
    model = kmeans.fit(dataset)
    wssse.append(model.computeCost(dataset))
for i in wssse:
    print(i)

# In[29]:

from pyspark.ml.clustering import BisectingKMeans
# Trains a bisecting k-means model.
bkm = BisectingKMeans().setK(2).setSeed(1222)
model = bkm.fit(outputFeatureDf)
# Evaluate clustering.
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))
# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

# In[30]:

from sklearn.metrics.cluster import completeness_score
transformed = model.transform(dataset)
labels = labeldf.collect()
label_array = [int(i[0]) for i in labels]
Example 28
data = sc.textFile("practice5.data")
data_label = data.map(parsePoint)
data_label = np.array(data_label.collect())

trainData, testData = sort_by_target(data_label)
trainData = [(int(x[-1]), Vectors.dense(x[:-1])) for x in trainData]
testData = [(int(x[-1]), Vectors.dense(x[:-1])) for x in testData]

trainData = sqlContext.createDataFrame(trainData, schema=["label", "features"])
trFeat = trainData.select([c for c in trainData.columns if c in ["features"]])
trLab = trainData.select([c for c in trainData.columns if c in ["label"]])

testData = sqlContext.createDataFrame(testData, schema=["label", "features"])
tsFeat = testData.select([c for c in testData.columns if c in ["features"]])
tsLab = testData.select([c for c in testData.columns if c in ["label"]])

# BSK: BisectingKMeans imported under an alias (import not shown)
bkm = BSK(k=10, minDivisibleClusterSize=1.0)
model = bkm.fit(trFeat)

predict = model.transform(tsFeat).select("prediction")
predict = predict.rdd.flatMap(lambda x: x).collect()

Label = [int(row['label']) for row in tsLab.collect()]

# NMI: presumably sklearn's normalized_mutual_info_score under an alias (import not shown)
with open('result.txt', 'w') as f:
    f.write('NMI of hierarchical clustering\n')
    f.write('{:.4f}'.format(NMI(Label, predict)))

sc.stop()
Example 29
# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
Example 30
stand_scaled = StandardScaler(inputCol="DenseVector",
                              outputCol="features",
                              withStd=True,
                              withMean=True)
'''
outputCol is named "features" because that is the default featuresCol that
Spark's KMeans/BisectingKMeans read; another name would require setFeaturesCol.
'''

scaled_model = stand_scaled.fit(train_df)

train_df = scaled_model.transform(train_df)

bkmeans = BisectingKMeans().setK(2)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(train_df)
bkcenters = bkmodel.clusterCenters()

if bkmodel.hasSummary:
    print(bkmodel.summary.clusterSizes)
    print(bkmodel.clusterCenters())

predict_df = bkmodel.transform(train_df)

predict_df = predict_df.select("avgMeasuredTime", "avgSpeed", "vehicleCount",
                               "prediction")

predict_df.show(2)

c1 = mpatches.Patch(color="green", label="No Traffic")
Example 31
from pyspark.ml.evaluation import ClusteringEvaluator

# SETTING UP SPARK CONTEXT AND SESSION
conf = SparkConf()
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)

# READING THE GIVEN DATA FILE FROM LIBSVM FORMAT INTO A DATAFRAME
dataset = spark.read.format("libsvm").load(
    "/home/vmalapati1/data/kmeans_input.txt")

# INITIALIZING THE BISECTING K-MEANS ALGORITHM
kmeans = BisectingKMeans(k=2, seed=1)  # 2 clusters here
# TRAINING THE MODEL ON THE ABOVE DATA FRAME
model = kmeans.fit(dataset)
# PREDICTING CLUSTERS FOR THE INPUT DATA
transformed = model.transform(dataset)

transformed.show(200)

#dataset.show()
#COMPUTING THE COST OF THE KMEANS MODEL
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))
#FINDING THE CENTERS OF THE CLUSTERS
centers = model.clusterCenters()
# PRINTING THE CENTERS OF THE CLUSTERS
for center in centers:

    print(center)
Example 32
"""
Run with:
  bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load(
        "data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
Example 33
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

cluster_df = spark.read.csv('clustering_dataset.csv',
                            header=True,
                            inferSchema=True)
cluster_df.show()

vectorAssembler = VectorAssembler(inputCols=['col1', 'col2', 'col3'],
                                  outputCol='features')
vcluster_df = vectorAssembler.transform(cluster_df)

vcluster_df.show()

kmeans = KMeans().setK(3)
kmeans = kmeans.setSeed(1)
kmodel = kmeans.fit(vcluster_df)

centers = kmodel.clusterCenters()

# hierarchical clustering
vcluster_df.show()

from pyspark.ml.clustering import BisectingKMeans
bkmeans = BisectingKMeans().setK(3)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(vcluster_df)

bkcenters = bkmodel.clusterCenters()
Example 34
def train_model(dataset):
    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)
    return model
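A usage sketch; train_model accepts any DataFrame with a vector 'features' column, such as the vcluster_df assembled in the previous example:

bk_model = train_model(vcluster_df)
print(bk_model.clusterCenters())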