Example 1
    # Assumes module-level imports:
    #   import pandas as pd
    #   from pyspark.sql import SparkSession
    #   from pyspark.sql import functions as func
    #   from pyspark.ml.feature import VectorAssembler
    #   from pyspark.ml.clustering import KMeans
    #   from pyspark.ml.evaluation import ClusteringEvaluator
    def clustering_tuning(self):
        # Load the preprocessed data with pandas, then hand it to Spark below.
        df_raw = pd.read_csv(f"{self.DEFAULT_PREPROCESSING_OUTPUT}",
                             header=None)

        spark = SparkSession \
            .builder \
            .appName("PySparkKMeans") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()

        df = spark.createDataFrame(df_raw)
        assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
        # df_sample = df.sample(withReplacement=False, fraction=0.1)
        df_vec = assembler.transform(df).select("features")

        K_lst = list(range(100, 10001, 50))

        for k in K_lst:

            kmeans = KMeans(k=k)
            kmeans.setSeed(1)
            kmeans.setMaxIter(5000)

            model = kmeans.fit(df_vec)
            model.setPredictionCol("newPrediction")
            # Sanity check: predict the cluster of a single row.
            model.predict(df_vec.head().features)

            centers = model.clusterCenters()

            # Attach cluster assignments and pull them onto the driver.
            transformed = model.transform(df_vec).select(
                "features", "newPrediction")
            rows = transformed.collect()

            # Evaluate the clustering by computing the Silhouette score;
            # ClusteringEvaluator reads the "features" and "prediction"
            # columns by default, so expose the assignments under that name.
            evaluator = ClusteringEvaluator()
            transformed = transformed.withColumn("prediction",
                                                 func.col("newPrediction"))

            silhouette = evaluator.evaluate(transformed)

        return transformed
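
The loop above recomputes the Silhouette score on every iteration but never keeps it, so there is nothing left to compare once it finishes. A minimal sketch of collecting one score per k, assuming the same df_vec and the imports listed in the example:

# Sketch only: keep (k, silhouette) pairs so the best k can be chosen after
# the loop. Assumes df_vec and the imports from the example above.
scores = []
for k in range(100, 10001, 50):
    model = KMeans(k=k, seed=1, maxIter=5000).fit(df_vec)
    predictions = model.transform(df_vec)   # adds a "prediction" column
    scores.append((k, ClusteringEvaluator().evaluate(predictions)))

best_k, best_silhouette = max(scores, key=lambda pair: pair[1])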
Example 2
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Assumes an active SparkSession bound to the name `spark`
# (e.g. created via SparkSession.builder.getOrCreate()).
cluster_df = spark.read.csv('clustering_dataset.csv',
                            header=True,
                            inferSchema=True)
cluster_df.show()

# Combine the three numeric columns into a single 'features' vector column.
vectorAssembler = VectorAssembler(inputCols=['col1', 'col2', 'col3'],
                                  outputCol='features')
vcluster_df = vectorAssembler.transform(cluster_df)

vcluster_df.show()

kmeans = KMeans().setK(3)         # number of clusters
kmeans = kmeans.setSeed(1)        # fixed seed for reproducibility
kmodel = kmeans.fit(vcluster_df)  # uses the default 'features' column

centers = kmodel.clusterCenters()

# Hierarchical (divisive / bisecting) clustering
vcluster_df.show()

from pyspark.ml.clustering import BisectingKMeans
bkmeans = BisectingKMeans().setK(3)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(vcluster_df)

bkcenters = bkmodel.clusterCenters()
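
Neither model above is applied back to the data, so the per-row cluster assignments are never materialised. A minimal sketch of inspecting them, assuming vcluster_df, kmodel, bkmodel, centers and bkcenters from the example:

# Sketch: attach cluster assignments and compare the two models.
kmeans_assigned = kmodel.transform(vcluster_df)     # adds a 'prediction' column
bkmeans_assigned = bkmodel.transform(vcluster_df)

kmeans_assigned.groupBy('prediction').count().show()     # cluster sizes
bkmeans_assigned.groupBy('prediction').count().show()

print(centers)
print(bkcenters)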
Example 3
# Assumes: from pyspark.ml.feature import StandardScaler
#          from pyspark.ml.clustering import KMeans
# Hypothetical reconstruction of the scaler setup; the input column name is
# an assumption. The scaled vectors go to 'features', which the KMeans below
# reads by default.
stand_scaled = StandardScaler(inputCol="raw_features",
                              outputCol="features",
                              withStd=True,
                              withMean=True)

scaled_model = stand_scaled.fit(train_df)

train_df = scaled_model.transform(train_df)

# Note: each test split is scaled with its own fitted scaler here,
# rather than with the scaler fitted on the training data.
scaled_model = stand_scaled.fit(test1_df)

test1_df = scaled_model.transform(test1_df)

scaled_model = stand_scaled.fit(test2_df)

test2_df = scaled_model.transform(test2_df)

kmeans = KMeans().setK(2)   # set number of clusters
kmeans = kmeans.setSeed(1)  # set the random seed
kmodel = kmeans.fit(train_df)
centers = kmodel.clusterCenters()

print(centers)

test1_df = kmodel.transform(test1_df)
test1_df.select("features", "Occupancy", "prediction").show(5)

test2_df = kmodel.transform(test2_df)
test2_df.select("features", "Occupancy", "prediction").show(5)

count1 = test1_df.filter("prediction != Occupancy").count()
total1 = test1_df.count()

count2 = test2_df.filter("prediction != Occupancy").count()
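
The snippet stops after counting mismatches; a minimal sketch of turning those counts into per-split mismatch rates, assuming count1, total1, count2 and test2_df from above:

# Sketch: mismatch rate per test split. K-means cluster ids are arbitrary,
# so a rate near 1.0 just means the ids are flipped relative to Occupancy.
total2 = test2_df.count()

print("test1 mismatch rate:", count1 / total1)
print("test2 mismatch rate:", count2 / total2)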
Example 4
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.clustering import KMeans

score_table = 'lookalike_application_score_vector_08192021_1m'
num_clusters = 100

spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# num_features = 10
# num_rows = 10000
# data = np.random.rand(num_rows, num_features)
# spark.createDataFrame(data)

did_bucket = 0

command = "SELECT did, score_vector, did_bucket FROM {} WHERE did_bucket = {}".format(
    score_table, did_bucket)
df = spark.sql(command)

# Convert the score_vector array column into an ML DenseVector column.
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
df = df.withColumn('score_vec', list_to_vector_udf(df.score_vector))

first_time = True
if first_time:
    # Fit k-means on the dense score vectors; predictions go to 'cluster_id'.
    kmeans = KMeans(k=num_clusters, featuresCol='score_vec')
    kmeans.setSeed(1)
    kmeans.setPredictionCol('cluster_id')
    model = kmeans.fit(df)
    first_time = False

df2 = model.transform(df)
df2.show()
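
The example only scores did_bucket 0; a minimal sketch of reusing the fitted model on another bucket and checking cluster sizes, assuming score_table, list_to_vector_udf, spark and model from above (the bucket id below is hypothetical):

# Sketch: score another did_bucket with the already-fitted model and count
# how many ids land in each cluster.
next_bucket = 1  # hypothetical bucket id
command = ("SELECT did, score_vector, did_bucket FROM {} "
           "WHERE did_bucket = {}").format(score_table, next_bucket)

df_next = spark.sql(command)
df_next = df_next.withColumn('score_vec', list_to_vector_udf(df_next.score_vector))

scored = model.transform(df_next)   # adds the 'cluster_id' column
scored.groupBy('cluster_id').count().orderBy('count', ascending=False).show()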