Ejemplo n.º 1
0
def kmeans(features, num_clusters):
    """Does clustering on the features dataset using KMeans clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features to be used for clustering; must have a vector 'features' column
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted clusters
      in a 'cluster' column and the squared distance of each row to its assigned
      cluster center in a 'distance' column
    """
    kmeans = KMeans(k=num_clusters,
                    featuresCol='features',
                    predictionCol='cluster')
    kmeans_model = kmeans.fit(features)
    clustered = kmeans_model.transform(features)
    clustered.show()
    cluster_centers = kmeans_model.clusterCenters()
    # Attach each row's squared distance to the center of its assigned cluster.
    clustered = clustered.rdd.map(
        lambda row: Row(distance=Vectors.squared_distance(
            cluster_centers[row['cluster']], row['features']),
                        **row.asDict())).toDF()
    clustered.show()
    print("=====Clustering Results=====")
    # KMeansModel.computeCost was deprecated in Spark 3.0 and removed in 3.4;
    # summary.trainingCost reports the same training cost (within-set sum of
    # squared distances) without re-scanning the data.
    print("Clustering cost = ", kmeans_model.summary.trainingCost)
    print("Cluster sizes = ", kmeans_model.summary.clusterSizes)
    return clustered
Ejemplo n.º 2
0
 def calculate_distance(self, input_df):
     """Return all cross pairs of distinct sentences with the squared distance
     between their sentence vectors in a 'distance' column."""
     sq_dist = udf(
         lambda a, b: float(Vectors.squared_distance(a, b)), FloatType())
     left = input_df.alias('df1')
     right = input_df.alias('df2')
     # Self-join on differing sentence ids: every ordered pair of distinct rows.
     pairs = left.join(
         right, col('df1.sentence_id') != col('df2.sentence_id'), 'inner')
     selected = pairs.select(
         col('df1.sentence_id').alias('sentence_id'),
         col('df2.sentence_id').alias('sentence_id_match'),
         col('df1.sentence_vector').alias('sentence_vector'),
         col('df2.sentence_vector').alias('sentence_vector_match'))
     return selected.withColumn(
         'distance',
         sq_dist(col('sentence_vector'), col('sentence_vector_match')))
Ejemplo n.º 3
0
def distToCentroid(datapt, centroid):
    """Return the Euclidean distance from a data point to a centroid."""
    squared = Vectors.squared_distance(datapt, centroid)
    return math.sqrt(squared)