def kmeans(features, num_clusters):
    """Cluster the features DataFrame with KMeans and annotate each row.

    Params:
    - features (pyspark.sql.DataFrame): data frame containing a 'features'
      vector column used for clustering
    - num_clusters (int): the number of clusters (k)

    Returns:
    - clustered (pyspark.sql.DataFrame): the input rows plus a 'cluster'
      column (predicted cluster index) and a 'distance' column (squared
      distance from the row's feature vector to its assigned centroid)
    """
    # NOTE(review): the local was previously named 'kmeans', shadowing this
    # function itself — renamed to avoid confusion.
    estimator = KMeans(k=num_clusters, featuresCol='features',
                       predictionCol='cluster')
    kmeans_model = estimator.fit(features)
    clustered = kmeans_model.transform(features)
    clustered.show()

    # Attach the squared distance of each point to its assigned centroid.
    cluster_centers = kmeans_model.clusterCenters()
    clustered = clustered.rdd.map(
        lambda row: Row(distance=Vectors.squared_distance(
            cluster_centers[row['cluster']], row['features']),
            **row.asDict())).toDF()
    clustered.show()

    print("=====Clustering Results=====")
    # NOTE(review): KMeansModel.computeCost is deprecated since Spark 3.0
    # (use ClusteringEvaluator or summary.trainingCost instead); kept here
    # for behavioral compatibility with existing output.
    print("Clustering cost = ", kmeans_model.computeCost(features))
    print("Cluster sizes = ", kmeans_model.summary.clusterSizes)
    return clustered
def calculate_distance(self, input_df):
    """Pair every sentence with every other and compute their distance.

    Self-joins input_df on sentence_id inequality, so each ordered pair of
    distinct sentences appears once, then adds a 'distance' column holding
    the squared distance between the two sentence vectors.
    """
    # UDF wrapping pyspark's squared_distance; result coerced to float.
    sq_dist = udf(
        lambda vect1, vect2: float(Vectors.squared_distance(vect1, vect2)),
        FloatType())

    left = input_df.alias('df1')
    right = input_df.alias('df2')
    # Inner join on id inequality keeps both (a, b) and (b, a) orderings.
    pairs = left.join(
        right,
        col('df1.sentence_id') != col('df2.sentence_id'),
        'inner')

    selected = pairs.select(
        col('df1.sentence_id').alias('sentence_id'),
        col('df2.sentence_id').alias('sentence_id_match'),
        col('df1.sentence_vector').alias('sentence_vector'),
        col('df2.sentence_vector').alias('sentence_vector_match'))

    return selected.withColumn(
        'distance',
        sq_dist(col('sentence_vector'), col('sentence_vector_match')))
def distToCentroid(datapt, centroid):
    """Return the Euclidean distance from datapt to centroid."""
    squared = Vectors.squared_distance(datapt, centroid)
    return math.sqrt(squared)