-
Notifications
You must be signed in to change notification settings - Fork 0
/
k-means.py
57 lines (44 loc) · 2.7 KB
/
k-means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from notebooks import utils
%matplotlib inline
# Prepare the minute-weather data for k-means clustering and draw an elbow
# plot to help choose the number of clusters.
#
# `sc` is not defined in this script; it is expected to be the SparkContext
# supplied by the PySpark shell/notebook this code runs in.
sqlContext = SQLContext(sc)

# Load the CSV, taking column names from the header row and letting Spark
# infer the column types.
df = sqlContext.read.load(
    'file:///home/cloudera/Downloads/download/big-data-4/minute_weather.csv',
    format='com.databricks.spark.csv',
    inferSchema='true',
    header='true',
)

# Subsetting rows uses filter(); removing columns uses drop().
# Keep only the rows whose rowID is a multiple of 10.
filterDF = df.filter((df.rowID % 10) == 0)

# Optional exploration (left disabled):
# filterDF.describe().toPandas().transpose()
# filterDF.filter(filterDF.rain_accumulation == 0.0).count()

# Drop rain_accumulation and rain_duration because they contain so many
# zeros, and drop hpwren_timestamp because it is not used.
workingDF = (
    filterDF
    .drop('rain_accumulation')
    .drop('rain_duration')
    .drop('hpwren_timestamp')
)

# Drop rows with missing values and work out how many rows were removed.
# The cleaned frame must be assigned back to workingDF so that `after`
# reflects the cleaned data (the original assigned to a misspelled name).
before = workingDF.count()
workingDF = workingDF.na.drop()
after = workingDF.count()
before - after  # number of dropped rows; displayed when run as a notebook cell

# Scale the data: every feature contributes to the distance computation, so
# all of them should be on the same scale.
workingDF.columns  # displayed when run as a notebook cell

# rowID is not used as a feature (it is kept only to identify rows).
# NOTE(review): the original note said max_wind_speed is highly correlated
# with the other wind columns and should be left out, yet max_wind_* ARE
# included below. Presumably the excluded columns are the min_wind_* ones --
# confirm against the dataset schema.
featureColumns = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features_unscaled')
assembled = assembler.transform(workingDF)

# Standardize each column: (X - mean) / std, computed separately for every
# attribute/column. After this step the values of each column are centred
# around 0 with a standard deviation of 1.
scaler = StandardScaler(
    inputCol='features_unscaled',
    outputCol='features',
    withStd=True,
    withMean=True,
)
scaleModel = scaler.fit(assembled)
scaleData = scaleModel.transform(assembled)

# Create an elbow plot to choose the number of cluster centers.
# This method applies k-means with different values of k and calculates the
# within-cluster sum-of-squared error (WSSE) for each. Because k-means is run
# many times, this can be very compute-intensive, so only a subset of the
# data is used: every row whose rowID is a multiple of 3.
scaleData = scaleData.select('features', 'rowID')
elbowset = scaleData.filter((scaleData.rowID % 3 == 0)).select('features')

# persist() tells Spark to keep the data in memory (if possible), which
# speeds up the repeated k-means runs.
elbowset.persist()

clusters = range(2, 31)
wsseList = utils.elbow(elbowset, clusters)
utils.elbow_plot(wsseList, clusters)  # with the matplotlib inline magic, the plot renders automatically