Example #1
0
# Column roles for the housing DataFrame.
colLabel = "label"
colCat = "ocean_proximity"
colNum = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
          'households', 'median_income', 'median_house_value', 'bedrooms_per_room', 'population_per_household']
# NOTE: colNum could be derived from the DataFrame instead of being typed out:
#   colNum = [c for c in renamedHousing.columns if c not in (colLabel, colCat)]
# An earlier attempt tested `col != "label" or "ocean_proximity"`, which is
# always true because a non-empty string is truthy, so it filtered nothing.


def _print_null_counts(df):
    """Print, for each column of *df*, the number of rows that are null in it."""
    for c in df.columns:
        print(c, " has null values : ", df.filter(df[c].isNull()).count())


# Null counts before imputation.
_print_null_counts(renamedHousing)

# Replace missing entries of the two bedroom columns with the column median.
# missingValue is left at its default (NaN): it is a *placeholder to be
# replaced*, not a fill value or a count, and nulls are imputed regardless.
# Setting it to 414 would overwrite every genuine 414 with the median.
imputer = Imputer()
imputer.setStrategy('median')
imputer.setInputCols(["total_bedrooms", "bedrooms_per_room"])
imputer.setOutputCols(["out_total_bedrooms", "out_bedrooms_per_room"])
imputedHousing = imputer.fit(renamedHousing).transform(renamedHousing)
# Keep only the imputed versions of the two columns.
imputedHousing = imputedHousing.drop('total_bedrooms', 'bedrooms_per_room')

# Null counts after imputation.
_print_null_counts(imputedHousing)

# Columns assembled into the single 'features' vector.
# NOTE(review): 'rooms_per_household' is listed here but not in colNum above;
# confirm the column exists in renamedHousing.
colNum_to_scale = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population',
                   'households', 'median_income', 'rooms_per_household', 'population_per_household',
                   'out_total_bedrooms', 'out_bedrooms_per_room']
va = VectorAssembler(inputCols=colNum_to_scale, outputCol='features')
featuredHousing = va.transform(imputedHousing)
featuredHousing.show()

# Scaler configured to centre and scale 'features' into 'scaled_features'.
scaler = StandardScaler(withMean=True, withStd=True)
scaler.setInputCol("features").setOutputCol("scaled_features")
Example #2
0
# Value that marks "no reading" in each raw measurement column.
# Insertion order defines the order of the derived columns below.
missing_sentinels = {
    'TEMP': 9999.9,
    'DEWP': 9999.9,
    'SLP': 9999.9,
    'STP': 9999.9,
    'VISIB': 999.9,
    'WDSP': 999.9,
    'MXSPD': 999.9,
    'GUST': 999.9,
    'MAX': 9999.9,
    'MIN': 9999.9,
    'PRCP': 99.9,
    'SNDP': 999.9,
}
new_cols = [name + '_new' for name in missing_sentinels]
imputed_cols = [name + '_impu' for name in missing_sentinels]

# Add a <col>_new copy of every measurement with its sentinel turned into NaN.
# The .otherwise() branch is essential: without it, when() yields NULL for
# every row that is NOT a sentinel, wiping out all real readings.
# The cast keeps the result numeric, which the Imputer requires.
dataset_new = dataset
for name, sentinel in missing_sentinels.items():
    raw = col(name).cast('double')
    dataset_new = dataset_new.withColumn(
        name + '_new',
        when(raw == sentinel, float("nan")).otherwise(raw),
    )

# Calling the imputer function: NaN (and null) entries become the column mean.
imputer = Imputer(strategy='mean', missingValue=float("nan"))
imputer.setInputCols(new_cols)
imputer.setOutputCols(imputed_cols)

# fitting and transforming model
model = imputer.fit(dataset_new)
clean_dataset = model.transform(dataset_new)

# Keep the station/date identifiers, the imputed measurements and FRSHTT.
kept_cols = ['STN---', 'WBAN', 'YEARMODA'] + imputed_cols + ['FRSHTT']
clean_dataset = clean_dataset.select(kept_cols)

# Final Join with Country
final_dataset = clean_dataset.join(
    df_stationlist_countrylist,
    clean_dataset['STN---'] == df_stationlist_countrylist['STN_NO'],
).select(kept_cols + ['COUNTRY_FULL'])
					   

#