# Column roles used by the housing preprocessing steps below.
colLabel = "label"
colCat = "ocean_proximity"
# The numeric columns are listed by hand; the original author noted they were
# unsure how to derive this list from the DataFrame programmatically.
# NOTE(review): this list still names 'median_house_value' although colLabel
# is "label" -- confirm which name renamedHousing actually carries.
colNum = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'median_house_value', 'bedrooms_per_room', 'population_per_household',
]

# Report the number of null entries per column before imputation.
for c in renamedHousing.columns:
    print(c, " has null values : ",
          renamedHousing.filter(renamedHousing[c].isNull()).count())

# Replace missing entries of the two bedroom-related columns with the column
# median. missingValue is deliberately left at its default (NaN): it is a
# placeholder *value*, and nulls are imputed regardless of it. The earlier
# setMissingValue(414) made every genuine reading equal to 414 count as
# missing, so those rows were overwritten with the median.
imputer = Imputer(strategy='median')
imputer.setInputCols(["total_bedrooms", "bedrooms_per_room"])
imputer.setOutputCols(["out_total_bedrooms", "out_bedrooms_per_room"])
imputedHousing = imputer.fit(renamedHousing).transform(renamedHousing)

# The imputed copies (out_*) supersede the raw columns.
imputedHousing = imputedHousing.drop('total_bedrooms', 'bedrooms_per_room')

# Report the null counts again, now on the imputed DataFrame.
for c in imputedHousing.columns:
    print(c, " has null values : ",
          imputedHousing.filter(imputedHousing[c].isNull()).count())

# Numeric inputs to the feature vector: the imputed out_* columns stand in for
# the dropped raw ones.
# NOTE(review): 'rooms_per_household' appears here but not in colNum --
# confirm the column exists in renamedHousing.
colNum_to_scale = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'population', 'households', 'median_income', 'rooms_per_household',
    'population_per_household', 'out_total_bedrooms', 'out_bedrooms_per_room',
]

# Pack the numeric columns into a single vector column named 'features'.
va = VectorAssembler(inputCols=colNum_to_scale, outputCol='features')
featuredHousing = va.transform(imputedHousing)
featuredHousing.show()

# Standardising scaler (zero mean, unit std) reading 'features' and writing
# 'scaled_features'. It is only configured here; no fit/transform is shown in
# this section.
scaler = StandardScaler(withMean=True, withStd=True)
scaler.setInputCol("features").setOutputCol("scaled_features")
# (column, sentinel) pairs: the sentinel is the value this script treats as
# "reading missing" for that column.
# NOTE(review): NOAA GSOD format notes list 99.99 as the PRCP missing marker --
# confirm that 99.9 matches this dataset.
_missing_markers = (
    ('TEMP', 9999.9),
    ('DEWP', 9999.9),
    ('SLP', 9999.9),
    ('STP', 9999.9),
    ('VISIB', 999.9),
    ('WDSP', 999.9),
    ('MXSPD', 999.9),
    ('GUST', 999.9),
    ('MAX', 9999.9),
    ('MIN', 9999.9),
    ('PRCP', 99.9),
    ('SNDP', 999.9),
)

# Add a <COL>_new column per measurement: NaN where the sentinel is found, the
# original reading everywhere else. The .otherwise(...) branch is essential --
# a bare when() yields NULL for every non-matching row, which would throw away
# all real readings and leave the imputer nothing to average. The cast keeps
# the result a double column whatever the source type is.
dataset_new = dataset
for name, marker in _missing_markers:
    dataset_new = dataset_new.withColumn(
        name + '_new',
        when(col(name) == marker, float("nan"))
        .otherwise(col(name).cast('double')),
    )

_new_cols = [name + '_new' for name, _ in _missing_markers]
_impu_cols = [name + '_impu' for name, _ in _missing_markers]

# Mean-impute the NaN placeholders; results go to the <COL>_impu columns.
# float("nan") is spelled out so no bare `nan` name has to be in scope.
imputer = Imputer(strategy='mean', missingValue=float("nan"))
imputer.setInputCols(_new_cols)
imputer.setOutputCols(_impu_cols)

# Fit the imputer on the masked data and apply it.
model = imputer.fit(dataset_new)
clean_dataset = model.transform(dataset_new)

# Keep the identifying columns, the imputed measurements and FRSHTT.
_kept_cols = ['STN---', 'WBAN', 'YEARMODA'] + _impu_cols + ['FRSHTT']
clean_dataset = clean_dataset.select(*_kept_cols)

# Final join with the station/country list (station number match) to attach
# COUNTRY_FULL to every row.
final_dataset = clean_dataset.join(
    df_stationlist_countrylist,
    clean_dataset['STN---'] == df_stationlist_countrylist['STN_NO'],
).select(*_kept_cols, 'COUNTRY_FULL')