def main_preprocessing_version_10():
    """Create version 10 of the zipcode dataset by rescaling the P_ columns.

    Reads ``Data/zipcodedata_version_9_nanincluded.csv``, multiplies every
    column listed in ``scaled_columns`` by 100 (presumably turning fractions
    into percentages -- TODO confirm against the version 9 file), writes the
    result to ``Data/zipcodedata_version_10_nanincluded.csv`` without the
    index, and prints the resulting frame.

    This step only needs the version 9 file; the 2017/2019 source tables are
    deliberately not loaded.

    Raises:
        FileNotFoundError: If the version 9 file is missing.
        KeyError: If any of the scaled columns is absent from the input.
    """
    version = 10  # version number of the file written by this step
    scaled_columns = [
        "P_MAN",
        "P_VROUW",
        "P_INW_014",
        "P_INW_1524",
        "P_INW_2544",
        "P_INW_4564",
        "P_INW_65PL",
        "P_UITKMINAOW",
    ]

    data = pd.read_csv("Data/zipcodedata_version_9_nanincluded.csv")

    # One vectorised multiplication; NaN cells stay NaN.
    data[scaled_columns] = data[scaled_columns] * 100

    data.to_csv(f"Data/zipcodedata_version_{version}_nanincluded.csv", index=False)
    print(data)
def main_preprocessing_version_8():
    """Create version 8 of the zipcode dataset by adding three AFS_ columns.

    Reads ``Data/zipcodedata_version_7_nanincluded.csv`` and inserts the
    ``AFS_OPRIT``, ``AFS_TRNOVS`` and ``AFS_TREINS`` columns of the 2017
    table, with the sentinel value -99997 replaced by NaN. The result is
    written to ``Data/zipcodedata_version_8_nanincluded.csv`` without the
    index and printed.

    Only the 2017 table is needed here, so the 2019 table is not loaded.

    Raises:
        FileNotFoundError: If the version 7 file is missing.
        KeyError: If the 2017 table lacks one of the AFS_ columns.
        ValueError: If the version 7 file already contains one of the
            inserted column names (``DataFrame.insert`` refuses duplicates).
    """
    version = 8  # version number of the file written by this step
    missing_code = -99997  # sentinel in the source table, treated as missing

    data_2017 = zipcode_data_2017()
    data = pd.read_csv("Data/zipcodedata_version_7_nanincluded.csv")

    # Every insert targets position 8 and shifts the earlier inserts to the
    # right, so the final column order is AFS_TREINS, AFS_TRNOVS, AFS_OPRIT.
    # pandas aligns each Series on the row index -- this assumes both tables
    # share the same row index (TODO confirm).
    for column in ("AFS_OPRIT", "AFS_TRNOVS", "AFS_TREINS"):
        data.insert(8, column, data_2017[column].replace(missing_code, np.nan))

    data.to_csv(f"Data/zipcodedata_version_{version}_nanincluded.csv", index=False)
    print(data)
def main_preprocessing_version_7():
    """Create version 7 of the zipcode dataset with a per-inhabitant ratio.

    Reads ``Data/zipcodedata_version_6_nanincluded.csv``, inserts
    ``P_UITKMINAOW`` (the 2019 ``UITKMINAOW`` column divided by the 2019
    ``INWONER`` column, both with the sentinel -99997 replaced by NaN) at
    column position 18, and drops ``UITKMINAOW_HH``. The result is written to
    ``Data/zipcodedata_version_7_nanincluded.csv`` without the index and
    printed.

    Only the 2019 table is needed here, so the 2017 table is not loaded.

    Raises:
        FileNotFoundError: If the version 6 file is missing.
        KeyError: If the 2019 table lacks ``UITKMINAOW`` or ``INWONER``, or
            the version 6 file lacks ``UITKMINAOW_HH``.
    """
    version = 7  # version number of the file written by this step
    missing_code = -99997  # sentinel in the source table, treated as missing

    data_2019 = zipcode_data_2019()
    data = pd.read_csv("Data/zipcodedata_version_6_nanincluded.csv")

    numerator = data_2019["UITKMINAOW"].replace(missing_code, np.nan)
    inhabitants = data_2019["INWONER"].replace(missing_code, np.nan)

    # Insert before dropping: position 18 is counted while UITKMINAOW_HH is
    # still part of the frame, exactly as the column layout of version 6.
    data.insert(18, "P_UITKMINAOW", numerator / inhabitants)
    data = data.drop(["UITKMINAOW_HH"], axis=1)

    data.to_csv(f"Data/zipcodedata_version_{version}_nanincluded.csv", index=False)
    print(data)
def main_preprocessing_version_5():
    """Create version 5 of the zipcode dataset with per-household ratios.

    Reads ``Data/zipcodedata_version_4_nanincluded.csv`` and replaces two
    columns by their value divided by the 2019 ``AANTAL_HH`` column (with the
    sentinel -99997 replaced by NaN):

    * ``INWONER``    -> ``INWONER_HH``    (inserted at position 1)
    * ``UITKMINAOW`` -> ``UITKMINAOW_HH`` (inserted at position 18)

    The result is written to ``Data/zipcodedata_version_5_nanincluded.csv``
    without the index and printed.

    Only the 2019 table is needed here, so the 2017 table is not loaded.

    Raises:
        FileNotFoundError: If the version 4 file is missing.
        KeyError: If ``AANTAL_HH``, ``INWONER`` or ``UITKMINAOW`` is absent
            from the table it is read from.
    """
    version = 5  # version number of the file written by this step
    missing_code = -99997  # sentinel in the source table, treated as missing

    data_2019 = zipcode_data_2019()
    frame = pd.read_csv("Data/zipcodedata_version_4_nanincluded.csv")

    households = data_2019["AANTAL_HH"].replace(missing_code, np.nan)

    # The insert/drop order matters: each insert position is counted against
    # the column layout that exists at that moment.
    frame.insert(1, "INWONER_HH", frame["INWONER"] / households)
    frame = frame.drop(["INWONER"], axis=1)
    frame.insert(18, "UITKMINAOW_HH", frame["UITKMINAOW"] / households)
    frame = frame.drop(["UITKMINAOW"], axis=1)

    frame.to_csv(f"Data/zipcodedata_version_{version}_nanincluded.csv", index=False)
    print(frame)
def main_preprocessing_version_9():
    """Create version 9 of the zipcode dataset as a re-save of version 8.

    Reads ``Data/zipcodedata_version_8_nanincluded.csv`` and writes the same
    frame to ``Data/zipcodedata_version_9_nanincluded.csv`` without the
    index, then prints it. No column is added, removed or changed; a drop of
    ``log_median_inc`` was considered at this step but is disabled.

    This step only needs the version 8 file; the 2017/2019 source tables are
    deliberately not loaded.

    Raises:
        FileNotFoundError: If the version 8 file is missing.
    """
    version = 9  # version number of the file written by this step

    data = pd.read_csv("Data/zipcodedata_version_8_nanincluded.csv")

    data.to_csv(f"Data/zipcodedata_version_{version}_nanincluded.csv", index=False)
    print(data)
def main_preprocessing_version_6():
    """Create version 6 of the zipcode dataset with the household count.

    Reads ``Data/zipcodedata_version_5_nanincluded.csv``, drops
    ``INWONER_HH`` and inserts the 2019 ``AANTAL_HH`` column (with the
    sentinel -99997 replaced by NaN) at column position 1. The result is
    written to ``Data/zipcodedata_version_6_nanincluded.csv`` without the
    index and printed.

    Only the 2019 table is needed here, so the 2017 table is not loaded.

    Raises:
        FileNotFoundError: If the version 5 file is missing.
        KeyError: If the version 5 file lacks ``INWONER_HH`` or the 2019
            table lacks ``AANTAL_HH``.
    """
    version = 6  # version number of the file written by this step
    missing_code = -99997  # sentinel in the source table, treated as missing

    data_2019 = zipcode_data_2019()
    data = pd.read_csv("Data/zipcodedata_version_5_nanincluded.csv")
    data = data.drop(["INWONER_HH"], axis=1)

    households = data_2019["AANTAL_HH"].replace(missing_code, np.nan)
    # pandas aligns the Series on the row index -- this assumes both tables
    # share the same row index (TODO confirm).
    data.insert(1, "AANTAL_HH", households)

    data.to_csv(f"Data/zipcodedata_version_{version}_nanincluded.csv", index=False)
    print(data)
import numpy as np
from DataSets import (
    zipcode_data_2017,
    zipcode_data_2019,
    downloaded_zipcode_2017,
    medianincome_zipcode_data,
)
import pandas as pd
import sys

# Load the source tables: the 2019 zipcode table is the base frame, the
# other three supply additional columns.
zipcode_df = zipcode_data_2019()
cbs_2017 = zipcode_data_2017()
cbs_2017_extra = downloaded_zipcode_2017()
median = medianincome_zipcode_data()

print(zipcode_df)

# Remove every column that is not relevant for the analysis.
zipcode_df = zipcode_df.drop(
    [
        'AANTAL_HH',
        'TOTHH_EENP',
        'TOTHH_MPZK',
        'HH_EENOUD',
        'HH_TWEEOUD',
        'WONING',
        'WON_MRGEZ',
        'GEBOORTE',
        'STED',
        'WONVOOR45',
        'WON_4564',
        'WON_6574',
        'WON_7584',
        'WON_8594',
        'WON_9504',
        'WON_0514',
        'WON_1524',
    ],
    axis=1,
)

zipcode_df = zipcode_df.assign(
    # Extra income-related variables.
    P_LINK_HH=cbs_2017_extra['P_LINK_HH'],
    P_HINK_HH=cbs_2017_extra['P_HINK_HH'],
    # Variables from the 2017 CBS file (provided by UFS). Only AV5_CAFE is
    # active; AFS_CAFE, AV1_CAFE, AV3_CAFE and AFS_CAFTAR are disabled.
    AV5_CAFE=cbs_2017['AV5_CAFE'],
)
# Sentinel value in the source tables that this pipeline treats as missing.
_V4_MISSING_CODE = -99997


def _v4_sum_columns(frame, columns):
    """Add the named columns of *frame* left to right and return the Series."""
    total = frame[columns[0]]
    for column in columns[1:]:
        total = total + frame[column]
    return total


def _v4_shares(counts):
    """Return a frame of ``P_<column>`` shares of the row total of *counts*.

    Rows containing a NaN are dropped before dividing; they come back as NaN
    when the shares are aligned onto a frame that still has those rows.
    """
    complete = counts.dropna()
    columns = list(complete.columns)
    total = _v4_sum_columns(complete, columns)
    return pd.DataFrame({f"P_{name}": complete[name] / total for name in columns})


def _v4_food_totals(data_2017):
    """Return ``{"AV1_FOOD": ..., "AV3_FOOD": ..., "AV5_FOOD": ...}``.

    Each total is CAFE + CAFTAR + RESTAU for that prefix, with the sentinel
    replaced by NaN first, so a row with any missing part gets a NaN total.
    NOTE: hotel counts (AV5_HOTEL) are not part of any total.
    """
    totals = {}
    for prefix in ("AV1", "AV3", "AV5"):
        parts = [f"{prefix}_{kind}" for kind in ("CAFE", "CAFTAR", "RESTAU")]
        cleaned = data_2017[parts].replace(_V4_MISSING_CODE, np.nan)
        totals[f"{prefix}_FOOD"] = _v4_sum_columns(cleaned, parts)
    return totals


def main_preprocessing_version_4():
    """Create version 4 of the zipcode dataset from version 3 and CBS tables.

    The output frame starts from the ``pc4`` column of the 2019 table and
    gains, in this order:

    * ``INWONER`` from the version 3 file;
    * ``P_MAN`` / ``P_VROUW``: gender shares from the 2019 ``MAN`` and
      ``VROUW`` columns;
    * ``P_INW_014`` ... ``P_INW_65PL``: age-group shares from version 3;
    * ``AV1_FOOD`` / ``AV3_FOOD`` / ``AV5_FOOD``: sums of food facilities
      from the 2017 table (1, 3 and 5 km radius);
    * a fixed list of other columns copied from version 3, with
      ``median_inc`` replaced by its natural logarithm ``log_median_inc``.

    The frame is written, NaNs included, to
    ``Data/zipcodedata_version_4_nanincluded.csv`` without the index. The
    frame and its row count before and after dropping NaN rows are printed;
    the NaN-free frame itself is not saved.

    All columns are combined through pandas index alignment -- this assumes
    the three tables share the same row index (TODO confirm).

    Raises:
        FileNotFoundError: If the version 3 file is missing.
        KeyError: If a required column is absent from its source table.
    """
    version = 4  # version number of the file written by this step

    data_2017 = zipcode_data_2017()
    data_2019 = zipcode_data_2019()
    data_v3 = pd.read_csv("Data/zipcodedata_version_3_nanincluded.csv")

    # Start from pc4 so the number of rows is fixed up front.
    final_data = pd.DataFrame(data_2019["pc4"], columns=["pc4"])
    final_data["INWONER"] = data_v3["INWONER"]

    # Gender distribution.
    gender_counts = data_2019[["MAN", "VROUW"]].replace(_V4_MISSING_CODE, np.nan)
    for name, share in _v4_shares(gender_counts).items():
        final_data[name] = share

    # Age distribution.
    age_counts = data_v3[["INW_014", "INW_1524", "INW_2544", "INW_4564", "INW_65PL"]]
    for name, share in _v4_shares(age_counts).items():
        final_data[name] = share

    # Food facilities within each radius.
    for name, total in _v4_food_totals(data_2017).items():
        final_data[name] = total

    # Other relevant variables, copied unchanged from version 3.
    variable_names = [
        "OAD",
        "P_NL_ACHTG",
        "P_WE_MIG_A",
        "P_NW_MIG_A",
        "GEM_HH_GR",
        "UITKMINAOW",
        "P_LINK_HH",
        "P_HINK_HH",
        "median_inc",
    ]
    for name in variable_names:
        final_data[name] = data_v3[name]

    # Only the log of the median income is kept in the output.
    final_data["log_median_inc"] = np.log(final_data["median_inc"])
    final_data = final_data.drop(columns=["median_inc"])

    final_data.to_csv(f"Data/zipcodedata_version_{version}_nanincluded.csv", index=False)
    print(final_data)

    # Report how many rows would survive a NaN filter (not saved).
    print(len(final_data["pc4"]))
    final_data = final_data.dropna()
    print(len(final_data["pc4"]))