def main_preprocessing_version_8():
    """
    Build version 8 of the zipcode dataset from version 7.

    Reads "Data/zipcodedata_version_7_nanincluded.csv", adds the columns
    AFS_OPRIT, AFS_TRNOVS and AFS_TREINS taken from ``zipcode_data_2017()``
    (every -99997 in them is replaced by NaN), writes the result to
    "Data/zipcodedata_version_8_nanincluded.csv" and prints it.
    """
    version = 8  # version number of the file written by this step

    data_2017 = zipcode_data_2017()
    data = pd.read_csv("Data/zipcodedata_version_7_nanincluded.csv")

    # Every column is inserted at position 8, so each insert pushes the
    # previously inserted ones one place to the right.  The resulting column
    # order is therefore AFS_TREINS, AFS_TRNOVS, AFS_OPRIT.
    # NOTE(review): DataFrame.insert aligns the Series on the row index, not
    # on a zipcode key -- this presumably relies on both frames sharing the
    # same row order; confirm against zipcode_data_2017().
    for column in ("AFS_OPRIT", "AFS_TRNOVS", "AFS_TREINS"):
        # -99997 is treated as a missing-value marker and becomes NaN.
        data.insert(8, column, data_2017[column].replace(-99997, np.nan))

    data.to_csv(
        "Data/zipcodedata_version_{}_nanincluded.csv".format(version),
        index=False)
    print(data)
def main_preprocessing_version_7():
    """
    Build version 7 of the zipcode dataset from version 6.

    Reads "Data/zipcodedata_version_6_nanincluded.csv", adds the column
    P_UITKMINAOW (UITKMINAOW divided by INWONER, both taken from
    ``zipcode_data_2019()`` with -99997 replaced by NaN), drops the column
    UITKMINAOW_HH, writes the result to
    "Data/zipcodedata_version_7_nanincluded.csv" and prints it.
    """
    version = 7  # version number of the file written by this step

    data_2019 = zipcode_data_2019()
    data = pd.read_csv("Data/zipcodedata_version_6_nanincluded.csv")

    # -99997 is treated as a missing-value marker and becomes NaN, so the
    # ratio below is NaN wherever either input is missing.
    benefits = data_2019["UITKMINAOW"].replace(-99997, np.nan)
    inhabitants = data_2019["INWONER"].replace(-99997, np.nan)

    # The new column is inserted at position 18 *before* UITKMINAOW_HH is
    # dropped, so the position refers to the version-6 column layout.
    # NOTE(review): the Series is aligned on the row index, not on a zipcode
    # key -- confirm both frames share the same row order.
    data.insert(18, "P_UITKMINAOW", benefits / inhabitants)
    data = data.drop(["UITKMINAOW_HH"], axis=1)

    data.to_csv(
        "Data/zipcodedata_version_{}_nanincluded.csv".format(version),
        index=False)
    print(data)
def main_preprocessing_version_5():
    """
    Build version 5 of the zipcode dataset from version 4.

    Reads "Data/zipcodedata_version_4_nanincluded.csv" and replaces two
    columns by their per-household counterparts, using AANTAL_HH from
    ``zipcode_data_2019()`` (with -99997 replaced by NaN) as the divisor:

    * INWONER    -> INWONER_HH    (inserted at position 1)
    * UITKMINAOW -> UITKMINAOW_HH (inserted at position 18)

    The result is written to "Data/zipcodedata_version_5_nanincluded.csv"
    and printed.
    """
    version = 5  # version number of the file written by this step

    data_2019 = zipcode_data_2019()
    data = pd.read_csv("Data/zipcodedata_version_4_nanincluded.csv")

    # -99997 is treated as a missing-value marker and becomes NaN.
    # NOTE(review): the divisions below align on the row index, not on a
    # zipcode key -- confirm both frames share the same row order.
    households = data_2019["AANTAL_HH"].replace(-99997, np.nan)

    # Number of inhabitants per household.
    data.insert(1, "INWONER_HH", data["INWONER"] / households)
    data = data.drop(["INWONER"], axis=1)

    # Number of inhabitants receiving social benefits per household.
    # Position 18 is counted after INWONER has been swapped for INWONER_HH.
    data.insert(18, "UITKMINAOW_HH", data["UITKMINAOW"] / households)
    data = data.drop(["UITKMINAOW"], axis=1)

    data.to_csv(
        "Data/zipcodedata_version_{}_nanincluded.csv".format(version),
        index=False)
    print(data)
def main_preprocessing_version_9():
    """
    Build version 9 of the zipcode dataset from version 8.

    No transformation is applied at the moment: the function reads
    "Data/zipcodedata_version_8_nanincluded.csv", writes it unchanged
    (without the index) to "Data/zipcodedata_version_9_nanincluded.csv"
    and prints it.
    """
    version = 9  # version number of the file written by this step

    data = pd.read_csv("Data/zipcodedata_version_8_nanincluded.csv")

    # NOTE: an earlier revision had a disabled step here that dropped the
    # column "log_median_inc"; it is intentionally not applied.

    data.to_csv(
        "Data/zipcodedata_version_{}_nanincluded.csv".format(version),
        index=False)
    print(data)
def main_preprocessing_version_6():
    """
    Build version 6 of the zipcode dataset from version 5.

    Reads "Data/zipcodedata_version_5_nanincluded.csv", drops the column
    INWONER_HH, inserts AANTAL_HH from ``zipcode_data_2019()`` (with -99997
    replaced by NaN) at position 1, writes the result to
    "Data/zipcodedata_version_6_nanincluded.csv" and prints it.
    """
    version = 6  # version number of the file written by this step

    data_2019 = zipcode_data_2019()
    data = pd.read_csv("Data/zipcodedata_version_5_nanincluded.csv")
    data = data.drop(["INWONER_HH"], axis=1)

    # -99997 is treated as a missing-value marker and becomes NaN.
    # NOTE(review): the Series is aligned on the row index, not on a zipcode
    # key -- confirm both frames share the same row order.
    households = data_2019["AANTAL_HH"].replace(-99997, np.nan)
    data.insert(1, "AANTAL_HH", households)

    data.to_csv(
        "Data/zipcodedata_version_{}_nanincluded.csv".format(version),
        index=False)
    print(data)
def _plot_distribution(values):
    """Show a 30-bin histogram of ``values`` with a KDE curve on top."""
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
    # it is kept so the figures stay unchanged -- migrate to histplot or
    # displot once the seaborn version in use is confirmed.
    sns.distplot(values,
                 hist=True,
                 bins=30,
                 kde=True,
                 color="darkblue",
                 hist_kws={'edgecolor': 'black'},
                 kde_kws={'linewidth': 4})
    plt.show()


def main_statistics():
    """
    Show the distributions of several variables and print normality tests.

    Two inputs are used:

    * "Data/zipcodedata_KNN_version_4.csv", merged on "pc4" with AANTAL_HH
      from ``zipcode_data_2019()`` (with -99997 replaced by NaN); rows with
      any NaN are dropped.  INWONER and UITKMINAOW are plotted both as-is
      and divided by AANTAL_HH.
    * "Data/zipcodedata_version_5_nanincluded.csv"; for every column the
      Jarque-Bera statistic, its p-value and the number of non-NaN
      observations are printed.
    """
    knn_version = 4  # version of the KNN file used for the plots
    nan_version = 5  # version of the NaN-included file used for the tests

    data = pd.read_csv(
        "Data/zipcodedata_KNN_version_{}.csv".format(knn_version))
    data2 = pd.read_csv(
        "Data/zipcodedata_version_{}_nanincluded.csv".format(nan_version))

    # -99997 is treated as a missing-value marker and becomes NaN.
    data_2019 = zipcode_data_2019().replace(-99997, np.nan)
    data = data.merge(data_2019[["pc4", "AANTAL_HH"]], on='pc4')
    data = data.dropna()

    # Raw counts and their per-household counterparts, in the same order as
    # the figures were shown before.
    _plot_distribution(data["INWONER"])
    _plot_distribution(data["INWONER"] / data["AANTAL_HH"])
    _plot_distribution(data["UITKMINAOW"])
    _plot_distribution(data["UITKMINAOW"] / data["AANTAL_HH"])

    for var in data2.columns:
        # Use a NaN-free copy instead of dropping in place on a column that
        # was pulled out of the DataFrame.
        x = data2[var].dropna()

        # The result is indexed rather than unpacked so that it does not
        # matter how many values jarque_bera returns.
        JB = jarque_bera(x)
        print("%s has a JB-statistics of %.3f with p-value = %.3f" %
              (var, JB[0], JB[1]))
        print("observations: " + str(len(x)))
        print("")