Example #1
def compare_input_data(
        input_h5="./Simulation_engine/dummy_data.h5",
        input_h5_b="./Simulation_engine/dummy_data.h5",
        name_variables=("rfr", "irpp", "nbptr"),
        PERIOD=None,
):
    if PERIOD is None:
        PERIOD = annee_de_calcul
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    simulation_base_deciles, dictionnaire_datagrouped = simulation(
        PERIOD, DUMMY_DATA, TBS)
    df = dictionnaire_datagrouped["foyer_fiscal"][["wprm"]]
    for nv in name_variables:
        df["{}_base".format(nv)] = simulation_base_deciles.calculate(
            nv, PERIOD)
    isdif = False
    data2 = pandas.read_hdf(input_h5_b)
    col = "b"
    newsim, ddg2 = simulation(PERIOD, data2, TBS)
    for nv in name_variables:
        df["{}_{}".format(nv, col)] = newsim.calculate(nv, PERIOD)

        # Flag a difference if any foyer fiscal deviates from the base run by more than 0.01
        # in either direction
        diff = df["{}_{}".format(nv, col)] - df["{}_base".format(nv)]
        isdif |= len(df[diff > 0.01]) + len(df[diff < -0.01])
    return not isdif
Example #2
def test_homemade_nbptr_function(
    reform_config_base_2020, nbptr_parametres_par_defaut, various_cas_types
):
    # Check that the nbptr and irpp results are the same as with the default function
    period = "2020"
    data = dataframe_from_cas_types_description(various_cas_types)
    tbs_reforme_sans_nbptr = IncomeTaxReform(
        FranceTaxBenefitSystem(), reform_config_base_2020, period
    )
    tbs_reforme_avec_nbptr = IncomeTaxReform(
        FranceTaxBenefitSystem(),
        {
            "impot_revenu": {
                **(reform_config_base_2020["impot_revenu"]),
                **nbptr_parametres_par_defaut,
            }
        },
        period,
    )

    sim_sans_nbptr, _ = simulation(period, data, tbs_reforme_sans_nbptr)
    sim_avec_nbptr, _ = simulation(period, data, tbs_reforme_avec_nbptr)

    print("sans", sim_sans_nbptr.calculate("nbptr", period))
    print("avec", sim_avec_nbptr.calculate("nbptr", period))

    assert array_equal(
        sim_sans_nbptr.calculate("nbptr", period),
        sim_avec_nbptr.calculate("nbptr", period),
    )
    assert array_equal(
        sim_sans_nbptr.calculate("irpp", period),
        sim_avec_nbptr.calculate("irpp", period),
    )
def generate_default_results():
    # Keeping computations short with option to keep file under 1000 FF
    # DUMMY_DATA = DUMMY_DATA[(DUMMY_DATA["idmen"] > 2500) & (DUMMY_DATA["idmen"] < 7500)]
    bulk_data_simulation, data_by_entity = simulation(PERIOD, DUMMY_DATA, TBS)
    # Precompute the base case on the population for the cache
    base_results = data_by_entity["foyer_fiscal"][["wprm", "idfoy"]]
    base_results["avant"] = bulk_data_simulation.calculate("irpp", PERIOD)
    simulation_plf_deciles = simulation(PERIOD, DUMMY_DATA, TBS_PLF)
    base_results["plf"] = simulation_plf_deciles[0].calculate("irpp", PERIOD)
    base_results[["idfoy", "avant", "plf", "wprm"]].to_csv(
        "base_results.csv", index=False
    )
    return base_results
def test_sim_pop_dict_content(reform):
    simulation_reform = simulation(PERIOD, DUMMY_DATA, reform)
    comp_result = compare(PERIOD, {"apres": simulation_reform})
    assert "total" in comp_result
    assert "deciles" in comp_result
    assert "frontieres_deciles" in comp_result
    assert len(comp_result["frontieres_deciles"]) == len(
        comp_result["deciles"])
    assert "foyers_fiscaux_touches" in comp_result
    # assert len(comp_result["deciles"]) == 10  # removed because the cas type description does not necessarily yield 10 deciles
    for key in ["avant", "apres", "plf"]:
        assert key in comp_result["total"]
        assert key in comp_result["deciles"][0]
    for key in ["avant_to_apres", "avant_to_plf", "plf_to_apres"]:
        assert key in comp_result["foyers_fiscaux_touches"]
        for type_touche, nb_people in comp_result["foyers_fiscaux_touches"][
                key].items():
            assert type_touche in [
                "gagnant",
                "neutre",
                "perdant",
                "perdant_zero",
                "neutre_zero",
            ]
            assert isinstance(nb_people, int)
def test_sim_pop_dict_content(reform, requested_simulations):
    simulation_reform = simulation(PERIOD, DUMMY_DATA, reform)
    comp_result = compare(PERIOD, {"apres": simulation_reform})
    assert "total" in comp_result
    assert "deciles" in comp_result
    assert "frontieres_deciles" in comp_result
    assert len(comp_result["frontieres_deciles"]) == len(
        comp_result["deciles"])
    assert "foyers_fiscaux_touches" in comp_result
    # assert len(comp_result["deciles"]) == 10  # removed because the cas type description does not necessarily yield 10 deciles
    for key in requested_simulations:
        assert key in comp_result["total"]
        assert key in comp_result["deciles"][0]
    for index_key_1 in range(len(requested_simulations)):
        for index_key_2 in range(index_key_1 + 1, len(requested_simulations)):
            key = (requested_simulations[index_key_1] + "_to_" +
                   requested_simulations[index_key_2])
            # list of keys checked can be for example ["avant_to_apres", "avant_to_plf", "plf_to_apres"]
            assert key in comp_result["foyers_fiscaux_touches"]
            for type_touche, nb_people in comp_result[
                    "foyers_fiscaux_touches"][key].items():
                assert type_touche in [
                    "gagnant",
                    "neutre",
                    "perdant",
                    "perdant_zero",
                    "neutre_zero",
                ]
                assert isinstance(nb_people, int)
def test_sim_base_cas_types_dict_content_ok(reform, requested_simulations):
    simulation_reform = simulation(PERIOD, CAS_TYPE, reform)
    simulations_cas_types = simulations_reformes_par_defaut_castypes
    simulations_cas_types["apres"] = simulation_reform
    comp_result = compare(PERIOD, simulations_cas_types, compute_deciles=False)
    assert "total" in comp_result
    assert "res_brut" in comp_result
    # assert len(comp_result["deciles"]) == 10  # removed because the cas type description does not necessarily yield 10 deciles
    for key in requested_simulations:
        assert key in comp_result["total"]
        assert key in comp_result["res_brut"]
        assert len(comp_result["res_brut"][key]) == 6
def generate_default_results():
    # Precompute the base case on the population for the cache
    base_results = None
    liste_base_reformes = []
    for reforme in TBS_DEFAULT:
        liste_base_reformes += [reforme]
        bulk_data_simulation, data_by_entity = simulation(
            PERIOD, DUMMY_DATA, TBS_DEFAULT[reforme])
        if base_results is None:
            base_results = data_by_entity["foyer_fiscal"][["wprm", "idfoy"]]
        base_results[reforme] = bulk_data_simulation.calculate("irpp", PERIOD)

    base_results[["idfoy"] + liste_base_reformes + ["wprm"]].to_csv(
        "base_results.csv", index=False)
    return base_results
Example #8
def test_zero_nbptr(reform_config_base_2020, nbptr_zero, various_cas_types):
    # Check that the nbptr results are indeed zero for everyone when all the nbptr
    # parameters are set to zero
    period = "2020"
    data = dataframe_from_cas_types_description(various_cas_types)
    tbs_reforme_avec_nbptr = IncomeTaxReform(
        FranceTaxBenefitSystem(),
        {"impot_revenu": {**(reform_config_base_2020["impot_revenu"]), **nbptr_zero}},
        period,
    )

    sim_avec_nbptr, _ = simulation(period, data, tbs_reforme_avec_nbptr)

    resultats_nbptr = sim_avec_nbptr.calculate("nbptr", period)

    assert not resultats_nbptr.any()
Example #9
def test_deux_adultes_ancien_combattants_deux_enfants(reform_config_base_2020):
    # data
    foyer = {
        "declarants": [
            {
                "ancienCombattant": True,
                "invalide": False,
                "parentIsole": False,
                "retraite": False,
                "veuf": False
            },
            {
                "ancienCombattant": True,
                "invalide": False,
                "parentIsole": False,
                "retraite": False,
                "veuf": False
            }
        ],
        "personnesACharge": [
            {
                "chargePartagee": False,
                "invalide": False
            },
            {
                "chargePartagee": False,
                "invalide": False
            }
        ],
        "residence": "metropole",
        "revenuImposable": 120000
    }
    data = dataframe_from_cas_types_description([foyer])
    period = "2020"

    # French tax law + income tax (IR) reform
    tbs_reforme_impot_revenu = IncomeTaxReform(
        FranceTaxBenefitSystem(), reform_config_base_2020, period
    )
    built_simulation, _dict_data_by_entity = simulation(
        period, data, tbs_reforme_impot_revenu
    )

    nbptr = built_simulation.calculate("nbptr", period)
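    # Expected breakdown, assuming standard quotient familial rules: 2 shares for the two
    # declarants + 0.5 per dependant for the first two dependants + 0.5 for the veteran
    # half-share (granted once per household) = 3.5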
    assert nbptr == [3.5]
def test_sim_base_cas_types_dict_content_ok(reform):
    simulation_reform = simulation(PERIOD, CAS_TYPE, reform)
    comp_result = compare(
        PERIOD,
        {
            "avant": simulation_base_castypes,
            "plf": simulation_plf_castypes,
            "apres": simulation_reform,
        },
        compute_deciles=False,
    )
    assert "total" in comp_result
    assert "res_brut" in comp_result
    # assert len(comp_result["deciles"]) == 10  # removed because the cas type description does not necessarily yield 10 deciles
    for key in ["avant", "apres", "plf"]:
        assert key in comp_result["total"]
        assert key in comp_result["res_brut"]
        assert len(comp_result["res_brut"][key]) == 6
Example #11
def ajustement_h5(
    input_h5="./Simulation_engine/dummy_data.h5",
    output_h5="./Simulation_engine/dummy_data_ajuste.h5",
    distribution_rfr_population="./Simulation_engine/Calib/ResFinalCalibSenat.csv",
    PERIOD=None,
):
    if PERIOD is None:
        PERIOD = annee_de_calcul
    ajuste_h5 = output_h5
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    # Keeping computations short with option to keep file under 1000 FF
    # DUMMY_DATA = DUMMY_DATA[DUMMY_DATA["idmen"] < 1000]
    simulation_base_deciles = simulation(PERIOD, DUMMY_DATA, TBS)
    df = aggregats_ff(PERIOD, simulation_base_deciles).sort_values(by="rfr")
    print("{} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle ".format(
        len(df[df["rfr"] > 0.01]),
        len(df),
        100 - 100 * len(df[df["rfr"] > 0.01]) / len(df),
    ))

    # Step 1: adjust the number of foyers with zero income...
    oldweight = 1 - df[df["rfr"] > 0.01]["wprm"].sum() / df["wprm"].sum()
    targetweight = 0.06
    redweightifrfr0 = targetweight * (1 - oldweight) / oldweight / (
        1 - targetweight)
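    # Derivation of the factor: if the zero-RFR group holds a share w0 of the total weight and its
    # weights are multiplied by f, its new share is f*w0 / (f*w0 + 1 - w0); solving for the target
    # share t gives f = t * (1 - w0) / (w0 * (1 - t)), which is the expression above.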
    print(
        "Non en fait {} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle. Je vais les ajuster."
        .format(
            df[df["rfr"] > 0.01]["wprm"].sum(),
            df["wprm"].sum(),
            100 - 100 * df[df["rfr"] > 0.01]["wprm"].sum() / df["wprm"].sum(),
        ))
    print("old : {} new : {} adj : {}".format(oldweight, targetweight,
                                              redweightifrfr0))

    # Weight-reduction adjustment
    df["adjwstep0"] = 1
    df["realwprm"] = df["wprm"]
    df.loc[df["rfr"] < 0.01, "adjwstep0"] = redweightifrfr0
    df.loc[df["rfr"] < 0.01, "realwprm"] = df["wprm"] * redweightifrfr0
    # Calibration of the total number of foyers fiscaux
    target_foyers_fiscaux = 38_332_977
    # src : https://www.impots.gouv.fr/portail/statistiques (2018)
    adjust_wprm = target_foyers_fiscaux / df["realwprm"].sum()
    df["realwprm"] = df["realwprm"] * adjust_wprm
    print(
        "Non en fait {} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle "
        .format(
            df[df["rfr"] > 0.01]["wprm"].sum(),
            df["wprm"].sum(),
            100 - 100 * df[df["rfr"] > 0]["wprm"].sum() / df["wprm"].sum(),
        ))
    # Step 1.1: adjust the first decile (for now nothing is done here, it has little real impact)

    # Step 2 : PBP (pareto by parts)
    # Official statistics
    so = pandas.read_csv(distribution_rfr_population)
    # The file must contain:
    # Column Rk: revenu fiscal de référence (RFR) threshold
    # Column Nk: percentage of foyers fiscaux whose RFR is >= the Rk column
    # Column Ark: average RFR of the foyers fiscaux whose RFR is >= the Rk column (only used for
    # the distribution of the highest decile)
    # We now determine the distribution for everyone:
    # 2.0 - associate each observation with its running weight...
    totw = df["realwprm"].sum()
    df = df.sort_values(by="rfr")
    df["nw"] = df["realwprm"] / totw  # normalized weight (total = 1)
    # Running cumulative sum of nw, taken at the midpoint of each observation's weight band
    df["rsnw"] = df["nw"].cumsum() - df["nw"] / 2
    # 2.1 - in the first decile: the exact ERFS values times a scalar factor chosen so that the
    # first decile equals the target
    targetFirstDec = so["Rk"][1]
    limWeightFirstDec = so["Nk"][1]
    limOrigFirstDec = max(df[df["rsnw"] <= 1 - limWeightFirstDec]["rfr"])
    df["adjrevstep2"] = 1
    df.loc[df["rsnw"] <= 1 - limWeightFirstDec,
           "adjrevstep2"] = (targetFirstDec / limOrigFirstDec)
    # 2.2 - in every other bracket (except the last one): the distribution restricted to an interval
    # is a Pareto law whose first parameter is the start of the interval and whose second parameter
    # is the one that yields the right number of people in the interval
    # Determination of that parameter
    sonk = so["Nk"].values
    # because I still don't know how to iterate row by row over a DataFrame
    sork = so["Rk"].values
    paramsPareto = [-1]
    for i in range(1, len(sonk) - 1):
        n0 = sonk[i]
        n1 = sonk[i + 1]
        r0 = sork[i]
        r1 = sork[i + 1]

        newparam = math.log(n1 / n0) / math.log(r0 / r1)
        paramsPareto += [newparam]
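        # Assuming a Pareto survival function N(r) proportional to r**(-alpha) within the bracket,
        # n1 / n0 = (r1 / r0)**(-alpha), hence alpha = log(n1/n0) / log(r0/r1) as computed above.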

    # 2.3 - in the last bracket: take the Pareto parameter that matches the mean of the last bracket
    # The mean of a Pareto distribution is: esp = (1 + 1/(k-1)) * xm
    #  hence k = 1/(esp/xm - 1) + 1
    lastaverage = so["dArk"].values[-1] * 1000
    lastthresh = sork[-1]
    paramsPareto += [1 / (lastaverage / lastthresh - 1) + 1]
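    # Worked example with hypothetical numbers: if the last threshold is xm = 100_000 and the mean
    # above it is esp = 250_000, then k = 1 / (250_000 / 100_000 - 1) + 1 = 5/3, and indeed
    # (1 + 1 / (k - 1)) * xm = 250_000.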
    so["paramPareto"] = paramsPareto

    df["realrfr"] = df.apply(reverseCDF(so), axis=1)
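    # reverseCDF(so) presumably maps each row's cumulative weight position (rsnw) back to an RFR
    # through the inverse of the piecewise-Pareto CDF parameterised above.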
    df["realrfrw"] = df["realrfr"] * df["realwprm"]

    # OK now that this great function works (does it? Why not try it? comparing it now to the original function??)
    # I can generate the REAL rfr

    # End of step 2.

    testerrorvalues(df, "rfr", "wprm")
    aa = testerrorvalues(df, "realrfr", "realwprm")
    print("Aggregated Error % after calibration :", aa)
    # OK, now df contains the correct rfr and the correct realwprm
    df["total_ajust_revenu"] = 1
    df.loc[df["rfr"] > 0, "total_ajust_revenu"] = df["realrfr"] / df["rfr"]
    df["total_ajust_poids"] = df["realwprm"] / df["wprm"]

    # Now adjust the .h5 file
    to_transform = pandas.read_hdf(input_h5)
    tt_colonnes = to_transform.columns
    df_changes = df[["idfoy", "total_ajust_revenu", "total_ajust_poids"]]
    to_transform = to_transform.merge(df_changes, on="idfoy")
    colspoids = ["wprm"]
    colsrevenus = [
        "chomage_brut",
        "pensions_alimentaires_percues",
        "rag",
        "ric",
        "rnc",
        "salaire_de_base",
        "f4ba",
        # "loyer",
        # "taxe_habitation",
    ]
    colsrevenus = [col for col in colsrevenus if col in to_transform.columns]
    for cp in colspoids:
        to_transform[cp] = to_transform[cp] * to_transform["total_ajust_poids"]
    for cp in colsrevenus:
        to_transform[
            cp] = to_transform[cp] * to_transform["total_ajust_revenu"]
    to_transform = to_transform[tt_colonnes]
    to_transform.to_hdf(ajuste_h5, key="input")
Example #12
def test_h5_input(
    input_h5="./Simulation_engine/dummy_data.h5",
    name_variables=("rfr", "irpp", "nbptr"),
    aggfunc="sum",
    compdic=None,
    is_plf=False,
    PERIOD=None,
):
    if PERIOD is None:
        PERIOD = annee_de_calcul
    TBS = TBS_DEFAULT["plf"] if is_plf else FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    simulation_base_deciles = simulation(PERIOD, DUMMY_DATA, TBS)
    df = aggregats_ff(PERIOD, simulation_base_deciles,
                      name_variables).sort_values(by="rfr")
    if aggfunc == "sum":  # For the sum, compute the % errors on the distribution.
        testerrorvalues(df)
    aggs_to_compute = ["wprm", "salaire_de_base", "retraite_brute"
                       ] + list(name_variables)
    val_donnees_pac_agg = 0
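    # If reference values are given for the dependant-count boxes (nbF, nbG, nbH, nbJ, nbR),
    # sum them so the cumulated number of children can be cross-checked at the end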
    trpac_agg = [
        compdic[ag] for ag in ["nbF", "nbG", "nbH", "nbJ", "nbR"]
        if compdic is not None and ag in compdic
    ]
    val_reelle_pac_agg = sum(trpac_agg) if len(trpac_agg) else None
    for ag in aggs_to_compute:
        if aggfunc == "sum":
            nom_a_afficher = "Total aggrégé"
            if ag != "wprm":
                val_donnees = (df[ag] * df["wprm"]).sum()
            else:
                val_donnees = (df[ag]).sum()
        elif aggfunc == "countnonzero":
            if ag != "wprm":
                nom_a_afficher = "Non nuls"
                val_donnees = (df[df[ag] != 0]["wprm"]).sum()
            else:
                nom_a_afficher = "Nombre FF (c'est comme ça le count sur wprm)"
                val_donnees = df[ag].count()
        else:
            raise ValueError(
                "Only aggregation functions supported are sum and countnonzero. The rest is not very good if you want my opinion"
            )
        val_reelle = compdic[
            ag] if compdic is not None and ag in compdic else None
        print("{} {} : {:.0f} {} {}".format(
            nom_a_afficher,
            ag,
            val_donnees,
            val_reelle if val_reelle is not None else "",
            "{:.2f}%".format((val_donnees / val_reelle - 1) *
                             100) if val_reelle is not None else "",
        ))
        if ag in ["nbF", "nbG", "nbH", "nbJ", "nbR"]:
            val_donnees_pac_agg += val_donnees
    if val_reelle_pac_agg is not None:
        print("{} {} : {:.0f} {} {}".format(
            nom_a_afficher,
            "Enfants cumules",
            val_donnees_pac_agg,
            val_reelle_pac_agg if val_reelle_pac_agg is not None else "",
            "{:.2f}%".format((val_donnees_pac_agg / val_reelle_pac_agg - 1) *
                             100) if val_reelle_pac_agg is not None else "",
        ))
Example #13
def test_useless_variables(
        input_h5="./Simulation_engine/dummy_data.h5",
        outfile_path=None,
        name_variables=("rfr", "irpp", "nbptr"),
        PERIOD=None,
):
    if PERIOD is None:
        PERIOD = annee_de_calcul
    pandas.options.mode.chained_assignment = None
    list_useless_variables = []
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    simulation_base_deciles, dictionnaire_datagrouped = simulation(
        PERIOD, DUMMY_DATA, TBS)
    df = dictionnaire_datagrouped["foyer_fiscal"][["wprm"]]
    for nv in name_variables:
        df["{}_base".format(nv)] = simulation_base_deciles.calculate(
            nv, PERIOD)
    for col in DUMMY_DATA.columns:
        if col == "wprm":  # we don't want to remove this one
            continue
        isdif = False
        data_wo_column = DUMMY_DATA[[
            k for k in DUMMY_DATA.columns if k != col
        ]]
        try:
            newsim, ddg2 = simulation(PERIOD, data_wo_column, TBS)
            resvar = {nv: {} for nv in name_variables}
            for nv in name_variables:
                df["{}_{}".format(nv, col)] = newsim.calculate(nv, PERIOD)
                resvar[nv]["countdif"] = len(df[
                    df["{}_{}".format(nv, col)] != df["{}_base".format(nv)]])
                # print(col,nv,resvar[nv]["countdif"])
                # print(df[df["{}_{}".format(nv,col)]!=df["{}_base".format(nv)]],len(df[df["{}_{}".format(nv,col)]!=df["{}_base".format(nv)]]))
                isdif |= resvar[nv]["countdif"]
            if not isdif:
                list_useless_variables += [col]
            print(
                col,
                "is",
                "not" if isdif else "",
                "useless",
                "{}".format([resvar[nv]["countdif"]
                             for nv in name_variables]) if isdif else "",
            )
        except Exception:
            print(col, "is definitely not useless")
    data_wo_useless = DUMMY_DATA[[
        k for k in DUMMY_DATA.columns if k not in list_useless_variables
    ]]
    # Re-run once with all the supposedly useless columns removed at the same time
    newsim, ddg2 = simulation(PERIOD, data_wo_useless, TBS)
    isdif = False
    for nv in name_variables:
        df["{}_sans_inutiles".format(nv)] = newsim.calculate(nv, PERIOD)
        isdif |= len(
            df[df["{}_sans_inutiles".format(nv)] != df["{}_base".format(nv)]])
    if isdif:
        print(
            "Removing all variables at once didn't work, good luck with that")
    else:
        if outfile_path is None:
            outfile_path = input_h5.replace(".h5", "_useful.h5")
        data_wo_useless.to_hdf(outfile_path, key="input")
        print(
            "It seems lots of columns don't do anything. Data with only useful columns was exported to {}"
            .format(outfile_path))
    return list_useless_variables