Exemple #1
0
def test_homemade_nbptr_function(
    reform_config_base_2020, nbptr_parametres_par_defaut, various_cas_types
):
    """Check that passing the default nbptr parameters explicitly changes nothing.

    Two reforms are built for 2020: one from the base configuration alone and
    one where the default nbptr parameters are merged into "impot_revenu".
    Both must yield identical "nbptr" and "irpp" results.
    """
    period = "2020"
    data = dataframe_from_cas_types_description(various_cas_types)

    # Reference reform: base configuration only.
    tbs_reforme_sans_nbptr = IncomeTaxReform(
        FranceTaxBenefitSystem(), reform_config_base_2020, period
    )

    # Same reform, with the default nbptr parameters spelled out explicitly.
    systeme_de_base = FranceTaxBenefitSystem()
    impot_revenu_avec_nbptr = {
        **(reform_config_base_2020["impot_revenu"]),
        **nbptr_parametres_par_defaut,
    }
    tbs_reforme_avec_nbptr = IncomeTaxReform(
        systeme_de_base, {"impot_revenu": impot_revenu_avec_nbptr}, period
    )

    sim_sans_nbptr = simulation(period, data, tbs_reforme_sans_nbptr)[0]
    sim_avec_nbptr = simulation(period, data, tbs_reforme_avec_nbptr)[0]

    print("sans", sim_sans_nbptr.calculate("nbptr", period))
    print("avec", sim_avec_nbptr.calculate("nbptr", period))

    for variable in ("nbptr", "irpp"):
        assert array_equal(
            sim_sans_nbptr.calculate(variable, period),
            sim_avec_nbptr.calculate(variable, period),
        )
Exemple #2
0
def compare_input_data(
        input_h5="./Simulation_engine/dummy_data.h5",
        input_h5_b="./Simulation_engine/dummy_data.h5",
        name_variables=("rfr", "irpp", "nbptr"),
        PERIOD=None,
):
    """Tell whether two HDF5 inputs give the same simulation results.

    Both files are simulated with the same tax-benefit system and every
    variable in ``name_variables`` is compared row by row.

    Returns:
        True when no row differs by more than 0.01 (in either direction) for
        any of the variables, False otherwise.
    """
    if PERIOD is None:
        PERIOD = annee_de_calcul
    TBS = FranceTaxBenefitSystem()

    # Reference run.
    reference_data = pandas.read_hdf(input_h5)
    reference_sim, grouped_data = simulation(PERIOD, reference_data, TBS)
    df = grouped_data["foyer_fiscal"][["wprm"]]
    for nv in name_variables:
        df["{}_base".format(nv)] = reference_sim.calculate(nv, PERIOD)

    # Second run, on the file to compare.
    isdif = False
    other_data = pandas.read_hdf(input_h5_b)
    suffix = "b"
    other_sim, _ = simulation(PERIOD, other_data, TBS)
    for nv in name_variables:
        other_col = "{}_{}".format(nv, suffix)
        base_col = "{}_base".format(nv)
        df[other_col] = other_sim.calculate(nv, PERIOD)

        # Count rows above and below the tolerance band separately.
        nb_above = len(df[df[other_col] - df[base_col] > 0.01])
        nb_below = len(df[df[other_col] - df[base_col] < -0.01])
        isdif |= nb_above + nb_below
    return not isdif
Exemple #3
0
def test_h5_input(input_h5="./Simulation_engine/dummy_data.h5",
                  name_variables=("rfr", "irpp", "nbptr"),
                  aggfunc="sum",
                  compdic=None,
                  is_plf=False):
    """Print aggregates computed from an HDF5 input, next to reference values.

    The input file is simulated for 2018, aggregated per tax household by
    ``aggregats_ff`` and, for each of "wprm", "salaire_de_base",
    "retraite_brute" and ``name_variables``, one line is printed with the
    aggregate, the reference value (if any) and the relative gap in percent.

    Args:
        input_h5: path of the HDF5 file read with ``pandas.read_hdf``.
        name_variables: variables to aggregate in addition to the fixed ones.
        aggfunc: "sum" (weighted total; plain sum for "wprm") or
            "countnonzero" (sum of weights of non-zero rows; row count for
            "wprm").
        compdic: optional mapping from variable name to reference value.
        is_plf: use ``TBS_PLF`` instead of a fresh ``FranceTaxBenefitSystem``.

    Raises:
        ValueError: if ``aggfunc`` is neither "sum" nor "countnonzero".
    """
    # Validate first: the original code did `raise ("...")`, which raises a
    # TypeError (a str is not an exception) and only after the full simulation.
    if aggfunc not in ("sum", "countnonzero"):
        raise ValueError(
            "Only aggregation functions supported are sum and countnonzero. The rest is not very good if you want my opinion"
        )

    PERIOD = "2018"
    TBS = TBS_PLF if is_plf else FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    simulation_base_deciles = simulation(PERIOD, DUMMY_DATA, TBS)
    df = aggregats_ff(PERIOD, simulation_base_deciles,
                      name_variables).sort_values(by="rfr")
    if aggfunc == "sum":  # For sums, also report the % error on the distribution.
        testerrorvalues(df)

    reference = {} if compdic is None else compdic
    pac_columns = ("nbF", "nbG", "nbH", "nbJ", "nbR")
    aggs_to_compute = ["wprm", "salaire_de_base", "retraite_brute"
                       ] + list(name_variables)

    # Reference total over the dependants columns present in compdic.
    trpac_agg = [reference[ag] for ag in pac_columns if ag in reference]
    val_reelle_pac_agg = sum(trpac_agg) if trpac_agg else None
    val_donnees_pac_agg = 0

    for ag in aggs_to_compute:
        if aggfunc == "sum":
            nom_a_afficher = "Total aggrégé"
            if ag != "wprm":
                val_donnees = (df[ag] * df["wprm"]).sum()
            else:
                # The weight column is not weighted by itself.
                val_donnees = (df[ag]).sum()
        elif ag != "wprm":
            nom_a_afficher = "Non nuls"
            val_donnees = (df[df[ag] != 0]["wprm"]).sum()
        else:
            nom_a_afficher = "Nombre FF (c'est comme ça le count sur wprm)"
            val_donnees = df[ag].count()

        val_reelle = reference[ag] if ag in reference else None
        print("{} {} : {:.0f} {} {}".format(
            nom_a_afficher,
            ag,
            val_donnees,
            val_reelle if val_reelle is not None else "",
            "{:.2f}%".format((val_donnees / val_reelle - 1) *
                             100) if val_reelle is not None else "",
        ))
        if ag in pac_columns:
            val_donnees_pac_agg += val_donnees

    if val_reelle_pac_agg is not None:
        # Uses the label of the last aggregate printed, as before.
        print("{} {} : {:.0f} {} {}".format(
            nom_a_afficher,
            "Enfants cumules",
            val_donnees_pac_agg,
            val_reelle_pac_agg,
            "{:.2f}%".format((val_donnees_pac_agg / val_reelle_pac_agg - 1) *
                             100),
        ))
def test_decomposition_variables():
    """Check every node of the decomposition file names a known variable.

    Loads 'openfisca_dash_ui/decomposition.yaml' and walks the tree, asserting
    that each node's 'code' is a variable of the France tax-benefit system.
    """
    tbs = FranceTaxBenefitSystem()
    path = Path('openfisca_dash_ui/decomposition.yaml')
    yaml = YAML(typ='safe')
    decomposition = yaml.load(path)

    def check(tree):
        for node in tree:
            assert node['code'] in tbs.variables, node['code']
            # Leaf nodes may carry no 'children' key (or an empty one).
            check(node.get('children') or [])

    # The original defined `check` but never called it, so nothing was tested.
    check(decomposition)
def test_coefficient_proratisation_only_contract_periods_wide():
    """Check coefficient_proratisation month by month when simulating a whole year.

    The contract runs from 2017-11-1 to 2017-12-01 and the scenario period is
    the year 2017 ("wide").
    """
    tax_benefit_system = FranceTaxBenefitSystem()
    scenario = tax_benefit_system.new_scenario()
    parent1 = dict(
        salaire_de_base={'2017-11': 2300},
        effectif_entreprise=1,
        code_postal_entreprise="75001",
        categorie_salarie=u'prive_non_cadre',
        contrat_de_travail_debut='2017-11-1',
        contrat_de_travail_fin='2017-12-01',
        allegement_fillon_mode_recouvrement=u'progressif',
    )
    # wide: we simulate for the year
    scenario.init_single_entity(period='2017', parent1=parent1)
    simulation = scenario.new_simulation()

    expected_by_month = (('2017-11', 1), ('2017-12', 0), ('2017-10', 0))
    for month, expected in expected_by_month:
        assert_equal(
            simulation.calculate('coefficient_proratisation', month), expected)
    assert_equal(
        simulation.calculate_add('coefficient_proratisation', '2017'), 1)
Exemple #6
0
def test_zero_nbptr(reform_config_base_2020, nbptr_zero, various_cas_types):
    """Check that nbptr is zero for everyone when all its parameters are zero."""
    period = "2020"
    data = dataframe_from_cas_types_description(various_cas_types)

    # Base income-tax configuration overridden with the all-zero nbptr parameters.
    systeme_de_base = FranceTaxBenefitSystem()
    impot_revenu_zero = {
        **(reform_config_base_2020["impot_revenu"]),
        **nbptr_zero,
    }
    tbs_reforme_avec_nbptr = IncomeTaxReform(
        systeme_de_base, {"impot_revenu": impot_revenu_zero}, period
    )

    sim_avec_nbptr = simulation(period, data, tbs_reforme_avec_nbptr)[0]
    resultats_nbptr = sim_avec_nbptr.calculate("nbptr", period)

    assert not resultats_nbptr.any()
def scenar_values(
    minv, maxv, var_brute, var_nette, pourcentage_hausse=0.001, valeur_hausse=100
):
    """Compute ``var_nette`` over a grid of ``var_brute`` values in [minv, maxv].

    The grid comes from ``calcule_maillage_intervalle`` and is simulated for
    the year ``annee_de_calcul``.

    Returns:
        The grid restricted to the two columns ``var_brute`` and ``var_nette``.
    """
    grille = calcule_maillage_intervalle(
        var_brute, minv, maxv, pourcentage_hausse, valeur_hausse
    )
    periode = str(annee_de_calcul)
    systeme = FranceTaxBenefitSystem()
    # The grid is handed to `simulation` as is; the first element of its
    # result is the simulation object.
    resultat = simulation(periode, grille, systeme)
    grille[var_nette] = resultat[0].calculate_add(var_nette, periode)
    return grille[[var_brute, var_nette]]
Exemple #8
0
def test_deux_adultes_ancien_combattants_deux_enfants(reform_config_base_2020):
    """Two veteran adults with two dependent children must get nbptr == 3.5."""
    # Input data: both declarants share the same profile, as do both children.
    declarant = {
        "ancienCombattant": True,
        "invalide": False,
        "parentIsole": False,
        "retraite": False,
        "veuf": False,
    }
    personne_a_charge = {"chargePartagee": False, "invalide": False}
    foyer = {
        "declarants": [dict(declarant), dict(declarant)],
        "personnesACharge": [dict(personne_a_charge), dict(personne_a_charge)],
        "residence": "metropole",
        "revenuImposable": 120000,
    }
    data = dataframe_from_cas_types_description([foyer])
    period = "2020"

    # French law + income-tax reform
    tbs_reforme_impot_revenu = IncomeTaxReform(
        FranceTaxBenefitSystem(), reform_config_base_2020, period
    )
    built_simulation, _dict_data_by_entity = simulation(
        period, data, tbs_reforme_impot_revenu
    )

    nbptr = built_simulation.calculate("nbptr", period)
    assert nbptr == [3.5]
def test_coefficient_proratisation_only_contract_periods_narrow():
    """Check coefficient_proratisation month by month when simulating one month.

    The contract runs from 2017-11-01 to 2017-12-01 and the scenario period is
    the single month 2017-11 ("narrow").
    """
    tax_benefit_system = FranceTaxBenefitSystem()
    scenario = tax_benefit_system.new_scenario()
    parent1 = dict(
        salaire_de_base={'2017-11': 2300},
        effectif_entreprise=1,
        code_postal_entreprise="75001",
        categorie_salarie='prive_non_cadre',
        contrat_de_travail_debut={2017: '2017-11-01'},
        contrat_de_travail_fin={2017: '2017-12-01'},
        allegement_fillon_mode_recouvrement='progressif',
    )
    # narrow: we simulate for the month
    init_single_entity(scenario, period='2017-11', parent1=parent1)
    simulation = scenario.new_simulation()

    expected_by_month = (('2017-11', 1), ('2017-12', 0), ('2017-10', 0))
    for month, expected in expected_by_month:
        assert simulation.calculate('coefficient_proratisation', month) == expected
    assert simulation.calculate_add('coefficient_proratisation', '2017') == 1
def create_individu_for_inversion(year, revenu_type='net'):
    """Build and store the per-individual table used for the income inversion.

    Reads 'individus_<year>_post_01' from the 'erfs_fpr' store, renames the
    income columns according to ``revenu_type`` ('net' or 'imposable'), runs
    the successive ``create_*`` steps on the table and stores the resulting
    columns under 'individu_for_inversion_<year>'.
    """
    assert revenu_type in ['net', 'imposable']
    assert year is not None

    # Using data produced by preprocessing.build_merged_dataframes
    temporary_store = get_store(file_name='erfs_fpr')
    individus = temporary_store['individus_{}_post_01'.format(year)]

    # Only three target names depend on the income type.
    if revenu_type == 'net':
        chomage, retraite, salaire = (
            'chomage_net', 'retraite_nette', 'salaire_net')
    elif revenu_type == 'imposable':
        chomage, retraite, salaire = (
            'chomage_imposable', 'retraite_imposable', 'salaire_imposable')
    old_by_new_variables = {
        'chomage_i': chomage,
        'pens_alim_recue_i': 'pensions_alimentaires_percues',
        'rag_i': 'rag_net',
        'retraites_i': retraite,
        'ric_i': 'ric_net',
        'rnc_i': 'rnc_net',
        'salaires_i': salaire,
    }

    available_columns = individus.columns.tolist()
    for variable in old_by_new_variables:
        assert variable in available_columns, \
            "La variable {} n'est pas présente".format(variable)

    individus.rename(columns=old_by_new_variables, inplace=True)

    # Each step below works on `individus`; the order of the calls is kept.
    create_ages(individus, year)
    create_date_naissance(
        individus,
        age_variable=None,
        annee_naissance_variable='naia',
        mois_naissance='naim',
        year=year,
    )

    period = periods.period(year)
    # NOTE: the create_revenus step (and its 'taux_csg_remplacement' column)
    # is currently disabled.

    create_contrat_de_travail(
        individus, period=period, salaire_type=revenu_type)
    create_categorie_salarie(individus, period=period)

    tax_benefit_system = FranceTaxBenefitSystem()
    create_salaire_de_base(
        individus,
        period=period,
        revenu_type=revenu_type,
        tax_benefit_system=tax_benefit_system,
    )
    create_effectif_entreprise(individus, period=period)
    create_traitement_indiciaire_brut(
        individus,
        period=period,
        revenu_type=revenu_type,
        tax_benefit_system=tax_benefit_system,
    )

    # Columns exported, in the order they were produced, plus the salary
    # column matching the income type.
    kept_columns = [
        'age',
        'age_en_mois',
        'date_naissance',
        'contrat_de_travail',
        'heures_remunerees_volume',
        'categorie_salarie',
        'salaire_de_base',
        'effectif_entreprise',
        'traitement_indiciaire_brut',
        'primes_fonction_publique',
        'salaire_{}'.format(revenu_type),
    ]
    store_key = 'individu_for_inversion_{}'.format(year)
    temporary_store[store_key] = individus[kept_columns]
from openfisca_core.parameters import ParameterNode, Scale
from openfisca_france import FranceTaxBenefitSystem

# Tax-benefit system instantiated once at import time, and its parameter tree.
# NOTE(review): `parameters` is presumably the root passed to
# get_parameters_by_unit — confirm against the callers.
tax_benefit_system = FranceTaxBenefitSystem()
parameters = tax_benefit_system.parameters


def get_parameters_by_unit(parameter, parameters_by_unit=None):
    """
    Build a dictionnary collecting the legislation parameters according to their units
    """
    if parameters_by_unit is None:
        parameters_by_unit = dict(
            scale_none=list(),
            scale_currency=list(),
            none=list(),
            currency=list(),
            rate=list(),
            year=list(),
        )

    for sub_parameter in parameter.children.values():
        if isinstance(sub_parameter, ParameterNode):
            get_parameters_by_unit(sub_parameter, parameters_by_unit)
        else:
            if isinstance(sub_parameter, Scale):
                unit = sub_parameter.metadata.get('unit')
                rate_unit = sub_parameter.metadata.get('rate_unit')
                threshold_unit = sub_parameter.metadata.get('threshold_unit')
                if unit is not None:
                    raise ValueError(
Exemple #12
0
    apresy = dfv[bestsol + 1][0]
    avantx = dfv[bestsol][1]
    apresx = dfv[bestsol + 1][1]
    lambda_ = (val_brute - avanty) / (apresy - avanty)
    return lambda_ * apresx + (1 - lambda_) * avantx


# Conversion tables, computed once at import time: each entry is the result of
# scenar_values for a (gross variable, taxable variable) pair over the range
# 0 .. 12_000_000.
# NOTE(review): the dict is created empty and then filled entry by entry; keep
# that order in case scenar_values (or something it calls) reads
# conversion_variables — confirm.
conversion_variables = {}

conversion_variables["salaire_de_base_to_salaire_imposable"] = scenar_values(
    0, 12_000_000, "salaire_de_base", "salaire_imposable")
conversion_variables["retraite_brute_to_retraite_imposable"] = scenar_values(
    0, 12_000_000, "retraite_brute", "retraite_imposable")

# Module-level simulation context: period, baseline system, PLF reform built
# on top of the baseline, and the test cases loaded from "DCT.csv".
PERIOD = "2018"
TBS = FranceTaxBenefitSystem()
TBS_PLF = IncomeTaxReform(TBS, reformePLF, PERIOD)
CAS_TYPE = load_data("DCT.csv")
# SIMCAT pre-binds period and data so that only the system (tbs) varies.
SIMCAT = partial(simulation, period=PERIOD, data=CAS_TYPE)
SIMCAT_BASE = SIMCAT(tbs=TBS)

if not version_beta_sans_simu_pop:
    # Load the data used for the population-wide computation, sorted by
    # tax household id ("idfoy").
    DUMMY_DATA = load_data(data_path).sort_values(by="idfoy")
    print(
        "Dummy Data loaded",
        len(DUMMY_DATA),
        "lines",
        len(DUMMY_DATA["idfoy"].unique()),
        "foyers fiscaux",
    )
Exemple #13
0
    #         return list(x)
    #     return x
    # with Path("decomposition.json").open('w') as fd:
    #     json.dump(decomposition_tree, fd, indent=2, default=serialize)

    return decomposition_tree


# Use the cached decomposition when "decomposition.json" exists; otherwise
# build the tax and benefit system and pre-calculate the decomposition.
decomposition_file_path = Path("decomposition.json")
if not decomposition_file_path.is_file():
    print("Initializing France tax and benefit system...")
    tbs = FranceTaxBenefitSystem()
    print("Pre-calculating decomposition...")
    decomposition_tree = precalculate_decomposition_json(tbs)
else:
    print("Loading decomposition from file...")
    with decomposition_file_path.open() as fd:
        decomposition_tree = json.load(fd)

app = dash.Dash()
server = app.server  # Referenced by Procfile

app.layout = html.Div(children=[
    html.H1(children='OpenFisca'),
    html.P(children=[
        "Salaire de base : ",
        html.Span(id="salaire-de-base-value"),
        " € / an",
    ]),
    dcc.Slider(
        id="salaire-de-base",
Exemple #14
0
def main():
    """Associate IPP table fields with OpenFisca parameters by matching values.

    Command-line options: -s/--source-dir (directory of clean IPP YAML files),
    -t/--target (path of the generated YAML file) and -v/--verbose.

    Steps:
      1. Walk the source directory and index every value found in the
         "Valeurs" rows of each lower-case-named .yaml file, keyed by value,
         with its IPP path and start date.
      2. Iterate the OpenFisca values yielded by ``iter_openfisca_values`` and
         keep those for which exactly one IPP entry has the same value and the
         same start.
      3. Write the resulting IPP tree (leaf = list of dotted OpenFisca paths)
         to the target file.

    Returns:
        0.

    NOTE(review): this function uses Python 2 idioms (``.decode`` on the names
    returned by ``os.walk``, ``dict.iteritems``, ``basestring``, ``unicode``)
    and would fail as is under Python 3.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s',
        '--source-dir',
        default='yaml-clean',
        help='path of source directory containing clean IPP YAML files')
    parser.add_argument(
        '-t',
        '--target',
        default='ipp-tax-and-benefit-tables-to-openfisca-parameters.yaml',
        help=
        'path of generated YAML file containing the association between IPP fields to OpenFisca parameters'
    )
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        default=False,
                        help="increase output verbosity")
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
        stream=sys.stdout)

    file_system_encoding = sys.getfilesystemencoding()

    # Step 1: index IPP values. Maps a value to the list of
    # {path, start} dicts describing where it was found.
    ipp_infos_by_value = {}
    for source_dir_encoded, directories_name_encoded, filenames_encoded in os.walk(
            args.source_dir):
        # Sorting in place makes os.walk visit sub-directories in sorted order.
        directories_name_encoded.sort()
        for filename_encoded in sorted(filenames_encoded):
            if not filename_encoded.endswith('.yaml'):
                continue
            filename = filename_encoded.decode(file_system_encoding)
            sheet_name = os.path.splitext(filename)[0]
            source_file_path_encoded = os.path.join(source_dir_encoded,
                                                    filename_encoded)
            # Path of the file relative to the source directory.
            relative_file_path_encoded = source_file_path_encoded[
                len(args.source_dir):].lstrip(os.sep)
            relative_file_path = relative_file_path_encoded.decode(
                file_system_encoding)
            # Files with an all-upper-case name are skipped; every other file
            # must have an all-lower-case name.
            if sheet_name.isupper():
                continue
            assert sheet_name.islower(), sheet_name
            log.info(u'Loading file {}'.format(relative_file_path))
            # NOTE(review): yaml.load is called without an explicit Loader;
            # only use on trusted files.
            with open(source_file_path_encoded) as source_file:
                data = yaml.load(source_file)
            rows = data.get(u"Valeurs")
            if rows is None:
                log.info(u'  Skipping file {} without "Valeurs"'.format(
                    relative_file_path))
                continue
            for row in rows:
                # Find the start date of the row: "Date d'effet" first, then
                # the first of date_names that is present.
                start = row.get(u"Date d'effet")
                if start is None:
                    for date_name in date_names:
                        start = row.get(date_name)
                        if start is not None:
                            break
                    else:
                        # No date found. Skip row.
                        continue
                elif not isinstance(start, datetime.date):
                    # Non-date "Date d'effet": take its "Année Revenus" entry.
                    start = start[u"Année Revenus"]

                for name, child in row.iteritems():
                    if name in date_names:
                        continue
                    for path, value in iter_ipp_values(child):
                        # Strings of the form "<number> <unit>" with a known
                        # unit are reduced to the number.
                        if isinstance(value, basestring):
                            split_value = value.split()
                            if len(split_value) == 2 and split_value[1] in (
                                    u'%',
                                    u'AF',  # anciens francs
                                    u'CFA',  # francs CFA
                                    u'COTISATIONS',
                                    u'EUR',
                                    u'FRF',
                            ):
                                value = float(split_value[0])
                        # Whole floats are stored as ints.
                        if isinstance(value, float) and value == int(value):
                            value = int(value)
                        # Full IPP path: directories, sheet name, field name,
                        # then the path inside the field.
                        full_path = tuple(
                            relative_file_path.split(os.sep)[:-1]) + (
                                sheet_name, name) + tuple(path)
                        ipp_infos_by_value.setdefault(value, []).append(
                            dict(
                                path=full_path,
                                start=start,
                            ))

#    print yaml.dump(ipp_infos_by_value, allow_unicode = True, default_flow_style = False, indent = 2, width = 120)

    tax_benefit_system = FranceTaxBenefitSystem()

    # print yaml.dump(tax_benefit_system.legislation_json, allow_unicode = True, default_flow_style = False, indent = 2,
    #     width = 120)

    #    openfisca_infos_by_value = {}
    #    for path, start, value in iter_openfisca_values(tax_benefit_system.legislation_json):
    #        openfisca_infos_by_value.setdefault(value, []).append(dict(
    #            path = tuple(path),
    #            start = start,
    #            ))
    #    print yaml.dump(openfisca_infos_by_value, allow_unicode = True, default_flow_style = False, indent = 2, width = 120)

    #    ipp_count = {}
    #    for path, start, value in iter_openfisca_values(tax_benefit_system.legislation_json):
    #        ipp_infos = ipp_infos_by_value.get(value)
    #        if ipp_infos is None:
    #            # OpenFisca parameter doesn't exit in IPP.
    #            continue
    #        for ipp_info in ipp_infos:
    #            if ipp_info['start'] == start:
    #                ipp_child = ipp_count
    #                ipp_path = ipp_info['path']
    #                for name in path:
    #                    ipp_child = ipp_child.setdefault(name, {})
    #                    ipp_child_count = ipp_child.setdefault('count_by_path', {})
    #                    for ipp_index in range(len(ipp_path)):
    #                        ipp_sub_path = ipp_path[:ipp_index + 1]
    #                        ipp_child_count[ipp_sub_path] = ipp_child_count.get(ipp_sub_path, 0) + 1
    #    print yaml.dump(ipp_count, allow_unicode = True, default_flow_style = False, indent = 2, width = 120)

    # Step 2: match OpenFisca values against the IPP index, in both
    # directions (OpenFisca path -> IPP path -> starts, and the reverse).
    starts_by_ipp_path_by_openfisca_path = {}
    starts_by_openfisca_path_by_ipp_path = {}
    for path, start, value in iter_openfisca_values(
            tax_benefit_system.legislation_json):
        ipp_infos = ipp_infos_by_value.get(value)
        if ipp_infos is None:
            # OpenFisca parameter doesn't exist in IPP.
            continue
        same_start_ipp_paths = [
            ipp_info['path'] for ipp_info in ipp_infos
            if ipp_info['start'] == start
        ]
        # Only unambiguous matches (exactly one IPP path) are recorded.
        if len(same_start_ipp_paths) == 1:
            ipp_path = same_start_ipp_paths[0]
            starts_by_ipp_path_by_openfisca_path.setdefault(
                tuple(path), {}).setdefault(ipp_path, set()).add(start)
            starts_by_openfisca_path_by_ipp_path.setdefault(
                ipp_path, {}).setdefault(tuple(path), set()).add(start)


#    for openfisca_path, starts_by_ipp_path in sorted(starts_by_ipp_path_by_openfisca_path.iteritems()):
##        if len(starts_by_ipp_path) == 1:
##            print u'.'.join(openfisca_path), '->', u' / '.join(starts_by_ipp_path.keys()[0])
#        if len(starts_by_ipp_path) > 1:
#            print u'.'.join(openfisca_path), '->', starts_by_ipp_path

#    for ipp_path, starts_by_openfisca_path in sorted(starts_by_openfisca_path_by_ipp_path.iteritems()):
#        if len(starts_by_openfisca_path) == 1:
#            print u' / '.join(ipp_path), '->', u'.'.join(
#                unicode(fragment)
#                for fragment in starts_by_openfisca_path.keys()[0]
#                )
##        if len(starts_by_openfisca_path) > 1:
##            print u' / '.join(ipp_path), '->', u'.'.join(
##                unicode(fragment)
##                for fragment in starts_by_openfisca_path.keys()[0]
##                )

    # Step 3: build the nested, ordered IPP tree whose leaves are the sorted
    # lists of dotted OpenFisca paths, then dump it to the target file.
    openfisca_path_by_ipp_tree = collections.OrderedDict()
    for ipp_path, starts_by_openfisca_path in sorted(
            starts_by_openfisca_path_by_ipp_path.iteritems()):
        openfisca_path_by_ipp_sub_tree = openfisca_path_by_ipp_tree
        # Descend (creating levels as needed) down to the parent of the leaf.
        for ipp_name in ipp_path[:-1]:
            openfisca_path_by_ipp_sub_tree = openfisca_path_by_ipp_sub_tree.setdefault(
                ipp_name, collections.OrderedDict())
        ipp_name = ipp_path[-1]
        openfisca_path_by_ipp_sub_tree[ipp_name] = [
            u'.'.join(unicode(fragment) for fragment in openfisca_name)
            for openfisca_name in sorted(starts_by_openfisca_path)
        ]
    with open(args.target, 'w') as target_file:
        yaml.dump(openfisca_path_by_ipp_tree,
                  target_file,
                  allow_unicode=True,
                  default_flow_style=False,
                  indent=2,
                  width=120)

    return 0
Exemple #15
0
def ajustement_h5(
    input_h5="./Simulation_engine/dummy_data.h5",
    output_h5="./Simulation_engine/dummy_data_ajuste.h5",
    distribution_rfr_population="./Simulation_engine/Calib/ResFinalCalibSenat.csv",
    PERIOD=None,
):
    """Calibrate an HDF5 input file against an official RFR distribution.

    The input is simulated and aggregated per tax household, then:
      1. the weight of households with (near) zero "rfr" is rescaled so that
         they represent ``targetweight`` of the total, and all weights are
         scaled to a fixed total number of tax households;
      2. a calibrated "realrfr" is computed per household from Pareto
         parameters derived from the official distribution file;
      3. the resulting per-household weight and income ratios are applied to
         the weight and income columns of the input, which is written to
         ``output_h5`` under the key "input".

    Args:
        input_h5: path of the HDF5 file to adjust.
        output_h5: path of the adjusted HDF5 file to write.
        distribution_rfr_population: CSV read for its "Rk", "Nk" and "dArk"
            columns.
        PERIOD: simulation period; defaults to ``annee_de_calcul``.

    NOTE(review): the result of ``simulation`` is passed whole to
    ``aggregats_ff`` here, while other functions unpack it as a pair —
    confirm which form ``aggregats_ff`` expects.
    """
    if PERIOD is None:
        PERIOD = annee_de_calcul
    ajuste_h5 = output_h5
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    # Keeping computations short with option to keep file under 1000 FF
    # DUMMY_DATA = DUMMY_DATA[DUMMY_DATA["idmen"] < 1000]
    simulation_base_deciles = simulation(PERIOD, DUMMY_DATA, TBS)
    df = aggregats_ff(PERIOD, simulation_base_deciles).sort_values(by="rfr")
    # Unweighted share of households with a positive rfr.
    print("{} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle ".format(
        len(df[df["rfr"] > 0.01]),
        len(df),
        100 - 100 * len(df[df["rfr"] > 0.01]) / len(df),
    ))

    # Step 1: adjust the share of households with zero income.
    # oldweight is the current weighted share of zero-income households;
    # redweightifrfr0 is the factor applied to their weight so that this
    # share becomes targetweight.
    # NOTE(review): divides by oldweight — fails if no household has rfr <= 0.01.
    oldweight = 1 - df[df["rfr"] > 0.01]["wprm"].sum() / df["wprm"].sum()
    targetweight = 0.06
    redweightifrfr0 = targetweight * (1 - oldweight) / oldweight / (
        1 - targetweight)
    # Same report as above, this time weighted by wprm.
    print(
        "Non en fait {} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle. Je vais les ajuster."
        .format(
            df[df["rfr"] > 0.01]["wprm"].sum(),
            df["wprm"].sum(),
            100 - 100 * df[df["rfr"] > 0.01]["wprm"].sum() / df["wprm"].sum(),
        ))
    print("old : {} new : {} adj : {}".format(oldweight, targetweight,
                                              redweightifrfr0))

    # Apply the weight reduction to the zero-income rows.
    df["adjwstep0"] = 1
    df["realwprm"] = df["wprm"]
    df.loc[df["rfr"] < 0.01, "adjwstep0"] = redweightifrfr0
    df.loc[df["rfr"] < 0.01, "realwprm"] = df["wprm"] * redweightifrfr0
    # Calibrate the total number of tax households.
    target_foyers_fiscaux = 38_332_977
    # src : https://www.impots.gouv.fr/portail/statistiques (2018)
    adjust_wprm = target_foyers_fiscaux / df["realwprm"].sum()
    df["realwprm"] = df["realwprm"] * adjust_wprm
    # NOTE(review): this report still uses "wprm" (not "realwprm") and mixes
    # the thresholds 0.01 and 0, so it does not reflect the adjustment above —
    # confirm the intent.
    print(
        "Non en fait {} FF sur {} ont un revenu>0 , donc {:.2f}% ont que dalle "
        .format(
            df[df["rfr"] > 0.01]["wprm"].sum(),
            df["wprm"].sum(),
            100 - 100 * df[df["rfr"] > 0]["wprm"].sum() / df["wprm"].sum(),
        ))
    # Step 1.1: adjust the first decile (nothing is done for now; there is no
    # real impact).

    # Step 2 : PBP (pareto by parts)
    # Official statistics
    so = pandas.read_csv(distribution_rfr_population)
    # Expected content (according to the original author):
    # Column Rk : reference taxable income (RFR)
    # Column Nk : share of tax households with an RFR >= the Rk column
    # Column Ark : average RFR of the tax households with an RFR >= the Rk
    # column (only used for the law of the highest decile)
    # NOTE(review): the code below reads a column named "dArk", not "Ark".
    # The distribution of every household is determined next:
    # 2.0 - attach a running (cumulative) weight to each row.
    totw = df["realwprm"].sum()
    df = df.sort_values(by="rfr")
    df["nw"] = df["realwprm"] / totw  # normalized weight (total = 1)
    df["rsnw"] = df["nw"].cumsum(
    ) - df["nw"] / 2  # cumulative sum of nw, minus half of the row's own nw
    # 2.1 - in the first decile: the exact ERFS values times a scalar factor
    # chosen so that the first decile limit matches the target.
    targetFirstDec = so["Rk"][1]
    limWeightFirstDec = so["Nk"][1]
    limOrigFirstDec = max(df[df["rsnw"] <= 1 - limWeightFirstDec]["rfr"])
    df["adjrevstep2"] = 1
    df.loc[df["rsnw"] <= 1 - limWeightFirstDec,
           "adjrevstep2"] = (targetFirstDec / limOrigFirstDec)
    # 2.2 - in every other category (except the last one): the distribution
    # restricted to an interval is a Pareto law whose first parameter is the
    # start of the interval and whose second parameter is the one giving the
    # right number of households in the interval.
    # Computation of that parameter:
    sonk = so["Nk"].values
    # (plain arrays are used so that consecutive rows can be read by index)
    sork = so["Rk"].values
    # First entry is a -1 placeholder; the loop adds one parameter per
    # interval [sork[i], sork[i + 1]].
    paramsPareto = [-1]
    for i in range(1, len(sonk) - 1):
        n0 = sonk[i]
        n1 = sonk[i + 1]
        r0 = sork[i]
        r1 = sork[i + 1]

        newparam = math.log(n1 / n0) / math.log(r0 / r1)
        paramsPareto += [newparam]

    # 2.3 - in the last category: take the Pareto parameter that matches the
    # average of the last bracket.
    # The mean of a Pareto law is: esp = (1 + 1/(k-1)) * xm
    #  k = 1/(esp/xm - 1) + 1
    lastaverage = so["dArk"].values[-1] * 1000
    lastthresh = sork[-1]
    paramsPareto += [1 / (lastaverage / lastthresh - 1) + 1]
    so["paramPareto"] = paramsPareto

    # reverseCDF(so) is applied row by row to produce the calibrated rfr.
    df["realrfr"] = df.apply(reverseCDF(so), axis=1)
    df["realrfrw"] = df["realrfr"] * df["realwprm"]

    # OK now that this great function works (does it? Why not try it? comparing it now to the original function??)
    # I can generate the REAL rfr

    # End of step 2.

    # Error report before (rfr/wprm) and after (realrfr/realwprm) calibration.
    testerrorvalues(df, "rfr", "wprm")
    aa = testerrorvalues(df, "realrfr", "realwprm")
    print("Aggregated Error % after calibration :", aa)
    # df now holds the calibrated rfr (realrfr) and weight (realwprm).
    # Per-household ratios; the income ratio stays 1 where rfr is not positive.
    df["total_ajust_revenu"] = 1
    df.loc[df["rfr"] > 0, "total_ajust_revenu"] = df["realrfr"] / df["rfr"]
    df["total_ajust_poids"] = df["realwprm"] / df["wprm"]

    # Apply the adjustments to the .h5 input data.
    to_transform = pandas.read_hdf(input_h5)
    tt_colonnes = to_transform.columns
    df_changes = df[["idfoy", "total_ajust_revenu", "total_ajust_poids"]]
    # Bring the two ratios onto every input row of the same tax household.
    to_transform = to_transform.merge(df_changes, on="idfoy")
    colspoids = ["wprm"]
    colsrevenus = [
        "chomage_brut",
        "pensions_alimentaires_percues",
        "rag",
        "ric",
        "rnc",
        "salaire_de_base",
        "f4ba",
        # "loyer",
        # "taxe_habitation",
    ]
    # Only scale the income columns actually present in the input.
    colsrevenus = [col for col in colsrevenus if col in to_transform.columns]
    for cp in colspoids:
        to_transform[cp] = to_transform[cp] * to_transform["total_ajust_poids"]
    for cp in colsrevenus:
        to_transform[
            cp] = to_transform[cp] * to_transform["total_ajust_revenu"]
    # Drop the helper ratio columns: keep the original columns only.
    to_transform = to_transform[tt_colonnes]
    to_transform.to_hdf(ajuste_h5, key="input")
Exemple #16
0
def test_useless_variables(
        input_h5="./Simulation_engine/dummy_data.h5",
        outfile_path=None,
        name_variables=("rfr", "irpp", "nbptr"),
        PERIOD=None,
):
    """Find input columns whose removal does not change the simulation results.

    Each column of the input (except "wprm") is removed in turn and the
    simulation is re-run; a column is "useless" when every variable in
    ``name_variables`` is unchanged. All useless columns are then removed at
    once and, if the results are still unchanged, the reduced data is written
    to ``outfile_path`` (default: ``input_h5`` with "_useful.h5" suffix).

    Args:
        input_h5: path of the HDF5 file read with ``pandas.read_hdf``.
        outfile_path: where to write the reduced data, or None for the default.
        name_variables: variables whose results are compared.
        PERIOD: simulation period; defaults to ``annee_de_calcul``.

    Returns:
        The list of column names found useless one by one.

    Side effects:
        Sets ``pandas.options.mode.chained_assignment`` to None.
    """
    if PERIOD is None:
        PERIOD = annee_de_calcul
    pandas.options.mode.chained_assignment = None
    list_useless_variables = []
    TBS = FranceTaxBenefitSystem()
    DUMMY_DATA = pandas.read_hdf(input_h5)
    simulation_base_deciles, dictionnaire_datagrouped = simulation(
        PERIOD, DUMMY_DATA, TBS)
    df = dictionnaire_datagrouped["foyer_fiscal"][["wprm"]]
    for nv in name_variables:
        df["{}_base".format(nv)] = simulation_base_deciles.calculate(
            nv, PERIOD)

    for col in DUMMY_DATA.columns:
        if col == "wprm":  # we don't want to remove this one
            continue
        data_wo_column = DUMMY_DATA[[
            k for k in DUMMY_DATA.columns if k != col
        ]]
        try:
            newsim, _ = simulation(PERIOD, data_wo_column, TBS)
            countdif = {}
            for nv in name_variables:
                df["{}_{}".format(nv, col)] = newsim.calculate(nv, PERIOD)
                countdif[nv] = len(df[
                    df["{}_{}".format(nv, col)] != df["{}_base".format(nv)]])
        except Exception:
            # Best effort: a column the simulation cannot run without is
            # necessarily useful.
            print(col, "is definitely not useless")
            continue
        isdif = any(countdif.values())
        if not isdif:
            list_useless_variables += [col]
        print(
            col,
            "is",
            "not" if isdif else "",
            "useless",
            "{}".format([countdif[nv]
                         for nv in name_variables]) if isdif else "",
        )

    # Final check: remove every useless column at once and compare again.
    # The original re-simulated the last single-column removal and compared
    # stale per-column results, so this check verified nothing.
    useless = set(list_useless_variables)
    data_wo_useless = DUMMY_DATA[[
        k for k in DUMMY_DATA.columns if k not in useless
    ]]
    newsim, _ = simulation(PERIOD, data_wo_useless, TBS)
    isdif = False
    for nv in name_variables:
        reduced_values = newsim.calculate(nv, PERIOD)
        base_values = df["{}_base".format(nv)].values
        isdif |= bool((reduced_values != base_values).any())
    if isdif:
        print(
            "Removing all variables at once didn't work, good luck with that")
    else:
        if outfile_path is None:
            outfile_path = input_h5.replace(".h5", "_useful.h5")
        data_wo_useless.to_hdf(outfile_path, key="input")
        print(
            "It seems lots of columns don't do anything. Data with only useful columns was exported to {}"
            .format(outfile_path))
    return list_useless_variables