def guess_coicop_from_bdf(year=2011):
    """Derive a dotted COICOP code for every distinct poste of the transfer matrix.

    Only ``year == 2011`` is accepted (enforced by an assertion).  The
    ``poste<year>`` and ``label<year>`` columns of the frame returned by
    ``get_transfert_data_frames`` are renamed in place to ``poste_bdf`` and
    ``label``.

    Returns a ``pandas.DataFrame`` with one row per distinct ``poste_bdf``
    value and three fields: ``code_coicop`` (the dotted code), ``label`` (the
    first label found for that poste) and ``code_bdf`` (the poste as a string).
    An assertion fails when two postes yield the same ``code_coicop``.
    """
    assert year == 2011
    from openfisca_france_indirect_taxation.utils import get_transfert_data_frames

    def to_dotted_code(poste_text):
        # Up to four characters: prefix a '0' and separate every character with a dot.
        if len(poste_text) <= 4:
            return '0' + '.'.join(poste_text)
        # Longer: the first two characters stay together, the rest is dot-separated.
        head, tail = poste_text[:2], poste_text[2:]
        return head + '.' + '.'.join(tail)

    passage, _ = get_transfert_data_frames(year)
    new_column_names = {
        'poste{}'.format(year): 'poste_bdf',
        'label{}'.format(year): 'label',
    }
    passage.rename(columns=new_column_names, inplace=True)

    rows = []
    for poste in passage['poste_bdf'].unique():
        poste_text = str(poste)
        # All labels attached to this poste, in order of appearance.
        labels = passage.loc[passage.poste_bdf == poste, 'label'].unique()
        rows.append({
            'code_coicop': to_dotted_code(poste_text),
            'label': labels[0],
            'code_bdf': poste_text,
        })

    coicop_table = pandas.DataFrame(rows)
    assert not coicop_table.code_coicop.duplicated().any()
    return coicop_table
def guess_coicop_from_bdf(year = 2011):
    """Build the table of COICOP codes guessed from the poste codes of the transfer matrix.

    The function only supports ``year == 2011`` (asserted).  It renames, in
    place, the ``poste<year>`` / ``label<year>`` columns of the first frame
    returned by ``get_transfert_data_frames`` to ``poste_bdf`` / ``label``.

    The returned ``pandas.DataFrame`` has one row per distinct ``poste_bdf``
    with the fields ``code_coicop``, ``label`` and ``code_bdf``; the function
    asserts that no ``code_coicop`` appears twice.
    """
    assert year == 2011
    from openfisca_france_indirect_taxation.utils import get_transfert_data_frames
    matrice, _ = get_transfert_data_frames(year)
    matrice.rename(
        columns = dict([
            ('poste{}'.format(year), 'poste_bdf'),
            ('label{}'.format(year), 'label'),
            ]),
        inplace = True
        )

    def first_label(poste):
        # First label, in order of appearance, among the rows of this poste
        mask = matrice.poste_bdf == poste
        return matrice.loc[mask, 'label'].unique()[0]

    def dotted(texte):
        # Short codes (<= 4 chars) get a leading '0' and are fully dot-separated;
        # longer codes keep their first two characters as the head
        pivot = 0 if len(texte) <= 4 else 2
        prefix = '0' if pivot == 0 else texte[:2] + '.'
        return prefix + '.'.join(texte[pivot:])

    entries = [
        dict(
            code_coicop = dotted(str(poste)),
            label = first_label(poste),
            code_bdf = str(poste),
            )
        for poste in matrice['poste_bdf'].unique()
        ]

    guessed = pandas.DataFrame(entries)
    assert not guessed.code_coicop.duplicated().any()
    return guessed
Exemple #3
0
def build_depenses_homogenisees(temporary_store=None, year=None):
    """Build household expenditure tables keyed by COICOP code and store them.

    The ``budget_des_familles_<year>`` survey is loaded and its consumption
    table is brought to a common layout: one row per ``ident_men`` and one
    column per survey poste.  Postes are mapped to COICOP codes through the
    ``posteCOICOP`` column of the transfer matrix returned by
    ``get_transfert_data_frames`` and columns sharing a code are summed.

    Two frames are written to ``temporary_store``:

    * ``depenses_<year>``: one ``poste_coicop_<code>`` column per COICOP code,
      plus the ``pondmen`` weight column;
    * ``depenses_by_grosposte_<year>``: one ``coicop12_<n>`` column per group
      given by the first two characters of the normalized code, plus
      ``pondmen``.

    Parameters:
        temporary_store: store supporting item assignment; must not be None.
        year: survey year; 1995, 2000, 2005 and 2011 are handled.

    Raises:
        AssertionError: if ``temporary_store`` or ``year`` is None.
        ValueError: if ``year`` is not one of the handled years.
    """
    assert temporary_store is not None
    assert year is not None

    bdf_survey_collection = SurveyCollection.load(
        collection='budget_des_familles',
        config_files_directory=config_files_directory)
    survey = bdf_survey_collection.get_survey(
        'budget_des_familles_{}'.format(year))

    # Harmonisation of the expenditure tables across survey years

    if year == 1995:
        socioscm = survey.get_values(table="socioscm")
        poids = socioscm[['mena', 'ponderrd', 'exdep', 'exrev']]
        # Keep only the observations whose quality is trusted:
        # exdep == 1 when the household's expenditure data are properly filled in
        # exrev == 1 when the household's income data are properly filled in
        poids = poids[(poids.exdep == 1) & (poids.exrev == 1)]
        del poids['exdep'], poids['exrev']
        poids.rename(columns={
            'mena': 'ident_men',
            'ponderrd': 'pondmen',
        },
                     inplace=True)
        poids.set_index('ident_men', inplace=True)

        conso = survey.get_values(table="depnom")
        conso = conso[["valeur", "montant", "mena", "nomen5"]]
        conso = conso.groupby(["mena", "nomen5"]).sum()
        conso = conso.reset_index()
        conso.rename(columns={
            'mena': 'ident_men',
            'nomen5': 'poste{}'.format(year),
            'valeur': 'depense',
            'montant': 'depense_avt_imput',
        },
                     inplace=True)

        # Switch to euros
        conso.depense = conso.depense / 6.55957
        conso.depense_avt_imput = conso.depense_avt_imput / 6.55957
        conso_small = conso[[u'ident_men', u'poste1995', u'depense']]

        conso_unstacked = conso_small.set_index(['ident_men', 'poste1995'
                                                 ]).unstack('poste1995')
        conso_unstacked = conso_unstacked.fillna(0)

        # Flatten the (value, poste) column MultiIndex to the poste level only.
        # get_level_values(1) yields the same values as levels[1][labels[1]] and
        # does not depend on MultiIndex.labels, which was removed in pandas 1.0.
        conso_unstacked.columns = conso_unstacked.columns.get_level_values(1)
        # NOTE(review): this relabels an index *value* equal to 0, it does not
        # name the index -- confirm that this is the intent.
        conso_unstacked.rename(index={0: 'ident_men'}, inplace=True)
        conso = conso_unstacked.merge(poids, left_index=True, right_index=True)
        conso = conso.reset_index()

    elif year == 2000:
        conso = survey.get_values(table="consomen")
        conso.rename(columns={'ident': 'ident_men'}, inplace=True)
        for variable in ['ctotale', 'c99', 'c99999'] + \
                        ["c0{}".format(i) for i in range(1, 10)] + \
                        ["c{}".format(i) for i in range(10, 14)]:
            del conso[variable]

    elif year == 2005:
        conso = survey.get_values(table="c05d")

    elif year == 2011:
        try:
            conso = survey.get_values(table="C05")
        except Exception:
            # The exception raised for a missing table is not visible from here,
            # so any ordinary error falls back to the lower-case table name.
            conso = survey.get_values(table="c05")
        conso.rename(columns={'ident_me': 'ident_men'}, inplace=True)
        del conso['ctot']

    else:
        # Previously an unknown year only failed further down on the unbound
        # ``conso`` variable.
        raise ValueError(
            'Unsupported year {}: expected 1995, 2000, 2005 or 2011'.format(
                year))

    # Grouping by coicop

    poids = conso[['ident_men', 'pondmen']].copy()
    poids.set_index('ident_men', inplace=True)
    conso.drop('pondmen', axis=1, inplace=True)
    conso.set_index('ident_men', inplace=True)

    matrice_passage_data_frame, _ = get_transfert_data_frames(year)

    # Mapping {survey poste: COICOP poste}; built without in-place mutation of
    # a column slice of the transfer matrix.
    poste_column = 'poste{}'.format(year)
    coicop_by_poste_bdf = matrice_passage_data_frame[[
        poste_column, 'posteCOICOP'
    ]].set_index(poste_column).to_dict()['posteCOICOP']

    def reformat_consumption_column_coicop(coicop):
        # 'c01110' -> 1110; anything that is not 'c' + digits maps to NaN
        try:
            return int(coicop.replace('c', '').lstrip('0'))
        except (AttributeError, TypeError, ValueError):
            return numpy.nan

    # The 1995 columns already hold the poste codes, whereas the other years
    # use 'c'-prefixed column names that must be parsed first
    if year == 1995:
        coicop_labels = [
            normalize_code_coicop(coicop_by_poste_bdf.get(poste_bdf))
            for poste_bdf in conso.columns
        ]
    else:
        coicop_labels = [
            normalize_code_coicop(
                coicop_by_poste_bdf.get(
                    reformat_consumption_column_coicop(poste_bdf)))
            for poste_bdf in conso.columns
        ]
    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator
    tuples = list(zip(coicop_labels, conso.columns))
    conso.columns = pandas.MultiIndex.from_tuples(
        tuples, names=['coicop', 'poste{}'.format(year)])
    coicop_data_frame = conso.groupby(level=0, axis=1).sum()

    depenses = coicop_data_frame.merge(poids,
                                       left_index=True,
                                       right_index=True)

    # Creation of the "gros postes": the first two characters of each
    # normalized COICOP code
    def select_gros_postes(coicop):
        try:
            coicop = unicode(coicop)  # Python 2 only
        except (NameError, UnicodeError):
            # No ``unicode`` builtin on Python 3 (or undecodable value on
            # Python 2): keep the value unchanged
            pass
        normalized_coicop = normalize_code_coicop(coicop)
        return int(normalized_coicop[0:2])

    grospostes = [
        select_gros_postes(coicop) for coicop in coicop_data_frame.columns
    ]
    tuples_gros_poste = list(zip(coicop_data_frame.columns, grospostes))
    coicop_data_frame.columns = pandas.MultiIndex.from_tuples(
        tuples_gros_poste, names=['coicop', 'grosposte'])

    depenses_by_grosposte = coicop_data_frame.groupby(level=1, axis=1).sum()
    depenses_by_grosposte = depenses_by_grosposte.merge(poids,
                                                        left_index=True,
                                                        right_index=True)

    # TODO : understand why it does not work: depenses.rename(columns = {u'0421': 'poste_coicop_421'}, inplace = True)

    # Pass 1: drop one trailing '0' from every all-digit column name
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        if code[-1:] == '0':
            depenses.rename(columns={code: code[:-1]}, inplace=True)
    # Pass 2: drop one leading '0'
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        if code[0:1] == '0':
            depenses.rename(columns={code: code[1:]}, inplace=True)
    # Pass 3: prefix what is left
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        depenses.rename(columns={code: 'poste_coicop_' + code}, inplace=True)

    temporary_store['depenses_{}'.format(year)] = depenses

    depenses_by_grosposte.columns = depenses_by_grosposte.columns.astype(str)
    liste_grospostes = [
        column for column in depenses_by_grosposte.columns if column.isdigit()
    ]
    for grosposte in liste_grospostes:
        depenses_by_grosposte.rename(
            columns={grosposte: 'coicop12_' + grosposte}, inplace=True)

    temporary_store['depenses_by_grosposte_{}'.format(
        year)] = depenses_by_grosposte
def build_depenses_homogenisees(temporary_store = None, year = None):
    """Build household expenditure tables keyed by COICOP code and store them.

    Steps:

    1. load the ``budget_des_familles_<year>`` survey and its consumption
       table in wide form (one row per ``ident_men``, one column per poste);
    2. map every poste column to a COICOP code with the ``posteCOICOP`` column
       of the transfer matrix and sum the columns sharing a code;
    3. sum again by "gros poste" (first two characters of the normalized code);
    4. write ``depenses_<year>`` (columns ``poste_coicop_<code>`` + ``pondmen``)
       and ``depenses_by_grosposte_<year>`` (columns ``coicop12_<n>`` +
       ``pondmen``) into ``temporary_store``.

    Parameters:
        temporary_store: store supporting item assignment; must not be None.
        year: survey year; 1995, 2000, 2005 and 2011 are handled.

    Raises:
        AssertionError: if ``temporary_store`` or ``year`` is None.
        ValueError: if ``year`` is not one of the handled years.
    """
    assert temporary_store is not None
    assert year is not None

    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory
        )
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    # Harmonisation of the expenditure tables across survey years
    conso = _load_conso_by_year(survey, year)

    # Grouping by coicop

    poids = conso[['ident_men', 'pondmen']].copy()
    poids.set_index('ident_men', inplace = True)
    conso.drop('pondmen', axis = 1, inplace = True)
    conso.set_index('ident_men', inplace = True)

    matrice_passage_data_frame, _ = get_transfert_data_frames(year)

    # Mapping {survey poste: COICOP poste}, built without mutating a column slice in place
    poste_column = 'poste{}'.format(year)
    coicop_by_poste_bdf = (
        matrice_passage_data_frame[[poste_column, 'posteCOICOP']]
        .set_index(poste_column)
        .to_dict()['posteCOICOP']
        )

    # The 1995 columns already hold the poste codes, whereas the other years
    # use 'c'-prefixed column names that must be parsed first
    if year == 1995:
        postes_bdf = list(conso.columns)
    else:
        postes_bdf = [_poste_column_to_int(column) for column in conso.columns]
    coicop_labels = [
        normalize_code_coicop(coicop_by_poste_bdf.get(poste_bdf))
        for poste_bdf in postes_bdf
        ]
    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator
    tuples = list(zip(coicop_labels, conso.columns))
    conso.columns = pandas.MultiIndex.from_tuples(tuples, names = ['coicop', 'poste{}'.format(year)])
    coicop_data_frame = conso.groupby(level = 0, axis = 1).sum()

    depenses = coicop_data_frame.merge(poids, left_index = True, right_index = True)

    # Creation of the "gros postes" on which the columns are aggregated a second time
    grospostes = [
        _grosposte_from_coicop(coicop)
        for coicop in coicop_data_frame.columns
        ]
    tuples_gros_poste = list(zip(coicop_data_frame.columns, grospostes))
    coicop_data_frame.columns = pandas.MultiIndex.from_tuples(tuples_gros_poste, names = ['coicop', 'grosposte'])

    depenses_by_grosposte = coicop_data_frame.groupby(level = 1, axis = 1).sum()
    depenses_by_grosposte = depenses_by_grosposte.merge(poids, left_index = True, right_index = True)

    # TODO : understand why it does not work: depenses.rename(columns = {u'0421': 'poste_coicop_421'}, inplace = True)
    _prefix_coicop_columns(depenses)

    temporary_store['depenses_{}'.format(year)] = depenses

    depenses_by_grosposte.columns = depenses_by_grosposte.columns.astype(str)
    liste_grospostes = [column for column in depenses_by_grosposte.columns if column.isdigit()]
    for grosposte in liste_grospostes:
        depenses_by_grosposte.rename(columns = {grosposte: 'coicop12_' + grosposte}, inplace = True)

    temporary_store['depenses_by_grosposte_{}'.format(year)] = depenses_by_grosposte


def _load_conso_1995(survey):
    """Return the 1995 consumption table in wide form, with ``ident_men`` and ``pondmen`` columns."""
    socioscm = survey.get_values(table = "socioscm")
    poids = socioscm[['mena', 'ponderrd', 'exdep', 'exrev']]
    # Keep only the observations whose quality is trusted:
    # exdep == 1 when the household's expenditure data are properly filled in
    # exrev == 1 when the household's income data are properly filled in
    poids = poids[(poids.exdep == 1) & (poids.exrev == 1)]
    del poids['exdep'], poids['exrev']
    poids.rename(
        columns = {
            'mena': 'ident_men',
            'ponderrd': 'pondmen',
            },
        inplace = True
        )
    poids.set_index('ident_men', inplace = True)

    conso = survey.get_values(table = "depnom")
    conso = conso[["valeur", "montant", "mena", "nomen5"]]
    conso = conso.groupby(["mena", "nomen5"]).sum()
    conso = conso.reset_index()
    conso.rename(
        columns = {
            'mena': 'ident_men',
            'nomen5': 'poste1995',
            'valeur': 'depense',
            'montant': 'depense_avt_imput',
            },
        inplace = True
        )

    # Switch to euros
    conso.depense = conso.depense / 6.55957
    conso.depense_avt_imput = conso.depense_avt_imput / 6.55957
    conso_small = conso[[u'ident_men', u'poste1995', u'depense']]

    conso_unstacked = conso_small.set_index(['ident_men', 'poste1995']).unstack('poste1995')
    conso_unstacked = conso_unstacked.fillna(0)

    # Flatten the (value, poste) column MultiIndex to the poste level only.
    # get_level_values(1) yields the same values as levels[1][labels[1]] and
    # does not depend on MultiIndex.labels, which was removed in pandas 1.0.
    conso_unstacked.columns = conso_unstacked.columns.get_level_values(1)
    # NOTE(review): this relabels an index *value* equal to 0, it does not
    # name the index -- confirm that this is the intent.
    conso_unstacked.rename(index = {0: 'ident_men'}, inplace = True)
    conso = conso_unstacked.merge(poids, left_index = True, right_index = True)
    return conso.reset_index()


def _load_conso_by_year(survey, year):
    """Return the consumption table of ``survey`` for ``year``; raise ValueError for an unhandled year."""
    if year == 1995:
        return _load_conso_1995(survey)

    if year == 2000:
        conso = survey.get_values(table = "consomen")
        conso.rename(columns = {'ident': 'ident_men'}, inplace = True)
        for variable in ['ctotale', 'c99', 'c99999'] + \
                        ["c0{}".format(i) for i in range(1, 10)] + \
                        ["c{}".format(i) for i in range(10, 14)]:
            del conso[variable]
        return conso

    if year == 2005:
        return survey.get_values(table = "c05d")

    if year == 2011:
        try:
            conso = survey.get_values(table = "C05")
        except Exception:
            # The exception raised for a missing table is not visible from here,
            # so any ordinary error falls back to the lower-case table name.
            conso = survey.get_values(table = "c05")
        conso.rename(columns = {'ident_me': 'ident_men'}, inplace = True)
        del conso['ctot']
        return conso

    # Previously an unknown year only failed later on an unbound ``conso`` variable
    raise ValueError('Unsupported year {}: expected 1995, 2000, 2005 or 2011'.format(year))


def _poste_column_to_int(column):
    """Parse a 'c'-prefixed consumption column name ('c01110' -> 1110); NaN when it cannot be parsed."""
    try:
        return int(column.replace('c', '').lstrip('0'))
    except (AttributeError, TypeError, ValueError):
        return numpy.nan


def _grosposte_from_coicop(coicop):
    """Return, as an int, the first two characters of the normalized COICOP code."""
    try:
        coicop = unicode(coicop)  # Python 2 only
    except (NameError, UnicodeError):
        # No ``unicode`` builtin on Python 3 (or undecodable value on Python 2):
        # keep the value unchanged
        pass
    normalized_coicop = normalize_code_coicop(coicop)
    return int(normalized_coicop[0:2])


def _prefix_coicop_columns(depenses):
    """Rename in place the all-digit columns of ``depenses`` to ``poste_coicop_<code>``.

    One trailing '0' and then one leading '0' are removed from each code
    before the prefix is added; the three passes run one after the other, each
    on the column names left by the previous one.
    """
    def digit_columns():
        return [column for column in depenses.columns if column.isdigit()]

    for code in digit_columns():
        if code[-1:] == '0':
            depenses.rename(columns = {code: code[:-1]}, inplace = True)
    for code in digit_columns():
        if code[0:1] == '0':
            depenses.rename(columns = {code: code[1:]}, inplace = True)
    for code in digit_columns():
        depenses.rename(columns = {code: 'poste_coicop_' + code}, inplace = True)