Esempio n. 1
0
def create_fip(year = None):
    assert year is not None
    # fip : fichier d'imposition des personnes
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    temporary_store = TemporaryStore.create(file_name = "erfs")

    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    log.info("{}".format(fip.info()))

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    # TODO: rajouter la case I : "Dont enfants titulaires de la carte d’invalidité"
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), "Certains type de PAC sont inconnus"
    # TODO: find a more explicit message

#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filtre pour ne travailler que sur F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #       puis on retire les autres (à la fois F et G)
    log.info(u"longueur fip {}".format(len(fip)))

    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(len(fip[fip['to_keep']]), len(fip)))

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
#   We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")

    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
                log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))

    log.info(u"    2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident (ménage) for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
# indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

# # TODO: On ne s'occupe pas des declar2 pour l'instant
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    # TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def create_indivim_menage_en_mois(year=None):
    """
    Création des tables ménages et individus concaténée (merged)
    """
    temporary_store = TemporaryStore.create(file_name="erfs")
    assert year is not None
    # load data
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)

    replace = create_replace(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table=replace["erf_menage"])
    eecmen = survey.get_values(table=replace["eec_menage"])
    erfind = survey.get_values(table=replace["erf_indivi"])
    eecind = survey.get_values(table=replace["eec_indivi"])

    # travail sur la cohérence entre les bases
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()

    noappar_i = eecmen[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset='ident', take_last=True)
    # TODO: vérifier qu'il n'y a théoriquement pas de doublon

    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(
        len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()

    # fusion enquete emploi et source fiscale
    menage_en_mois = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on=['noindiv', 'ident', 'noi'], how="inner")

    # optimisation des types? Controle de l'existence en passant
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = ([
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'titc',
        'txtppb',
    ])

    for var in var_list:
        assert indivim[var].dtype == numpy.dtype('int')

    ########################
    # création de variables#
    ########################


#   actrec : activité recodée comme preconisé par l'INSEE p84 du guide utilisateur
    indivim["actrec"] = numpy.nan
    # Attention : Q: pas de 6 ?!! A : Non pas de 6, la variable recodée de l'INSEE (voit p84 du guide methodo), ici \
    # la même nomenclatue à été adopée

    # 3: contrat a durée déterminée
    indivim['actrec'][indivim.acteu == 1] = 3
    # 8 : femme (homme) au foyer, autre inactif
    indivim['actrec'][indivim.acteu == 3] = 8
    # 1 : actif occupé non salarié
    filter1 = (indivim.acteu == 1) & (indivim.stc.isin(
        [1, 3]))  # actifs occupés non salariés à son compte ou pour un
    indivim["actrec"][filter1] = 1  # membre de sa famille
    # 2 : salarié pour une durée non limitée
    filter2 = (indivim.acteu == 1) & (((indivim.stc == 2) &
                                       (indivim.contra == 1)) |
                                      (indivim.titc == 2))
    indivim['actrec'][filter2] = 2
    # 4 : au chomage
    filter4 = (indivim.acteu == 2) | ((indivim.acteu == 3) &
                                      (indivim.mrec == 1))
    indivim['actrec'][filter4] = 4
    # 5 : élève étudiant , stagiaire non rémunéré
    filter5 = (indivim.acteu == 3) & ((indivim.forter == 2) |
                                      (indivim.rstg == 1))
    indivim['actrec'][filter5] = 5
    # 7 : retraité, préretraité, retiré des affaires unchecked
    filter7 = (indivim.acteu == 3) & ((indivim.retrai == 1) |
                                      (indivim.retrai == 2))
    indivim['actrec'][filter7] = 7
    # 9 : probablement enfants de - de 16 ans TODO: check that fact in database and questionnaire
    indivim['actrec'][indivim.acteu == 0] = 9

    indivim.actrec = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(
        1, 10)).all(), 'actrec values are outside the interval [1, 9]'

    #   TODO : compare the result with results provided by Insee
    #   tu99
    if year == 2009:
        erfind['tu99'] = None  # TODO: why ?

    # Locataire
    menage_en_mois["locataire"] = menage_en_mois.so.isin([3, 4, 5])
    assert_dtype(menage_en_mois.locataire, "bool")

    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menage_en_mois = menage_en_mois.merge(transfert)

    # Correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        '''
        if year == 2006:
            indivim.lien[indivim.noindiv == 603018905] = 2
            indivim.noimer[indivim.noindiv == 603018905] = 1
            log.info("{}".format(
                indivim[indivim.noindiv == 603018905].to_string()))

    _manually_remove_errors()

    temporary_store['menage_en_mois_{}'.format(year)] = menage_en_mois
    del eecmen, erfmen, menage_en_mois, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
    gc.collect()
    temporary_store.close()
Esempio n. 3
0
def create_totals(year = None):

    assert year is not None
    temporary_store = TemporaryStore.create(file_name = "erfs")
    replace = create_replace(year)

    # On part de la table individu de l'ERFS
    # on renomme les variables

    log.info(u"Creating Totals")
    log.info(u"Etape 1 : Chargement des données")

    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    indivim = temporary_store['indivim_{}'.format(year)]

    assert not indivim.duplicated(['noindiv']).any(), "Présence de doublons"

    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.

    selection = Series()
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)

    indivi_i = indivim[selection].copy()
    indivi_i.rename(
        columns = {
            "ident": "idmen",
            "persfip": "quifoy",
            "zsali": "sali2",  # Inclu les salaires non imposables des agents d'assurance
            "zchoi": "choi2",
            "zrsti": "rsti2",
            "zalri": "alr2"
            },
        inplace = True,
        )

    indivi_i.quifoy = where(indivi_i.quifoy.isnull(), "vous", indivi_i.quifoy)
    indivi_i.quelfic = "FIP_IMP"

    # We merge them with the other individuals
    indivim.rename(
        columns = dict(
            ident = "idmen",
            persfip = "quifoy",
            zsali = "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi = "choi2",
            zrsti = "rsti2",
            zalri = "alr2",
            ),
        inplace = True,
        )

    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace = True)
    indivi_i.set_index("noindiv", inplace = True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index(inplace = True)

    log.info("Etape 2 : isolation des FIP")
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (
        indivi.idmen.astype("int64") * 100 +
        (indivi.declar1.str[0:2]).convert_objects(convert_numeric=True)
        )

    indivi.loc[fip_imp, "idfoy"] = np.nan
    # Certains FIP (ou du moins avec revenus imputés) ont un numéro de déclaration d'impôt ( pourquoi ?)
    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

    indivi["idfoy"] = where(
        fip_has_declar,
        indivi.idmen * 100 + indivi.declar1.str[0:2].convert_objects(convert_numeric = True),
        indivi.idfoy)
    del fip_has_declar

    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"])

    indivi_fnd = indivi[["idfoy", "noindiv"]][fip_no_declar].copy()

    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(
            indivi_fnd.duplicated(cols=["idfoy"]),
            indivi_fnd["idfoy"] + 1,
            indivi_fnd["idfoy"]
            )

    # assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(indivi_fnd["idfoy"].values), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    indivi.idfoy[fip_no_declar] = indivi_fnd.idfoy.copy()
    del indivi_fnd, fip_no_declar

    log.info(u"Etape 3 : Récupération des EE_NRT")

    nrt = indivi.quelfic == "EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.quifoy[nrt] = "vous"
    del nrt

    pref_or_cref = indivi.lpr.isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    # TODO: hack to avoid assert error
    log.info("{}".format(indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().value_counts()))
    assert indivi.idfoy[indivi.lpr.dropna().isin([1, 2])].all()

    log.info(u"Etape 4 : Rattachement des enfants aux déclarations")

    assert not(indivi.noindiv.duplicated().any()), "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.noindiv[enf_ee].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert not(indivi.noindiv[enf_ee].duplicated().any()), "Some noindiv appear twice"

    pere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]
        })
    mere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]
        })

    foyer = data.get_values(variables = ["noindiv", "zimpof"], table = replace["foyer"])
    pere = pere.merge(foyer, how = "inner", on = "noindiv")
    mere = mere.merge(foyer, how = "inner", on = "noindiv")
    df = pere.merge(mere, how = "outer", on = "noindiv_enf", suffixes=('_p', '_m'))

    log.info(u"    4.1 : gestion des personnes dans 2 foyers")
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        df[col] = df[col].fillna(0, inplace = True)  # beacause groupby drop groups with NA in index
    df = df.groupby(by = ["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace = True)

    df["which"] = ""
    df.which = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "")
    df.which = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df.which = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere")
    df.which = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")

    assert df.which.notnull().all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns = {"noindiv_enf": "noindiv"}, inplace = True)
    df['idfoy'] = where(df.which == "pere", df.noindiv_p, df.noindiv_m)
    df['idfoy'] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    dropped = [col for col in df.columns if col not in ["idfoy", "noindiv"]]
    df.drop(dropped, axis = 1, inplace = True)

    assert not(df.duplicated().any())

    df.set_index("noindiv", inplace = True, verify_integrity = True)
    indivi.set_index("noindiv", inplace = True, verify_integrity = True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (
        indivi["idfoy"].notnull().sum() +
        indivi["idfoy"].isnull().sum()
        )
    indivi.reset_index(inplace = True)
    assert not(indivi.duplicated().any())

    # MBJ: issue delt with when moving from R code to python
    # TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs
    # On suit guide méthodo erf 2003 page 135
    # On supprime les conjoints FIP et les FIP de 25 ans et plus;
    # On conserve les enfants FIP de 19 à 24 ans;
    # On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un
    # ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète
    # EEC que l'on ne retrouvera pas dans les EEC suivantes.
    #
    log.info(u"    4.2 : On enlève les individus pour lesquels il manque le déclarant")
    fip = temporary_store['fipDat_{}'.format(year)]
    fip["declar"] = np.nan
    fip["agepf"] = np.nan

    fip.drop(["actrec", "year", "noidec"], axis = 1, inplace = True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(
        columns = dict(
            ident = "idmen",
            persfip = "quifoy",
            zsali = "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi = "choi2",
            zrsti = "rsti2",
            zalri = "alr2"),
        inplace = True)

    is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25)

    # TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
    # The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
    # fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
    # fip[is_fip_19_25 ,"lpr"]  <- 1
    #
    # indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['age_en_mois'] = 12 * indivi.age + 12 - indivi.naim

    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = None  # Create a new row
    indivi.not_pr_cpr[indivi.lpr <= 2] = False
    indivi.not_pr_cpr[indivi.lpr > 2] = True

    assert indivi.not_pr_cpr.isin([True, False]).all()

    log.info(u"    4.3 : Creating non pr=0 and cpr=1 idmen's")
    indivi.reset_index(inplace = True)

    test1 = indivi[['quimen', 'idmen']][indivi.not_pr_cpr].copy()
    test1['quimen'] = 2

    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1
    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

    # indivi.set_index(['quimen']) #TODO: check relevance
    # TODO problème avec certains idfoy qui n'ont pas de vous
    log.info(u"Etape 5 : Gestion des idfoy qui n'ont pas de vous")
    all_ind = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi.quifoy == 'vous', 'idfoy']
    without = all_ind[~(all_ind.idfoy.isin(with_.values))]

    log.info(u"On cherche si le déclarant donné par la deuxième déclaration est bien un vous")

    # TODO: the following should be delt with at the import of the tables
    indivi.replace(
        to_replace = {
            'declar2': {'NA': np.nan, '': np.nan}
            },
        inplace = True
        )

    has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull())

    decl2_idfoy = (
        indivi.loc[has_declar2, "idmen"].astype('int') * 100 +
        indivi.loc[has_declar2, "declar2"].str[0:2].astype('int')        )
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del all_ind, with_, without, has_declar2

    log.info(u"    5.1 : Elimination idfoy restant")
    idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    # Sélectionne les variables à garder pour les steps suivants
    myvars = [
        "actrec",
        "age",
        "age_en_mois",
        "chpub",
        "encadr",
        "idfoy",
        "idmen",
        "nbsala",
        "noi",
        "noindiv",
        "prosa",
        "quelfic",
        "quifoy",
        "quimen",
        "statut",
        "titc",
        "txtppb",
        "wprm",
        "rc1rev",
        "maahe",
        ]

    assert len(set(myvars).difference(set(indivi.columns))) == 0, \
        "Manquent les colonnes suivantes : {}".format(set(myvars).difference(set(indivi.columns)))

    indivi = indivi[myvars].copy()
    # TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
    # les infos provenant des déclarations)
    log.info(u"Etape 6 : Création des variables descriptives")
    log.info(u"    6.1 : variable activité")
    indivi['activite'] = None
    indivi['activite'][indivi.actrec <= 3] = 0
    indivi['activite'][indivi.actrec == 4] = 1
    indivi['activite'][indivi.actrec == 5] = 2
    indivi['activite'][indivi.actrec == 7] = 3
    indivi['activite'][indivi.actrec == 8] = 4
    indivi['activite'][indivi.age <= 13] = 2  # ce sont en fait les actrec=9
    log.info("{}".format(indivi['activite'].value_counts(dropna = False)))
    # TODO: MBJ problem avec les actrec
    # TODO: FIX AND REMOVE
    indivi.activite[indivi.actrec.isnull()] = 5
    indivi.titc[indivi.titc.isnull()] = 0
    assert indivi.titc.notnull().all(), u"Problème avec les titc" # On a 420 NaN pour les varaibels statut, titc etc

    log.info(u"    6.2 : variable statut")
    indivi.statut[indivi.statut.isnull()] = 0
    indivi.statut = indivi.statut.astype('int')
    indivi.statut[indivi.statut == 11] = 1
    indivi.statut[indivi.statut == 12] = 2
    indivi.statut[indivi.statut == 13] = 3
    indivi.statut[indivi.statut == 21] = 4
    indivi.statut[indivi.statut == 22] = 5
    indivi.statut[indivi.statut == 33] = 6
    indivi.statut[indivi.statut == 34] = 7
    indivi.statut[indivi.statut == 35] = 8
    indivi.statut[indivi.statut == 43] = 9
    indivi.statut[indivi.statut == 44] = 10
    indivi.statut[indivi.statut == 45] = 11
    assert indivi.statut.isin(range(12)).all(), u"statut value over range"


    log.info(u"    6.3 : variable txtppb")
    indivi.txtppb.fillna(0, inplace = True)
    assert indivi.txtppb.notnull().all()

    indivi.nbsala.fillna(0, inplace = True)
    indivi['nbsala'] = indivi.nbsala.astype('int')
    indivi.nbsala[indivi.nbsala == 99] = 10
    assert indivi.nbsala.isin(range(11)).all()

    log.info(u"    6.4 : variable chpub et CSP")
    indivi.chpub.fillna(0, inplace = True)
    indivi.chpub = indivi.chpub.astype('int')
    indivi.chpub[indivi.chpub.isnull()] = 0
    assert indivi.chpub.isin(range(11)).all()

    indivi['cadre'] = 0
    indivi.prosa.fillna(0, inplace = True)
    assert indivi['prosa'].notnull().all()
    log.info("{}".format(indivi['encadr'].value_counts(dropna = False)))

    # encadr : 1=oui, 2=non
    indivi.encadr.fillna(2, inplace = True)
    indivi.encadr[indivi.encadr == 0] = 2

    assert indivi.encadr.notnull().all()
    assert indivi.encadr.isin([1, 2]).all()

    indivi['cadre'][indivi.prosa.isin([7, 8])] = 1
    indivi['cadre'][(indivi.prosa == 9) & (indivi.encadr == 1)] = 1

    assert indivi['cadre'].isin(range(2)).all()

    log.info(
        u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence")
    log.info(
        u"nb de doublons idfam/quifam {}".format(len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])))

    log.info(u"On crée les n° de personnes à charge")
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi.quifoy2[indivi.quifoy == 'vous'] = 0
    indivi.quifoy2[indivi.quifoy == 'conj'] = 1
    indivi.quifoy2[indivi.quifoy == 'pac'] = 2

    del indivi['quifoy']
    indivi['quifoy'] = indivi.quifoy2
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi[['quifoy', 'idfoy', 'noindiv']][indivi['quifoy'] == 2].copy()
    print_id(test2)

    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on = ['noindiv', 'idfoy'], how = "left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    log.info(
        u"nb de doublons idfam/quifam' {}".format(
            len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])
            )
        )
    print_id(indivi)

    log.info(u"Etape 8 : création des fichiers totaux")
    famille = temporary_store['famc_{}'.format(year)]

    log.info(u"    8.1 : création de tot2 & tot3")
    tot2 = indivi.merge(famille, on = 'noindiv', how = 'inner')
#     del famille # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug = True, verbose = True)
    assert tot2.quifam.notnull().all()

    temporary_store['tot2_{}'.format(year)] = tot2
    del indivi
    log.info(u"    tot2 saved")

    tot2.merge(foyer, how = 'left')

    tot2 = tot2[tot2.idmen.notnull()].copy()

    print_id(tot2)
    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(subset = 'noindiv')
    log.info("{}".format(len(tot3)))

    # Block to remove any unwanted duplicated pair
    control(tot3, debug = True, verbose = True)
    tot3 = tot3.drop_duplicates(subset = ['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(subset = ['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(subset = ['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(subset = ['noindiv'])
    control(tot3)

    log.info(u"    8.2 : On ajoute les variables individualisables")

    allvars = temporary_store['ind_vars_to_remove_{}'.format(year)]
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    log.info("{}".format(len(tot3)))

    assert not(tot3.duplicated(subset = ['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(lg_dup)

    temporary_store['tot3_{}'.format(year)] = tot3
    control(tot3)

    del tot2, allvars, tot3, vars2
    log.info(u"tot3 sauvegardé")
    gc.collect()
def create_indivim_menage_en_mois(year = None):
    """
    Création des tables ménages et individus concaténée (merged)
    """
    temporary_store = TemporaryStore.create(file_name = "erfs")
    assert year is not None
    # load data
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)

    replace = create_replace(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table = replace["erf_menage"])
    eecmen = survey.get_values(table = replace["eec_menage"])
    erfind = survey.get_values(table = replace["erf_indivi"])
    eecind = survey.get_values(table = replace["eec_indivi"])

    # travail sur la cohérence entre les bases
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()

    noappar_i = eecmen[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset = 'ident', take_last = True)
    # TODO: vérifier qu'il n'y a théoriquement pas de doublon

    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()

    # fusion enquete emploi et source fiscale
    menage_en_mois = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how = "inner")

    # optimisation des types? Controle de l'existence en passant
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = ([
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'titc',
        'txtppb',
        ])

    for var in var_list:
        assert indivim[var].dtype == numpy.dtype('int')

    ########################
    # création de variables#
    ########################

#   actrec : activité recodée comme preconisé par l'INSEE p84 du guide utilisateur
    indivim["actrec"] = numpy.nan
    # Attention : Q: pas de 6 ?!! A : Non pas de 6, la variable recodée de l'INSEE (voit p84 du guide methodo), ici \
    # la même nomenclatue à été adopée

    # 3: contrat a durée déterminée
    indivim['actrec'][indivim.acteu == 1] = 3
    # 8 : femme (homme) au foyer, autre inactif
    indivim['actrec'][indivim.acteu == 3] = 8
    # 1 : actif occupé non salarié
    filter1 = (indivim.acteu == 1) & (indivim.stc.isin([1, 3]))  # actifs occupés non salariés à son compte ou pour un
    indivim["actrec"][filter1] = 1                                # membre de sa famille
    # 2 : salarié pour une durée non limitée
    filter2 = (indivim.acteu == 1) & (((indivim.stc == 2) & (indivim.contra == 1)) | (indivim.titc == 2))
    indivim['actrec'][filter2] = 2
    # 4 : au chomage
    filter4 = (indivim.acteu == 2) | ((indivim.acteu == 3) & (indivim.mrec == 1))
    indivim['actrec'][filter4] = 4
    # 5 : élève étudiant , stagiaire non rémunéré
    filter5 = (indivim.acteu == 3) & ((indivim.forter == 2) | (indivim.rstg == 1))
    indivim['actrec'][filter5] = 5
    # 7 : retraité, préretraité, retiré des affaires unchecked
    filter7 = (indivim.acteu == 3) & ((indivim.retrai == 1) | (indivim.retrai == 2))
    indivim['actrec'][filter7] = 7
    # 9 : probablement enfants de - de 16 ans TODO: check that fact in database and questionnaire
    indivim['actrec'][indivim.acteu == 0] = 9

    indivim.actrec = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(1, 10)).all(), 'actrec values are outside the interval [1, 9]'

#   TODO : compare the result with results provided by Insee
#   tu99
    if year == 2009:
        erfind['tu99'] = None  # TODO: why ?

    # Locataire
    menage_en_mois["locataire"] = menage_en_mois.so.isin([3, 4, 5])
    assert_dtype(menage_en_mois.locataire, "bool")

    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menage_en_mois = menage_en_mois.merge(transfert)

    # Correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        '''
        if year == 2006:
            indivim.lien[indivim.noindiv == 603018905] = 2
            indivim.noimer[indivim.noindiv == 603018905] = 1
            log.info("{}".format(indivim[indivim.noindiv == 603018905].to_string()))

    _manually_remove_errors()

    temporary_store['menage_en_mois_{}'.format(year)] = menage_en_mois
    del eecmen, erfmen, menage_en_mois, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
    gc.collect()
    temporary_store.close()
def create_enfants_a_naitre(year=None):
    '''
    '''
    assert year is not None
    temporary_store = TemporaryStore.create(file_name="erfs")

    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Enfant à naître (NN pour nouveaux nés)
    individual_vars = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noimer',
        'noindiv',
        'noiper',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
    ]
    replace = create_replace(year)
    eeccmp1 = survey.get_values(table=replace["eec_cmp_1"],
                                variables=individual_vars)
    eeccmp2 = survey.get_values(table=replace["eec_cmp_2"],
                                variables=individual_vars)
    eeccmp3 = survey.get_values(table=replace["eec_cmp_3"],
                                variables=individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how="outer")
    enfants_a_naitre = tmp.merge(eeccmp3, how="outer")

    # optimisation des types? Controle de l'existence en passant
    # pourquoi pas des int quand c'est possible
    # TODO: minimal dtype TODO: shoudln't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars, tmp  #TODO: Adrien: me fait planter python
    gc.collect()

    # création de variables
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    enfants_a_naitre.year = enfants_a_naitre.year.astype(
        "float32")  # TODO: should be an integer but NaN are present
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    enfants_a_naitre['agepf'][enfants_a_naitre.naim >= 7] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""

    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")

    # selection
    enfants_a_naitre = enfants_a_naitre[(
        (enfants_a_naitre.naia == enfants_a_naitre.year) &
        (enfants_a_naitre.naim >= 10)) | (
            (enfants_a_naitre.naia == enfants_a_naitre.year + 1) &
            (enfants_a_naitre.naim <= 5))].copy()

    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
    temporary_store.close()
    gc.collect()
def create_comparable_erf_data_frame(year):
    '''
    Imputation des loyers
    '''
    temporary_store = TemporaryStore.create(file_name = "erfs")
    assert year is not None
    # Préparation des variables qui serviront à l'imputation
    replace = create_replace(year)
    menm_vars = [
        "aai1",
        "agpr",
        "cstotpr",
        "ident",
        "nat28pr",
        "nb_uci",
        "nbenfc",
        "nbpiec",
        "pol99",
        "reg",
        "so",
        "spr",
        "tau99"
        "tu99",
        "typmen5",
        "wprm",
        "zperm",
        "zracm",
        "zragm",
        "zricm",
        "zrncm",
        "ztsam",
        ]

    if year == 2008:  # Tau99 not present
        menm_vars = menm_vars.pop('tau99')

    indm_vars = ["dip11", 'ident', "lpr", "noi"]
    # Travail sur la base ERF
    # Preparing ERF menages tables
    erfmenm = temporary_store.select('menage_en_mois_{}'.format(year))

    erfmenm['revtot'] = (
        erfmenm.ztsam + erfmenm.zperm + erfmenm.zragm + erfmenm.zricm + erfmenm.zrncm + erfmenm.zracm
        )
    # Niveau de vie de la personne de référence
    erfmenm['nvpr'] = erfmenm.revtot.astype(numpy.float64) / erfmenm.nb_uci.astype("float")
    erfmenm.nvpr[erfmenm.nvpr < 0] = 0
    erfmenm['logt'] = erfmenm.so

    # Preparing ERF individuals table
    erfindm = temporary_store['indivim_{}'.format(year)]

    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm) == 0
#     erfindm['dip11'] = 99
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1].copy()

    log.info(u"Merging talbes menage et individus de l'ERF")
    erf = erfmenm.merge(erfindm, on ='ident', how='inner')
    erf = erf.drop_duplicates('ident')

    dec, values = mark_weighted_percentiles(
        erf.nvpr.values,
        numpy.arange(1, 11),
        erf.wprm.values,
        2,
        return_quantiles = True
        )
    values.sort()
    erf['deci'] = (
        1 +
        (erf.nvpr > values[1]) +
        (erf.nvpr > values[2]) +
        (erf.nvpr > values[3]) +
        (erf.nvpr > values[4]) +
        (erf.nvpr > values[5]) +
        (erf.nvpr > values[6]) +
        (erf.nvpr > values[7]) +
        (erf.nvpr > values[8]) +
        (erf.nvpr > values[9])
        )
    assert_variable_in_range('deci', [1, 11], erf)
    count_NA('deci', erf)
    del dec, values
    gc.collect()

    # TODO: faire le lien avec men_vars,
    # il manque "pol99","reg","tau99" et ici on a en plus logt, 'nvpr','revtot','dip11','deci'
    erf = erf[
        [
            'ident',
            'ztsam',
            'zperm',
            'zragm',
            'zricm',
            'zrncm',
            'zracm',
            'nb_uci',
            'logt',
            'nbpiec',
            'typmen5',
            'spr',
            'nbenfc',
            'agpr',
            'cstotpr',
            'nat28pr',
            'tu99',
            'aai1',
            'wprm',
            'nvpr',
            'revtot',
            'dip11',
            'deci',
            ]
        ][erf['so'].isin(range(3, 6))]

    erf.rename(
        columns = {
            'aai1': 'iaat',
            'dip11': 'mdiplo',
            'nbpiec': 'hnph2',
            'nat28pr': 'mnatio',
            'nbpiec': 'hnph2',
            },
        inplace = True)

    count_NA('agpr', erf)
    if (erf.agpr == '.').any():
        erf = erf[erf.agpr != '.'].copy()

    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: moche, pourquoi créer deux variables quand une suffit ?
    erf['magtr'] = 3
    erf.magtr[erf.agpr < 65] = 2
    erf.magtr[erf.agpr < 40] = 1
    count_NA('magtr', erf)
    assert erf.magtr.isin(range(1, 5)).all()

    count_NA('cstotpr', erf)
    erf['mcs8'] = erf.cstotpr.astype('float') / 10.0
    erf.mcs8 = numpy.floor(erf.mcs8)
    count_NA('mcs8', erf)

    erf['mtybd'] = numpy.nan
    erf.mtybd[(erf.typmen5 == 1) & (erf.spr != 2)] = 1
    erf.mtybd[(erf.typmen5 == 1) & (erf.spr == 2)] = 2
    erf.mtybd[erf.typmen5 == 5] = 3
    erf.mtybd[erf.typmen5 == 3] = 7
    erf.mtybd[erf.nbenfc == 1] = 4
    erf.mtybd[erf.nbenfc == 2] = 5
    erf.mtybd[erf.nbenfc >= 3] = 6
    count_NA('mtybd', erf)     # TODO il reste 41 NA's 2003, 40 en 2009

    # TODO:
    # assert erf.mtybd.isin(range(1,8)).all() # bug,

    # TODO : 3 logements ont 0 pièces !!
    erf.hnph2[erf.hnph2 < 1] = 1
    erf.hnph2[erf.hnph2 >= 6] = 6
    count_NA('hnph2', erf)
    assert erf.hnph2.isin(range(1,7)).all()

    # TODO: il reste un NA 2003
    #       il rest un NA en 2008  (Not rechecked)

    erf.mnatio[erf.mnatio == 10] = 1
    mnatio_range = range(11, 16) + range(21, 30) + range(31, 33) + range(41, 49) + range(51, 53) + [60] + [62]
    erf.mnatio[erf.mnatio.isin(mnatio_range)] = 2
    count_NA('mnatio', erf)
    assert erf.mnatio.isin(range(1, 3)).all()

    erf.iaat[erf.mnatio.isin([1, 2, 3])] = 1
    erf.iaat[erf.mnatio == 4] = 2
    erf.iaat[erf.mnatio == 5] = 3
    erf.iaat[erf.mnatio == 6] = 4
    erf.iaat[erf.mnatio == 7] = 5
    erf.iaat[erf.mnatio == 8] = 6
    count_NA('iaat', erf)
    assert erf.iaat.isin(range(1, 7)).all()

    # Il reste un NA en 2003
    # reste un NA en 2008
    # TODO: comparer logement et erf pour être sur que cela colle

    # TODO: assert erf.mdiplo.unique() != 0
    erf.mdiplo[erf.mdiplo.isin([71, ""])] = 1
    erf.mdiplo[erf.mdiplo.isin([70, 60, 50])] = 2
    erf.mdiplo[erf.mdiplo.isin([41, 42, 31, 33])] = 3
    erf.mdiplo[erf.mdiplo.isin([10, 11, 30])] = 4
    count_NA('mdiplo', erf)
    # assert_variable_inrange('mdiplo', [1,5], erf) # On a un 99 qui se balade
    erf['tu99_recoded'] = erf['tu99'].copy()
    erf.tu99_recoded[erf['tu99'] == 0] = 1
    erf.tu99_recoded[erf['tu99'].isin([1, 2, 3])] = 2
    erf.tu99_recoded[erf['tu99'].isin([4, 5, 6])] = 3
    erf.tu99_recoded[erf['tu99'] == 7] = 4
    erf.tu99_recoded[erf['tu99'] == 8] = 5
    count_NA('tu99_recoded', erf)
    assert erf.tu99_recoded.isin(range(1, 6)).all()

    erf.mcs8[erf.mcs8.isin([4, 8])] = 4
    erf.mcs8[erf.mcs8.isin([5, 6, 7])] = 5
    count_NA('mcs8', erf)

    # Drop NA = 0 values
    if not erf.mcs8.isin(range(1, 6)).all():
        erf = erf[erf.mcs8 != 0].copy()

    erf.wprm = erf.wprm.astype('int64')
    count_NA('wprm', erf)

    for dropped_variable in ['agpr', 'cstotpr', 'nbenfc', 'spr', 'tu99', 'typmen5']:
        del erf[dropped_variable]
    gc.collect()

    check_variables = [
        'logt',
        'magtr',
        'mcs8',
        'mtybd',
        'hnph2',
        'mnatio',
        'iaat',
        'mdiplo',
        'tu99_recoded',
        ]

    non_integer_variables_to_filter = list()
    integer_variables_to_filter = list()
    for check_variable in check_variables:
        if erf[check_variable].isnull().any():
            log.info("Des valeurs NaN sont encore présentes dans la variable {} la table ERF".format(check_variable))
            non_integer_variables_to_filter.append(check_variable)
        if (erf[check_variable] == 0).any():
            if numpy.issubdtype(erf[check_variable].dtype, numpy.integer):
                log.info(
                    "Des valeurs nulles sont encore présentes dans la variable {} la table ERF".format(check_variable))
                integer_variables_to_filter.append(check_variable)

    erf = erf.dropna(subset = non_integer_variables_to_filter)
    # On vérifie au final que l'on n'a pas de doublons d'individus
    assert not erf['ident'].duplicated().any(), 'Il y a {} doublons'.format(erf['ident'].duplicated().sum())
    return erf
def create_enfants_a_naitre(year = None):
    '''
    '''
    assert year is not None
    temporary_store = TemporaryStore.create(file_name = "erfs")

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Enfant à naître (NN pour nouveaux nés)
    individual_vars = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noimer',
        'noindiv',
        'noiper',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
        ]
    replace = create_replace(year)
    eeccmp1 = survey.get_values(table = replace["eec_cmp_1"], variables = individual_vars)
    eeccmp2 = survey.get_values(table = replace["eec_cmp_2"], variables = individual_vars)
    eeccmp3 = survey.get_values(table = replace["eec_cmp_3"], variables = individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how = "outer")
    enfants_a_naitre = tmp.merge(eeccmp3, how = "outer")

    # optimisation des types? Controle de l'existence en passant
    # pourquoi pas des int quand c'est possible
    # TODO: minimal dtype TODO: shoudln't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars, tmp  #TODO: Adrien: me fait planter python
    gc.collect()

    # création de variables
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    enfants_a_naitre.year = enfants_a_naitre.year.astype("float32")  # TODO: should be an integer but NaN are present
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    enfants_a_naitre['agepf'][enfants_a_naitre.naim >= 7] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""

    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")

    # selection
    enfants_a_naitre = enfants_a_naitre[
        (
            (enfants_a_naitre.naia == enfants_a_naitre.year) & (enfants_a_naitre.naim >= 10)
            ) | (
                (enfants_a_naitre.naia == enfants_a_naitre.year + 1) & (enfants_a_naitre.naim <= 5)
                )
        ].copy()

    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
    temporary_store.close()
    gc.collect()
def sif(year):

    temporary_store = TemporaryStore.create(file_name="erfs")

    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info("05_foyer: extraction des données foyer")
    # TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    # mnrvka  Revenu TH n-2
    # mnrvkh  revenu TH (revenu fiscal de référence)
    #
    # On récupère les variables du code sif
    # vars <- c("noindiv", 'sif', "nbptr", "mnrvka")
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    sif = data.get_values(variables=vars, table=replace["foyer"])

    sif['statmarit'] = 0

    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif.values
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif.values
        del old_sif, new_sif

    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) -
                               (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]

    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif["statmarit"][sif.stamar == key] = val

    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"

    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"

    if year in [2008]:
        d = -1  # fin de la case L
        sif["caseP"] = sif.sif.str[20 + d:21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d:22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d:23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d:24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d:28 + d]
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"

    if year in [2009]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"

    sif["caseX"] = sif.sif.str[33 + d:34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d:42 + d]
    sif["caseY"] = sif.sif.str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d:51 + d]
    sif["caseZ"] = sif.sif.str[51 + d:53 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d:60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d:61 + d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    sif["nbptr"] = sif.nbptr.values / 100
    sif["rfr_n_2"] = sif.mnrvka.values

    sif["nbF"] = sif.sif.str[64 + d:66 + d]
    sif["nbG"] = sif.sif.str[67 + d:69 + d]
    sif["nbR"] = sif.sif.str[70 + d:72 + d]
    sif["nbJ"] = sif.sif.str[73 + d:75 + d]
    sif["nbN"] = sif.sif.str[76 + d:78 + d]
    sif["nbH"] = sif.sif.str[79 + d:81 + d]
    sif["nbI"] = sif.sif.str[82 + d:84 + d]

    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d:87 + d]

    del sif["sif"], sif["stamar"]

    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of distinct individuals: {}".format(
        len(sif.noindiv.value_counts())))

    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts()) == len(sif_drop_duplicated["noindiv"]), \
        "Number of distinct individuals after removing duplicates is not correct"

    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def foyer_all(year):
    year = 2009
    replace = create_replace(year)
    temporary_store = TemporaryStore.create(file_name="erfs")

    # On ajoute les cases de la déclaration
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table=replace["foyer"])
    # on ne garde que les cases de la déclaration ('_xzz') ou ^_[0-9][a-z]{2}")
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # rename variable to fxzz ou ^f[0-9][a-z]{2}")
    renamed_variables = ["f{}".format(x[1:]) for x in variables]

    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns=dict(zip(variables, renamed_variables)), inplace=True)

    # On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)

    # On récupère les variables individualisables
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr',
                     'f1er'],  # (déménagement) uniquement en 2006
        # Charges déductibles
        'f6de':
        ['f6de'],  # CSG déductible calculée sur les revenus du patrimoine
        # Pensions alimentaires
        'f6gi':
        ['f6gi'
         ],  # Pensions alimenaires versées à des enfants majeurs (1e enfant)
        'f6gj':
        ['f6gj'
         ],  # Pensions alimenaires versées à des enfants majeurs (2e enfant)
        'f6el': [
            'f6el'
        ],  # Autres pensions alimenaires versées à des enfants majeurs (1e enfant)
        'f6el': [
            'f6em'
        ],  # Autres pensions alimenaires versées à des enfants majeurs (2e enfant)
        'f6gp': [
            'f6gp'
        ],  # Autres pensions alimenaires versées (décision de juscitce avant 2006)
        'f6gu': ['f6gu'],  # Autres pensions alimenaires versées
        'f6dd': ['f6dd'],  # Déductions diverses
        # Epargne retraite PERP
        'f6rs': ['f6rs', 'f6rt', 'f6ru'],  #
        'f6ss': ['f6ss', 'f6st', 'f6su'],  #
        'f6ps': ['f6ps', 'f6pt', 'f6pu'],  #
        'f6qr': ['f6qr'],  #
        'f6qw': ['f6qw'],  #
        'f6qs': ['f6qs', 'f6qt', 'f6qu'],  #
        # Réductions d'impôt
        'f7ud': ['f7ud'],  # Dons
        'f7ue': ['f7ue'],  # Dons
        'f7uf': ['f7uf'],  # Autres dons
        'f7xs': ['f7xs'],  # report années antérieures
        'f7xt': ['f7xt'],  # report années antérieures
        'f7xu': ['f7xu'],  # report années antérieures
        'f7xw': ['f7xw'],  # report années antérieures
        'f7xy': ['f7xy'],  # report années antérieures
        'f7ac': ['f7ac', 'f7ae', 'f7ag'],  # Cotisations syndicales
        'f7ad': ['f7ad', 'f7af', 'f7ah'],  #
        # Enfants poursuivant leurs études
        'f7ea': ['f7ea'],  #
        'f7ec': ['f7ec'],  #
        'f7ef': ['f7ef'],  #
        'f7eb': ['f7eb'],  #
        'f7ed': ['f7ed'],  #
        'f7eg': ['f7eg'],  #
        # Salarié à domicile
        'f7db': ['f7db'],  #
        'f7df': ['f7df'],  #
        'f7dq': ['f7dq'],  #
        'f7dg': ['f7dg'],  #
        'f7dl': ['f7dl'],  #
        'f7gz': ['f7gz'],  # Prime de rente survie, contrat d'épargne handicap
        'f7cd': ['f7cd'],  #
        'f7ce': ['f7ce'],  #
        # Dépenses en faveur de la qualité environnementale de l'habitation principale
        'f7we': ['f7we'],  #
        'f7wh': ['f7wh'],  #
        'f7wk': ['f7wk'],  #
        'f7wf': ['f7wf'],  #
        'f7wg': ['f7wg'],  #
        'f7wj': ['f7wj'],  #
        'f7wi': ['f7wi'],  #
        'f7vz': ['f7vz'],  #
        'f7vx': ['f7vx'],  #
        # Divers
        'f8by': ['f8by', 'f8cy'],  # Elus locaux
        'f8ut': ['f8ut'],  #
        #Revenus à l'étranger
        'f8ti': ['f8ti'],  # Revenus à l'étranger
        'f8tl': ['f8tl'],  #
        'f8tk': ['f8tk'],  #
        'f8fv': ['f8fv'],  #
        'f8tt': ['f8tt'],  #
        'f8uu': ['f8uu'],  #
    }

    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))

    log.info(u"From {} variables, we keep {} eligibles variables".format(
        len(set().union(*vars_sets)),
        len(eligible_vars),
    ))

    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    #    err = 0
    #    err_vars = {}

    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence)
                    if present is True
                ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index(
        )  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()

        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind.quifoy == 'vous'] = 0
    foy_ind['quifoy'][foy_ind.quifoy == 'conj'] = 1
    foy_ind['quifoy'][foy_ind.quifoy == 'pac1'] = 2
    foy_ind['quifoy'][foy_ind.quifoy == 'pac2'] = 3
    foy_ind['quifoy'][foy_ind.quifoy == 'pac3'] = 4

    assert foy_ind.quifoy.isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind

    return
def create_comparable_erf_data_frame(year):
    '''
    Imputation des loyers
    '''
    temporary_store = TemporaryStore.create(file_name="erfs")
    assert year is not None
    # Préparation des variables qui serviront à l'imputation
    replace = create_replace(year)
    menm_vars = [
        "aai1",
        "agpr",
        "cstotpr",
        "ident",
        "nat28pr",
        "nb_uci",
        "nbenfc",
        "nbpiec",
        "pol99",
        "reg",
        "so",
        "spr",
        "tau99"
        "tu99",
        "typmen5",
        "wprm",
        "zperm",
        "zracm",
        "zragm",
        "zricm",
        "zrncm",
        "ztsam",
    ]

    if year == 2008:  # Tau99 not present
        menm_vars = menm_vars.pop('tau99')

    indm_vars = ["dip11", 'ident', "lpr", "noi"]
    # Travail sur la base ERF
    # Preparing ERF menages tables
    erfmenm = temporary_store.select('menage_en_mois_{}'.format(year))

    erfmenm['revtot'] = (erfmenm.ztsam + erfmenm.zperm + erfmenm.zragm +
                         erfmenm.zricm + erfmenm.zrncm + erfmenm.zracm)
    # Niveau de vie de la personne de référence
    erfmenm['nvpr'] = erfmenm.revtot.astype(
        numpy.float64) / erfmenm.nb_uci.astype("float")
    erfmenm.nvpr[erfmenm.nvpr < 0] = 0
    erfmenm['logt'] = erfmenm.so

    # Preparing ERF individuals table
    erfindm = temporary_store['indivim_{}'.format(year)]

    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm) == 0
    #     erfindm['dip11'] = 99
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1].copy()

    log.info(u"Merging talbes menage et individus de l'ERF")
    erf = erfmenm.merge(erfindm, on='ident', how='inner')
    erf = erf.drop_duplicates('ident')

    dec, values = mark_weighted_percentiles(erf.nvpr.values,
                                            numpy.arange(1, 11),
                                            erf.wprm.values,
                                            2,
                                            return_quantiles=True)
    values.sort()
    erf['deci'] = (1 + (erf.nvpr > values[1]) + (erf.nvpr > values[2]) +
                   (erf.nvpr > values[3]) + (erf.nvpr > values[4]) +
                   (erf.nvpr > values[5]) + (erf.nvpr > values[6]) +
                   (erf.nvpr > values[7]) + (erf.nvpr > values[8]) +
                   (erf.nvpr > values[9]))
    assert_variable_in_range('deci', [1, 11], erf)
    count_NA('deci', erf)
    del dec, values
    gc.collect()

    # TODO: faire le lien avec men_vars,
    # il manque "pol99","reg","tau99" et ici on a en plus logt, 'nvpr','revtot','dip11','deci'
    erf = erf[[
        'ident',
        'ztsam',
        'zperm',
        'zragm',
        'zricm',
        'zrncm',
        'zracm',
        'nb_uci',
        'logt',
        'nbpiec',
        'typmen5',
        'spr',
        'nbenfc',
        'agpr',
        'cstotpr',
        'nat28pr',
        'tu99',
        'aai1',
        'wprm',
        'nvpr',
        'revtot',
        'dip11',
        'deci',
    ]][erf['so'].isin(range(3, 6))]

    erf.rename(columns={
        'aai1': 'iaat',
        'dip11': 'mdiplo',
        'nbpiec': 'hnph2',
        'nat28pr': 'mnatio',
        'nbpiec': 'hnph2',
    },
               inplace=True)

    count_NA('agpr', erf)
    if (erf.agpr == '.').any():
        erf = erf[erf.agpr != '.'].copy()

    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: moche, pourquoi créer deux variables quand une suffit ?
    erf['magtr'] = 3
    erf.magtr[erf.agpr < 65] = 2
    erf.magtr[erf.agpr < 40] = 1
    count_NA('magtr', erf)
    assert erf.magtr.isin(range(1, 5)).all()

    count_NA('cstotpr', erf)
    erf['mcs8'] = erf.cstotpr.astype('float') / 10.0
    erf.mcs8 = numpy.floor(erf.mcs8)
    count_NA('mcs8', erf)

    erf['mtybd'] = numpy.nan
    erf.mtybd[(erf.typmen5 == 1) & (erf.spr != 2)] = 1
    erf.mtybd[(erf.typmen5 == 1) & (erf.spr == 2)] = 2
    erf.mtybd[erf.typmen5 == 5] = 3
    erf.mtybd[erf.typmen5 == 3] = 7
    erf.mtybd[erf.nbenfc == 1] = 4
    erf.mtybd[erf.nbenfc == 2] = 5
    erf.mtybd[erf.nbenfc >= 3] = 6
    count_NA('mtybd', erf)  # TODO il reste 41 NA's 2003, 40 en 2009

    # TODO:
    # assert erf.mtybd.isin(range(1,8)).all() # bug,

    # TODO : 3 logements ont 0 pièces !!
    erf.hnph2[erf.hnph2 < 1] = 1
    erf.hnph2[erf.hnph2 >= 6] = 6
    count_NA('hnph2', erf)
    assert erf.hnph2.isin(range(1, 7)).all()

    # TODO: il reste un NA 2003
    #       il rest un NA en 2008  (Not rechecked)

    erf.mnatio[erf.mnatio == 10] = 1
    mnatio_range = range(11, 16) + range(21, 30) + range(31, 33) + range(
        41, 49) + range(51, 53) + [60] + [62]
    erf.mnatio[erf.mnatio.isin(mnatio_range)] = 2
    count_NA('mnatio', erf)
    assert erf.mnatio.isin(range(1, 3)).all()

    erf.iaat[erf.mnatio.isin([1, 2, 3])] = 1
    erf.iaat[erf.mnatio == 4] = 2
    erf.iaat[erf.mnatio == 5] = 3
    erf.iaat[erf.mnatio == 6] = 4
    erf.iaat[erf.mnatio == 7] = 5
    erf.iaat[erf.mnatio == 8] = 6
    count_NA('iaat', erf)
    assert erf.iaat.isin(range(1, 7)).all()

    # Il reste un NA en 2003
    # reste un NA en 2008
    # TODO: comparer logement et erf pour être sur que cela colle

    # TODO: assert erf.mdiplo.unique() != 0
    erf.mdiplo[erf.mdiplo.isin([71, ""])] = 1
    erf.mdiplo[erf.mdiplo.isin([70, 60, 50])] = 2
    erf.mdiplo[erf.mdiplo.isin([41, 42, 31, 33])] = 3
    erf.mdiplo[erf.mdiplo.isin([10, 11, 30])] = 4
    count_NA('mdiplo', erf)
    # assert_variable_inrange('mdiplo', [1,5], erf) # On a un 99 qui se balade
    erf['tu99_recoded'] = erf['tu99'].copy()
    erf.tu99_recoded[erf['tu99'] == 0] = 1
    erf.tu99_recoded[erf['tu99'].isin([1, 2, 3])] = 2
    erf.tu99_recoded[erf['tu99'].isin([4, 5, 6])] = 3
    erf.tu99_recoded[erf['tu99'] == 7] = 4
    erf.tu99_recoded[erf['tu99'] == 8] = 5
    count_NA('tu99_recoded', erf)
    assert erf.tu99_recoded.isin(range(1, 6)).all()

    erf.mcs8[erf.mcs8.isin([4, 8])] = 4
    erf.mcs8[erf.mcs8.isin([5, 6, 7])] = 5
    count_NA('mcs8', erf)

    # Drop NA = 0 values
    if not erf.mcs8.isin(range(1, 6)).all():
        erf = erf[erf.mcs8 != 0].copy()

    erf.wprm = erf.wprm.astype('int64')
    count_NA('wprm', erf)

    for dropped_variable in [
            'agpr', 'cstotpr', 'nbenfc', 'spr', 'tu99', 'typmen5'
    ]:
        del erf[dropped_variable]
    gc.collect()

    check_variables = [
        'logt',
        'magtr',
        'mcs8',
        'mtybd',
        'hnph2',
        'mnatio',
        'iaat',
        'mdiplo',
        'tu99_recoded',
    ]

    non_integer_variables_to_filter = list()
    integer_variables_to_filter = list()
    for check_variable in check_variables:
        if erf[check_variable].isnull().any():
            log.info(
                "Des valeurs NaN sont encore présentes dans la variable {} la table ERF"
                .format(check_variable))
            non_integer_variables_to_filter.append(check_variable)
        if (erf[check_variable] == 0).any():
            if numpy.issubdtype(erf[check_variable].dtype, numpy.integer):
                log.info(
                    "Des valeurs nulles sont encore présentes dans la variable {} la table ERF"
                    .format(check_variable))
                integer_variables_to_filter.append(check_variable)

    erf = erf.dropna(subset=non_integer_variables_to_filter)
    # On vérifie au final que l'on n'a pas de doublons d'individus
    assert not erf['ident'].duplicated().any(), 'Il y a {} doublons'.format(
        erf['ident'].duplicated().sum())
    return erf
def sif(year):

    temporary_store = TemporaryStore.create(file_name = "erfs")

    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info("05_foyer: extraction des données foyer")
    # TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    # mnrvka  Revenu TH n-2
    # mnrvkh  revenu TH (revenu fiscal de référence)
    #
    # On récupère les variables du code sif
    # vars <- c("noindiv", 'sif', "nbptr", "mnrvka")
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    sif = data.get_values(variables = vars, table = replace["foyer"])

    sif['statmarit'] = 0

    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif.values
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif.values
        del old_sif, new_sif

    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) - (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]

    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif["statmarit"][sif.stamar == key] = val

    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"

    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"

    if year in [2008]:
        d = - 1  # fin de la case L
        sif["caseP"] = sif.sif.str[20 + d: 21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d: 22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d: 23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d: 24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d: 28 + d]
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"

    if year in [2009]:
        sif["caseL"] = sif.sif.str[19: 20] == "L"
        sif["caseP"] = sif.sif.str[20: 21] == "P"
        sif["caseS"] = sif.sif.str[21: 22] == "S"
        sif["caseW"] = sif.sif.str[22: 23] == "W"
        sif["caseN"] = sif.sif.str[23: 24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"

    sif["caseX"] = sif.sif.str[33 + d: 34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d: 42 + d]
    sif["caseY"] = sif.sif.str[42 + d: 43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d: 51 + d]
    sif["caseZ"] = sif.sif.str[51 + d: 53 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d: 60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d: 61 + d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    sif["nbptr"] = sif.nbptr.values / 100
    sif["rfr_n_2"] = sif.mnrvka.values

    sif["nbF"] = sif.sif.str[64 + d: 66 + d]
    sif["nbG"] = sif.sif.str[67 + d: 69 + d]
    sif["nbR"] = sif.sif.str[70 + d: 72 + d]
    sif["nbJ"] = sif.sif.str[73 + d: 75 + d]
    sif["nbN"] = sif.sif.str[76 + d: 78 + d]
    sif["nbH"] = sif.sif.str[79 + d: 81 + d]
    sif["nbI"] = sif.sif.str[82 + d: 84 + d]

    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d: 87 + d]

    del sif["sif"], sif["stamar"]

    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of distinct individuals: {}".format(len(sif.noindiv.value_counts())))

    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts()) == len(sif_drop_duplicated["noindiv"]), \
        "Number of distinct individuals after removing duplicates is not correct"

    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def foyer_all(year):
    year = 2009
    replace = create_replace(year)
    temporary_store = TemporaryStore.create(file_name = "erfs")

    # On ajoute les cases de la déclaration
    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table = replace["foyer"])
    # on ne garde que les cases de la déclaration ('_xzz') ou ^_[0-9][a-z]{2}")
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # rename variable to fxzz ou ^f[0-9][a-z]{2}")
    renamed_variables = ["f{}".format(x[1:]) for x in variables]

    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns = dict(zip(variables, renamed_variables)), inplace = True)


    # On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    foyer = foyer.groupby("noindiv", as_index = False).aggregate(numpy.sum)
    print_id(foyer)

    # On récupère les variables individualisables
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'], # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (déménagement) uniquement en 2006
        # Charges déductibles
        'f6de': ['f6de'],  # CSG déductible calculée sur les revenus du patrimoine
        # Pensions alimentaires
        'f6gi': ['f6gi'],  # Pensions alimenaires versées à des enfants majeurs (1e enfant)
        'f6gj': ['f6gj'],  # Pensions alimenaires versées à des enfants majeurs (2e enfant)
        'f6el': ['f6el'],  # Autres pensions alimenaires versées à des enfants majeurs (1e enfant)
        'f6el': ['f6em'],  # Autres pensions alimenaires versées à des enfants majeurs (2e enfant)
        'f6gp': ['f6gp'],  # Autres pensions alimenaires versées (décision de juscitce avant 2006)
        'f6gu': ['f6gu'],  # Autres pensions alimenaires versées
        'f6dd': ['f6dd'],  # Déductions diverses
        # Epargne retraite PERP
        'f6rs': ['f6rs','f6rt','f6ru'],  #
        'f6ss': ['f6ss','f6st','f6su'],  #
        'f6ps': ['f6ps','f6pt','f6pu'],  #
        'f6qr': ['f6qr'],  #
        'f6qw': ['f6qw'],  #
        'f6qs': ['f6qs','f6qt','f6qu'],  #
        # Réductions d'impôt
        'f7ud': ['f7ud'],  # Dons
        'f7ue': ['f7ue'],  # Dons
        'f7uf': ['f7uf'],  # Autres dons
        'f7xs': ['f7xs'],  # report années antérieures
        'f7xt': ['f7xt'],  # report années antérieures
        'f7xu': ['f7xu'],  # report années antérieures
        'f7xw': ['f7xw'],  # report années antérieures
        'f7xy': ['f7xy'],  # report années antérieures
        'f7ac': ['f7ac','f7ae','f7ag'],  # Cotisations syndicales
        'f7ad': ['f7ad','f7af','f7ah'],  #
        # Enfants poursuivant leurs études
        'f7ea': ['f7ea'],  #
        'f7ec': ['f7ec'],  #
        'f7ef': ['f7ef'],  #
        'f7eb': ['f7eb'],  #
        'f7ed': ['f7ed'],  #
        'f7eg': ['f7eg'],  #
        # Salarié à domicile
        'f7db': ['f7db'],  #
        'f7df': ['f7df'],  #
        'f7dq': ['f7dq'],  #
        'f7dg': ['f7dg'],  #
        'f7dl': ['f7dl'],  #
        'f7gz': ['f7gz'],  # Prime de rente survie, contrat d'épargne handicap
        'f7cd': ['f7cd'],  #
        'f7ce': ['f7ce'],  #
        # Dépenses en faveur de la qualité environnementale de l'habitation principale
        'f7we': ['f7we'],  #
        'f7wh': ['f7wh'],  #
        'f7wk': ['f7wk'],  #
        'f7wf': ['f7wf'],  #
        'f7wg': ['f7wg'],  #
        'f7wj': ['f7wj'],  #
        'f7wi': ['f7wi'],  #
        'f7vz': ['f7vz'],  #
        'f7vx': ['f7vx'],  #
        # Divers
        'f8by': ['f8by','f8cy'],  # Elus locaux
        'f8ut': ['f8ut'],  #
        #Revenus à l'étranger
        'f8ti': ['f8ti'],  # Revenus à l'étranger
        'f8tl': ['f8tl'],  #
        'f8tk': ['f8tk'],  #
        'f8fv': ['f8fv'],  #
        'f8tt': ['f8tt'],  #
        'f8uu': ['f8uu'],  #
        }

    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(set(list(foyer.columns)))

    log.info(
        u"From {} variables, we keep {} eligibles variables".format(
            len(set().union(*vars_sets)),
            len(eligible_vars),
            )
        )

    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    #    err = 0
    #    err_vars = {}

    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [var for var, present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()

        # Reshape the dataframe
        selection.rename(columns = dict(zip(foyer_vars, qui)), inplace = True)
        selection.set_index("noindiv", inplace = True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()

        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis = 1, join = 'outer')

    foy_ind.reset_index(inplace = True)

    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns = {"noindiv": "idfoy"}, inplace = True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind.quifoy == 'vous'] = 0
    foy_ind['quifoy'][foy_ind.quifoy == 'conj'] = 1
    foy_ind['quifoy'][foy_ind.quifoy == 'pac1'] = 2
    foy_ind['quifoy'][foy_ind.quifoy == 'pac2'] = 3
    foy_ind['quifoy'][foy_ind.quifoy == 'pac3'] = 4

    assert foy_ind.quifoy .isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind

    return