Ejemplo n.º 1
0
def create_final(year):

    temporary_store = TemporaryStore.create(file_name = "erfs")

    log.info(u"création de final")
    foy_ind = temporary_store['foy_ind_{}'.format(year)]
    tot3 = temporary_store['tot3_{}'.format(year)]

    foy_ind.set_index(['idfoy', 'quifoy'], inplace = True)
    tot3.set_index(['idfoy', 'quifoy'], inplace = True)
    final = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1)
    final.reset_index(inplace = True)
    foy_ind.reset_index(inplace = True)
    tot3.reset_index(inplace = True)

    # tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    final = final[final.idmen.notnull()]

    control(final, verbose=True)
    del tot3, foy_ind
    gc.collect()

    # final <- merge(final, sif, by = c('noindiv'), all.x = TRUE)
    log.info("    loading fip")
    sif = temporary_store['sif_{}'.format(year)]

    log.info("{}".format(sif.columns))
    log.info("    update final using fip")
    final = final.merge(sif, on=["noindiv"], how="left")
    # TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS

    control(final, debug=True)

    final['caseP'] = final.caseP.fillna(False)
    final['caseF'] = final.caseF.fillna(False)
    print_id(final)

    temporary_store['final_{}'.format(year)] = final
    log.info(u"final sauvegardé")
    del sif, final
def create_final(temporary_store=None, year=None):

    assert temporary_store is not None
    assert year is not None

    log.info(u"création de final")
    foy_ind = temporary_store['foy_ind_{}'.format(year)]
    tot3 = temporary_store['tot3_{}'.format(year)]

    log.info(u"Stats on tot3")
    print_id(tot3)
    log.info(u"Stats on foy_ind")
    print_id(foy_ind)
    foy_ind.set_index(['idfoy', 'quifoy'], inplace=True, verify_integrity=True)
    tot3.set_index(['idfoy', 'quifoy'], inplace=True, verify_integrity=True)

    # tot3 = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1, verify_integrity = True)

    # TODO improve this
    foy_ind.drop([u'alr', u'rsti', u'sali', u'choi'], axis=1, inplace=True)
    tot3 = tot3.join(foy_ind)
    tot3.reset_index(inplace=True)
    foy_ind.reset_index(inplace=True)

    # tot3 = tot3.drop_duplicates(subset=['idfam', 'quifam'])
    control(tot3, verbose=True)
    final = tot3.loc[tot3.idmen.notnull(), :].copy()

    control(final, verbose=True)
    del tot3, foy_ind
    gc.collect()

    log.info("    loading fip")
    sif = temporary_store['sif_{}'.format(year)]
    log.info("Columns from sif dataframe: {}".format(sif.columns))
    log.info("    update final using fip")
    final.set_index('noindiv', inplace=True, verify_integrity=True)

    # TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS
    #  On ne garde que les sif.noindiv qui correspondent à des idfoy == "vous"
    #  Et on enlève les duplicates
    idfoys = final.loc[final.quifoy == 0, "idfoy"]
    sif = sif[sif.noindiv.isin(idfoys)
              & ~(sif.change.isin(['M', 'S', 'Z']))].copy()
    sif.drop_duplicates(subset=['noindiv'], inplace=True)
    sif.set_index('noindiv', inplace=True, verify_integrity=True)
    final = final.join(sif)
    final.reset_index(inplace=True)

    control(final, debug=True)

    final['caseP'] = final.caseP.fillna(False)
    final['caseF'] = final.caseF.fillna(False)
    print_id(final)

    temporary_store['final_{}'.format(year)] = final
    log.info(u"final sauvegardé")
    del sif, final
def create_final(temporary_store = None, year = None):

    assert temporary_store is not None
    assert year is not None

    log.info(u"création de final")
    foy_ind = temporary_store['foy_ind_{}'.format(year)]
    tot3 = temporary_store['tot3_{}'.format(year)]

    log.info(u"Stats on tot3")
    print_id(tot3)
    log.info(u"Stats on foy_ind")
    print_id(foy_ind)
    foy_ind.set_index(['idfoy', 'quifoy'], inplace = True, verify_integrity = True)
    tot3.set_index(['idfoy', 'quifoy'], inplace = True, verify_integrity = True)

    # tot3 = concat([tot3, foy_ind], join_axes=[tot3.index], axis=1, verify_integrity = True)

    # TODO improve this
    foy_ind.drop([u'alr', u'rsti', u'sali', u'choi'], axis = 1, inplace = True)
    tot3 = tot3.join(foy_ind)
    tot3.reset_index(inplace = True)
    foy_ind.reset_index(inplace = True)

    # tot3 = tot3.drop_duplicates(subset=['idfam', 'quifam'])
    control(tot3, verbose=True)
    final = tot3.loc[tot3.idmen.notnull(), :].copy()

    control(final, verbose=True)
    del tot3, foy_ind
    gc.collect()

    log.info("    loading fip")
    sif = temporary_store['sif_{}'.format(year)]
    log.info("Columns from sif dataframe: {}".format(sif.columns))
    log.info("    update final using fip")
    final.set_index('noindiv', inplace = True, verify_integrity = True)

    # TODO: IL FAUT UNE METHODE POUR GERER LES DOUBLES DECLARATIONS
    #  On ne garde que les sif.noindiv qui correspondent à des idfoy == "vous"
    #  Et on enlève les duplicates
    idfoys = final.loc[final.quifoy == 0, "idfoy"]
    sif = sif[sif.noindiv.isin(idfoys) & ~(sif.change.isin(['M', 'S', 'Z']))].copy()
    sif.drop_duplicates(subset = ['noindiv'], inplace = True)
    sif.set_index('noindiv', inplace = True, verify_integrity = True)
    final = final.join(sif)
    final.reset_index(inplace = True)

    control(final, debug=True)

    final['caseP'] = final.caseP.fillna(False)
    final['caseF'] = final.caseF.fillna(False)
    print_id(final)

    temporary_store['final_{}'.format(year)] = final
    log.info(u"final sauvegardé")
    del sif, final
def create_totals_second_pass(temporary_store=None, year=None):
    assert temporary_store is not None
    assert year is not None

    log.info(u"    5.1 : Elimination idfoy restant")
    # Voiture balai
    # On a plein d'idfoy vides, on fait 1 ménage = 1 foyer fiscal
    indivi = temporary_store['indivi_step_06_{}'.format(year)]
    idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].unique()
    indivi_without_idfoy = ~indivi.idfoy.isin(idfoyList)

    indivi.loc[indivi_without_idfoy, 'quifoy'] = "pac"

    indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) &
               (indivi.age >= 18), 'quifoy'] = "vous"
    indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) &
               (indivi.age >= 18), 'idfoy'] = (
                   indivi.loc[indivi_without_idfoy, "idmen"].astype('int') *
                   100 + 51)
    indivi.loc[indivi_without_idfoy & (indivi.quimen == 1) &
               (indivi.age >= 18), 'quifoy'] = "conj"

    del idfoyList
    print_id(indivi)

    # Sélectionne les variables à garder pour les steps suivants
    variables = [
        "actrec",
        "age",
        "age_en_mois",
        "chpub",
        "encadr",
        "idfoy",
        "idmen",
        "nbsala",
        "noi",
        "noindiv",
        "prosa",
        "quelfic",
        "quifoy",
        "quimen",
        "statut",
        "titc",
        "txtppb",
        "wprm",
        "rc1rev",
        "maahe",
        "sali",
        "rsti",
        "choi",
        "alr",
        "wprm",
    ]

    assert set(variables).issubset(set(indivi.columns)), \
        "Manquent les colonnes suivantes : {}".format(set(variables).difference(set(indivi.columns)))

    dropped_columns = [
        variable for variable in indivi.columns if variable not in variables
    ]
    indivi.drop(dropped_columns, axis=1, inplace=True)

    #  see http://stackoverflow.com/questions/11285613/selecting-columns
    indivi.reset_index(inplace=True)
    gc.collect()

    # TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
    # les infos provenant des déclarations)
    log.info(u"Etape 6 : Création des variables descriptives")
    log.info(u"    6.1 : Variable activité")
    log.info(u"Variables présentes; \n {}".format(indivi.columns))

    indivi['activite'] = np.nan
    indivi.loc[indivi.actrec <= 3, 'activite'] = 0
    indivi.loc[indivi.actrec == 4, 'activite'] = 1
    indivi.loc[indivi.actrec == 5, 'activite'] = 2
    indivi.loc[indivi.actrec == 7, 'activite'] = 3
    indivi.loc[indivi.actrec == 8, 'activite'] = 4
    indivi.loc[indivi.age <= 13,
               'activite'] = 2  # ce sont en fait les actrec=9
    log.info("Valeurs prises par la variable activité \n {}".format(
        indivi['activite'].value_counts(dropna=False)))
    # TODO: MBJ problem avec les actrec
    # TODO: FIX AND REMOVE
    indivi.loc[indivi.actrec.isnull(), 'activite'] = 5
    indivi.loc[indivi.titc.isnull(), 'titc'] = 0
    assert indivi.titc.notnull().all(), \
        u"Problème avec les titc"  # On a 420 NaN pour les varaibels statut, titc etc

    log.info(u"    6.2 : Variable statut")
    indivi.loc[indivi.statut.isnull(), 'statut'] = 0
    indivi.statut = indivi.statut.astype('int')
    indivi.loc[indivi.statut == 11, 'statut'] = 1
    indivi.loc[indivi.statut == 12, 'statut'] = 2
    indivi.loc[indivi.statut == 13, 'statut'] = 3
    indivi.loc[indivi.statut == 21, 'statut'] = 4
    indivi.loc[indivi.statut == 22, 'statut'] = 5
    indivi.loc[indivi.statut == 33, 'statut'] = 6
    indivi.loc[indivi.statut == 34, 'statut'] = 7
    indivi.loc[indivi.statut == 35, 'statut'] = 8
    indivi.loc[indivi.statut == 43, 'statut'] = 9
    indivi.loc[indivi.statut == 44, 'statut'] = 10
    indivi.loc[indivi.statut == 45, 'statut'] = 11
    assert indivi.statut.isin(range(12)).all(), u"statut value over range"
    log.info("Valeurs prises par la variable statut \n {}".format(
        indivi['statut'].value_counts(dropna=False)))

    log.info(u"    6.3 : variable txtppb")
    indivi.loc[indivi.txtppb.isnull(), 'txtppb'] = 0
    assert indivi.txtppb.notnull().all()
    indivi.loc[indivi.nbsala.isnull(), 'nbsala'] = 0
    indivi.nbsala = indivi.nbsala.astype('int')
    indivi.loc[indivi.nbsala == 99, 'nbsala'] = 10
    assert indivi.nbsala.isin(range(11)).all()
    log.info("Valeurs prises par la variable txtppb \n {}".format(
        indivi['txtppb'].value_counts(dropna=False)))

    log.info(u"    6.4 : variable chpub et CSP")
    indivi.loc[indivi.chpub.isnull(), 'chpub'] = 0
    indivi.chpub = indivi.chpub.astype('int')
    assert indivi.chpub.isin(range(11)).all()

    indivi['cadre'] = 0
    indivi.loc[indivi.prosa.isnull(), 'prosa'] = 0
    assert indivi.prosa.notnull().all()
    log.info("Valeurs prises par la variable encadr \n {}".format(
        indivi['encadr'].value_counts(dropna=False)))

    # encadr : 1=oui, 2=non
    indivi.loc[indivi.encadr.isnull(), 'encadr'] = 2
    indivi.loc[indivi.encadr == 0, 'encadr'] = 2
    assert indivi.encadr.notnull().all()
    assert indivi.encadr.isin([1, 2]).all()

    indivi.loc[indivi.prosa.isin([7, 8]), 'cadre'] = 1
    indivi.loc[(indivi.prosa == 9) & (indivi.encadr == 1), 'cadre'] = 1
    assert indivi.cadre.isin(range(2)).all()

    log.info(
        u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence"
    )
    log.info(u"nb de doublons idfoy/quifoy {}".format(
        len(indivi[indivi.duplicated(subset=['idfoy', 'quifoy'])])))

    log.info(u"On crée les n° de personnes à charge dans le foyer fiscal")
    assert indivi.idfoy.notnull().all()
    print_id(indivi)
    indivi['quifoy_bis'] = 2
    indivi.loc[indivi.quifoy == 'vous', 'quifoy_bis'] = 0
    indivi.loc[indivi.quifoy == 'conj', 'quifoy_bis'] = 1
    indivi.loc[indivi.quifoy == 'pac', 'quifoy_bis'] = 2
    del indivi['quifoy']
    indivi['quifoy'] = indivi.quifoy_bis.copy()
    del indivi['quifoy_bis']

    print_id(indivi)
    pac = indivi.loc[indivi['quifoy'] == 2,
                     ['quifoy', 'idfoy', 'noindiv']].copy()
    print_id(pac)

    j = 2
    while pac.duplicated(['quifoy', 'idfoy']).any():
        pac.loc[pac.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(pac)
    indivi = indivi.merge(pac, on=['noindiv', 'idfoy'], how="left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'],
                             indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)
    del pac
    assert len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]) == 0, \
        u"Il y a {} doublons idfoy/quifoy".format(
            len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])
            )
    print_id(indivi)

    log.info(u"Etape 8 : création des fichiers totaux")
    famille = temporary_store['famc_{}'.format(year)]

    log.info(u"    8.1 : création de tot2 & tot3")
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
    # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug=True, verbose=True)
    assert tot2.quifam.notnull().all()

    temporary_store['tot2_{}'.format(year)] = tot2
    del indivi
    log.info(u"    tot2 saved")
    tot2 = tot2[tot2.idmen.notnull()].copy()

    print_id(tot2)
    tot3 = tot2
    # TODO: check where they come from
    log.info("Avant élimination des doublons noindiv: {}".format(len(tot3)))

    tot3 = tot3.drop_duplicates(subset='noindiv')
    log.info("Après élimination des doublons noindiv: {}".format(len(tot3)))

    # Block to remove any unwanted duplicated pair

    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(subset=['idfoy', 'quifoy'])
    log.info("Après élimination des doublons idfoy, quifoy: {}".format(
        len(tot3)))
    tot3 = tot3.drop_duplicates(subset=['idfam', 'quifam'])
    log.info("Après élimination des doublons idfam, 'quifam: {}".format(
        len(tot3)))
    tot3 = tot3.drop_duplicates(subset=['idmen', 'quimen'])
    log.info("Après élimination des doublons idmen, quimen: {}".format(
        len(tot3)))
    tot3 = tot3.drop_duplicates(subset=['noindiv'])
    control(tot3)

    log.info(u"    8.2 : On ajoute les variables individualisables")

    allvars = temporary_store['ind_vars_to_remove_{}'.format(year)]
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    log.info("{}".format(len(tot3)))

    assert not (tot3.duplicated(
        subset=['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(
        lg_dup)

    temporary_store['tot3_{}'.format(year)] = tot3
    control(tot3)

    del tot2, allvars, tot3, vars2
    gc.collect()
    log.info(u"tot3 sauvegardé")
def invalide(temporary_store=None, year=None):

    assert temporary_store is not None
    assert year is not None

    log.info(u"Entering 07_invalides: construction de la variable invalide")
    # # # Invalides
    # # #invalide = caseP (vous), caseF (conj) ou case G, caseI, ou caseR (pac)

    # # loadTmp("final.Rdata")
    # # invalides <- final[,c("noindiv","idmen","caseP","caseF","idfoy","quifoy")]
    # # invalides <- within(invalides,{
    # #   caseP <- ifelse(is.na(caseP),0,caseP)
    # #   caseF <- ifelse(is.na(caseF),0,caseF)
    # #   invalide <- FALSE})
    # # # Les "vous" invalides
    # # table(invalides[,c("caseF","quifoy")],useNA="ifany")
    # # invalides[(invalides$caseP==1) & (invalides$quifoy=="vous"),"invalide"] <- TRUE
    # #

    log.info(u"Etape 1 : création de la df invalides")
    log.info(u"    1.1 : déclarants invalides")
    final = temporary_store['final_{}'.format(year)]
    if "invalide" in final:
        # on drop les colones inv et alt au cas ou on aurait déjà lancé le step07
        final.drop(["invalide", "alt"], axis=1, inplace=True)

    invalides_vars = [
        "caseF",
        "caseP",
        "idfoy",
        "idmen",
        "noindiv",
        "quifoy",
    ]

    aah_eec_variables = ["rc1rev", "maahe"]
    aah_eec = False

    if set(aah_eec_variables) < set(final.columns):
        aah_eec = True
        invalides_vars += aah_eec_variables

    assert set(invalides_vars) < set(final.columns), \
        "Variables {} are missing".format(set(invalides_vars).difference(set(final.columns)))

    invalides = final.xs(invalides_vars, axis=1).copy()

    invalides['invalide'] = False
    invalides['alt'] = False
    for var in ["caseP", "caseF"]:
        assert invalides[var].notnull().all(), 'NaN values in {}'.format(var)

    # Les déclarants invalides
    invalides.loc[(invalides['caseP'] == 1) & (invalides['quifoy'] == 0),
                  'invalide'] = True
    log.info(u"Il y a {} invalides déclarants".format(
        invalides["invalide"].sum()))

    # Les personnes qui touchent l'aah dans l'enquête emploi
    if aah_eec:
        log.info(u"Inspecting rc1rev")
        log.info(invalides['rc1rev'].value_counts())
        invalides.loc[invalides.maahe > 0, 'invalide'] = True
        invalides.loc[invalides.rc1rev == 4,
                      'invalide'] = True  # TODO: vérifier le format.
        #  TODO:      invalides.rc1rev.astype("str") voir mai mahdi pour pendre en compte 14 24 etc

        log.info(u"Il y a {} invalides qui touchent des alloc".format(
            invalides["invalide"].sum()))

    print_id(invalides)

    # # # Les conjoints invalides
    # #
    # # #men_inv_conj <- invalides[c("idmen","caseF","quifoy")]
    # # #men_inv_conj <- rename(men_inv_conj, c("caseF"="invalide"))
    # # #table(men_inv_conj[men_inv_conj$inv==1 ,c("invalide","quifoy")],useNA="ifany")
    # # # Il y a des caseF suir des conjoints cela vint des doubles d?clarations TODO: shoumd clean this
    # # #toto <- invalides[invalides$caseF==1 & invalides$quifoy=="conj","idmen"]
    # # #load(indm)
    # # #titi <- indivim[(indivim$ident %in% toto) & (indivim$persfip=="vous" |indivim$persfip=="conj") ,c("ident","noindiv","declar1","declar2","persfip","quelfic")]
    # # #titi <- titi[order(titi$ident),]
    # # foy_inv_conj <- invalides[,c("idfoy","caseF","quifoy")]
    # # foy_inv_conj <- rename(foy_inv_conj, c("caseF"="invalide"))
    # # table(foy_inv_conj[ ,c("invalide","quifoy")],useNA="ifany")
    # # # On ne garde donc que les caseF des "vous"

    # # foy_inv_conj   <- foy_inv_conj[foy_inv_conj$quifoy=="vous",c("idfoy","invalide")]
    # # table(foy_inv_conj[ ,c("invalide")],useNA="ifany")
    # # invalides_conj <- invalides[invalides$quifoy=="conj",c("idfoy","noindiv")]
    # # invalides_conj <- merge(invalides_conj, foy_inv_conj, by="idfoy", all.x=TRUE)
    # # table(invalides_conj$inv) # TODO en 2006 On en a 316 au lieu de 328 il doit y avoir de idfoy avec caseF qui n'ont pas de vous because double déclaration'
    # # invalides[invalides$quifoy=="conj",c("idfoy","noindiv","invalide")] <- invalides_conj
    # # table(invalides[,c("invalide","quifoy")],useNA="ifany")
    # # rm(invalides_conj,foy_inv_conj)

    # On récupère les idfoy des foyers avec une caseF cochée
    log.info('    1.2 : Les conjoints invalides')
    idfoy_inv_conj = final.idfoy[final.caseF].copy()
    inv_conj_condition = (invalides.idfoy.isin(idfoy_inv_conj) &
                          (invalides.quifoy == 1))
    invalides.loc[inv_conj_condition, "invalide"] = True

    log.info(u"Il y a {} invalides conjoints".format(
        len(invalides[inv_conj_condition])))
    log.info(u" Il y a {} invalides déclarants et invalides conjoints".format(
        invalides["invalide"].sum()))

    # Enfants invalides et garde alternée
    # #
    # # loadTmp("pacIndiv.Rdata")
    # # foy_inv_pac <- invalides[!(invalides$quifoy %in% c("vous","conj")),c("invalide","noindiv")]
    # # foy_inv_pac <- merge(foy_inv_pac, pacIndiv[,c("noindiv","typ","naia")], by="noindiv",all.x =TRUE)
    # # names(foy_inv_pac)
    # # table(foy_inv_pac[,c("typ","naia")],useNA="ifany")
    # # table(foy_inv_pac[,c("typ")],useNA="ifany")
    # # foy_inv_pac <- within(foy_inv_pac,{
    # #   invalide  <- (typ=="G") | (typ=="R") | (typ=="I") | (typ=="F" & (as.numeric(year)-naia>18))
    # #   alt  <- (typ=="H") | (typ=="I")
    # #   naia <- NULL
    # #   typ  <- NULL})
    # #
    # # table(foy_inv_pac[ ,c("invalide")],useNA="ifany")
    # # table(foy_inv_pac[ ,c("alt")],useNA="ifany")
    # # invalides$alt <- 0
    # # foy_inv_pac[is.na(foy_inv_pac$alt),"alt"] <- 0
    # # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","invalide","alt")] <- foy_inv_pac
    log.info(
        u"    1.3 : enfants invalides et en garde alternée (variables inv et alt)"
    )
    pacIndiv = temporary_store['pacIndiv_{}'.format(year)]
    # print pacIndiv.type_pac.value_counts()
    log.info(pacIndiv.type_pac.value_counts())

    foy_inv_pac = invalides[['noindiv', 'invalide'
                             ]][~(invalides.quifoy.isin([0, 1]))].copy()
    # pac = pacIndiv.ix[:, ["noindiv", "type_pac", "naia"]]
    log.info("{}".format(len(foy_inv_pac)))

    log.info("{}".format(pacIndiv.columns))
    foy_inv_pac = foy_inv_pac.merge(
        pacIndiv[['noindiv', 'type_pac', 'naia']].copy(),
        on='noindiv',
        how='left',
    )
    foy_inv_pac['invalide'] = (foy_inv_pac['type_pac'].isin(['G', 'R', 'I']) |
                               ((foy_inv_pac.type_pac == "F") &
                                ((year - foy_inv_pac.naia) > 18)))
    foy_inv_pac['alt'] = ((foy_inv_pac.type_pac == "H") |
                          (foy_inv_pac.type_pac == "I"))
    del foy_inv_pac['naia']
    del foy_inv_pac['type_pac']
    foy_inv_pac['alt'] = foy_inv_pac['alt'].fillna(False)

    log.info("{}".format(foy_inv_pac['invalide'].describe()))
    invalides.loc[:, 'alt'] = False
    invalides.loc[~(invalides.quifoy.isin([0, 1])),
                  ["alt", "invalide"]] = foy_inv_pac[["alt", "invalide"
                                                      ]].copy().values
    invalides = invalides[["noindiv", "invalide", 'alt']].copy()
    invalides['alt'].fillna(False, inplace=True)

    log.info(invalides.invalide.value_counts())
    # invalides = invalides.drop_duplicates(['noindiv', 'invalide', 'alt'], take_last = True)
    del foy_inv_pac, pacIndiv

    # Initialisation des NA sur alt et inv
    # invalides[is.na(invalides$inv), "invalide"] <- 0
    # table(invalides[,c("alt","invalide")],useNA="ifany")
    #
    # final <- merge(final, invalides[,c("noindiv","invalide","alt")], by="noindiv",all.x=TRUE)
    # table(final[, c("invalide","alt")],useNA="ifany")

    log.info('Etape 2 : Initialisation des NA sur alt et inv')
    assert invalides.invalide.notnull().all() & invalides.alt.notnull().all()
    final.set_index('noindiv', inplace=True, verify_integrity=True)
    invalides.set_index('noindiv', inplace=True, verify_integrity=True)
    final = final.join(invalides)
    final.reset_index(inplace=True)
    del invalides

    log.info("{}".format(final.invalide.value_counts()))
    control(final, debug=True)

    temporary_store['final_{}'.format(year)] = final
    log.info(u'final complétée et sauvegardée')
Ejemplo n.º 6
0
def create_totals(year = None):

    assert year is not None
    temporary_store = TemporaryStore.create(file_name = "erfs")
    replace = create_replace(year)

    # On part de la table individu de l'ERFS
    # on renomme les variables

    log.info(u"Creating Totals")
    log.info(u"Etape 1 : Chargement des données")

    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    indivim = temporary_store['indivim_{}'.format(year)]

    assert not indivim.duplicated(['noindiv']).any(), "Présence de doublons"

    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.

    selection = Series()
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)

    indivi_i = indivim[selection].copy()
    indivi_i.rename(
        columns = {
            "ident": "idmen",
            "persfip": "quifoy",
            "zsali": "sali2",  # Inclu les salaires non imposables des agents d'assurance
            "zchoi": "choi2",
            "zrsti": "rsti2",
            "zalri": "alr2"
            },
        inplace = True,
        )

    indivi_i.quifoy = where(indivi_i.quifoy.isnull(), "vous", indivi_i.quifoy)
    indivi_i.quelfic = "FIP_IMP"

    # We merge them with the other individuals
    indivim.rename(
        columns = dict(
            ident = "idmen",
            persfip = "quifoy",
            zsali = "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi = "choi2",
            zrsti = "rsti2",
            zalri = "alr2",
            ),
        inplace = True,
        )

    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace = True)
    indivi_i.set_index("noindiv", inplace = True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index(inplace = True)

    log.info("Etape 2 : isolation des FIP")
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (
        indivi.idmen.astype("int64") * 100 +
        (indivi.declar1.str[0:2]).convert_objects(convert_numeric=True)
        )

    indivi.loc[fip_imp, "idfoy"] = np.nan
    # Certains FIP (ou du moins avec revenus imputés) ont un numéro de déclaration d'impôt ( pourquoi ?)
    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

    indivi["idfoy"] = where(
        fip_has_declar,
        indivi.idmen * 100 + indivi.declar1.str[0:2].convert_objects(convert_numeric = True),
        indivi.idfoy)
    del fip_has_declar

    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"])

    indivi_fnd = indivi[["idfoy", "noindiv"]][fip_no_declar].copy()

    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(
            indivi_fnd.duplicated(cols=["idfoy"]),
            indivi_fnd["idfoy"] + 1,
            indivi_fnd["idfoy"]
            )

    # assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(indivi_fnd["idfoy"].values), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    indivi.idfoy[fip_no_declar] = indivi_fnd.idfoy.copy()
    del indivi_fnd, fip_no_declar

    log.info(u"Etape 3 : Récupération des EE_NRT")

    nrt = indivi.quelfic == "EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.quifoy[nrt] = "vous"
    del nrt

    pref_or_cref = indivi.lpr.isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    # TODO: hack to avoid assert error
    log.info("{}".format(indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().value_counts()))
    assert indivi.idfoy[indivi.lpr.dropna().isin([1, 2])].all()

    log.info(u"Etape 4 : Rattachement des enfants aux déclarations")

    assert not(indivi.noindiv.duplicated().any()), "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.noindiv[enf_ee].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert not(indivi.noindiv[enf_ee].duplicated().any()), "Some noindiv appear twice"

    pere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]
        })
    mere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]
        })

    foyer = data.get_values(variables = ["noindiv", "zimpof"], table = replace["foyer"])
    pere = pere.merge(foyer, how = "inner", on = "noindiv")
    mere = mere.merge(foyer, how = "inner", on = "noindiv")
    df = pere.merge(mere, how = "outer", on = "noindiv_enf", suffixes=('_p', '_m'))

    log.info(u"    4.1 : gestion des personnes dans 2 foyers")
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        df[col] = df[col].fillna(0, inplace = True)  # beacause groupby drop groups with NA in index
    df = df.groupby(by = ["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace = True)

    df["which"] = ""
    df.which = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "")
    df.which = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df.which = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere")
    df.which = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")

    assert df.which.notnull().all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns = {"noindiv_enf": "noindiv"}, inplace = True)
    df['idfoy'] = where(df.which == "pere", df.noindiv_p, df.noindiv_m)
    df['idfoy'] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    dropped = [col for col in df.columns if col not in ["idfoy", "noindiv"]]
    df.drop(dropped, axis = 1, inplace = True)

    assert not(df.duplicated().any())

    df.set_index("noindiv", inplace = True, verify_integrity = True)
    indivi.set_index("noindiv", inplace = True, verify_integrity = True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (
        indivi["idfoy"].notnull().sum() +
        indivi["idfoy"].isnull().sum()
        )
    indivi.reset_index(inplace = True)
    assert not(indivi.duplicated().any())

    # MBJ: issue delt with when moving from R code to python
    # TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs
    # On suit guide méthodo erf 2003 page 135
    # On supprime les conjoints FIP et les FIP de 25 ans et plus;
    # On conserve les enfants FIP de 19 à 24 ans;
    # On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un
    # ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète
    # EEC que l'on ne retrouvera pas dans les EEC suivantes.
    #
    log.info(u"    4.2 : On enlève les individus pour lesquels il manque le déclarant")
    fip = temporary_store['fipDat_{}'.format(year)]
    fip["declar"] = np.nan
    fip["agepf"] = np.nan

    fip.drop(["actrec", "year", "noidec"], axis = 1, inplace = True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(
        columns = dict(
            ident = "idmen",
            persfip = "quifoy",
            zsali = "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi = "choi2",
            zrsti = "rsti2",
            zalri = "alr2"),
        inplace = True)

    is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25)

    # TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
    # The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
    # fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
    # fip[is_fip_19_25 ,"lpr"]  <- 1
    #
    # indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['age_en_mois'] = 12 * indivi.age + 12 - indivi.naim

    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = None  # Create a new row
    indivi.not_pr_cpr[indivi.lpr <= 2] = False
    indivi.not_pr_cpr[indivi.lpr > 2] = True

    assert indivi.not_pr_cpr.isin([True, False]).all()

    log.info(u"    4.3 : Creating non pr=0 and cpr=1 idmen's")
    indivi.reset_index(inplace = True)

    test1 = indivi[['quimen', 'idmen']][indivi.not_pr_cpr].copy()
    test1['quimen'] = 2

    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1
    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

    # indivi.set_index(['quimen']) #TODO: check relevance
    # TODO problème avec certains idfoy qui n'ont pas de vous
    log.info(u"Etape 5 : Gestion des idfoy qui n'ont pas de vous")
    all_ind = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi.quifoy == 'vous', 'idfoy']
    without = all_ind[~(all_ind.idfoy.isin(with_.values))]

    log.info(u"On cherche si le déclarant donné par la deuxième déclaration est bien un vous")

    # TODO: the following should be delt with at the import of the tables
    indivi.replace(
        to_replace = {
            'declar2': {'NA': np.nan, '': np.nan}
            },
        inplace = True
        )

    has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull())

    decl2_idfoy = (
        indivi.loc[has_declar2, "idmen"].astype('int') * 100 +
        indivi.loc[has_declar2, "declar2"].str[0:2].astype('int')        )
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del all_ind, with_, without, has_declar2

    log.info(u"    5.1 : Elimination idfoy restant")
    idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    # Sélectionne les variables à garder pour les steps suivants
    myvars = [
        "actrec",
        "age",
        "age_en_mois",
        "chpub",
        "encadr",
        "idfoy",
        "idmen",
        "nbsala",
        "noi",
        "noindiv",
        "prosa",
        "quelfic",
        "quifoy",
        "quimen",
        "statut",
        "titc",
        "txtppb",
        "wprm",
        "rc1rev",
        "maahe",
        ]

    assert len(set(myvars).difference(set(indivi.columns))) == 0, \
        "Manquent les colonnes suivantes : {}".format(set(myvars).difference(set(indivi.columns)))

    indivi = indivi[myvars].copy()
    # TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
    # les infos provenant des déclarations)
    log.info(u"Etape 6 : Création des variables descriptives")
    log.info(u"    6.1 : variable activité")
    indivi['activite'] = None
    indivi['activite'][indivi.actrec <= 3] = 0
    indivi['activite'][indivi.actrec == 4] = 1
    indivi['activite'][indivi.actrec == 5] = 2
    indivi['activite'][indivi.actrec == 7] = 3
    indivi['activite'][indivi.actrec == 8] = 4
    indivi['activite'][indivi.age <= 13] = 2  # ce sont en fait les actrec=9
    log.info("{}".format(indivi['activite'].value_counts(dropna = False)))
    # TODO: MBJ problem avec les actrec
    # TODO: FIX AND REMOVE
    indivi.activite[indivi.actrec.isnull()] = 5
    indivi.titc[indivi.titc.isnull()] = 0
    assert indivi.titc.notnull().all(), u"Problème avec les titc" # On a 420 NaN pour les varaibels statut, titc etc

    log.info(u"    6.2 : variable statut")
    indivi.statut[indivi.statut.isnull()] = 0
    indivi.statut = indivi.statut.astype('int')
    indivi.statut[indivi.statut == 11] = 1
    indivi.statut[indivi.statut == 12] = 2
    indivi.statut[indivi.statut == 13] = 3
    indivi.statut[indivi.statut == 21] = 4
    indivi.statut[indivi.statut == 22] = 5
    indivi.statut[indivi.statut == 33] = 6
    indivi.statut[indivi.statut == 34] = 7
    indivi.statut[indivi.statut == 35] = 8
    indivi.statut[indivi.statut == 43] = 9
    indivi.statut[indivi.statut == 44] = 10
    indivi.statut[indivi.statut == 45] = 11
    assert indivi.statut.isin(range(12)).all(), u"statut value over range"


    log.info(u"    6.3 : variable txtppb")
    indivi.txtppb.fillna(0, inplace = True)
    assert indivi.txtppb.notnull().all()

    indivi.nbsala.fillna(0, inplace = True)
    indivi['nbsala'] = indivi.nbsala.astype('int')
    indivi.nbsala[indivi.nbsala == 99] = 10
    assert indivi.nbsala.isin(range(11)).all()

    log.info(u"    6.4 : variable chpub et CSP")
    indivi.chpub.fillna(0, inplace = True)
    indivi.chpub = indivi.chpub.astype('int')
    indivi.chpub[indivi.chpub.isnull()] = 0
    assert indivi.chpub.isin(range(11)).all()

    indivi['cadre'] = 0
    indivi.prosa.fillna(0, inplace = True)
    assert indivi['prosa'].notnull().all()
    log.info("{}".format(indivi['encadr'].value_counts(dropna = False)))

    # encadr : 1=oui, 2=non
    indivi.encadr.fillna(2, inplace = True)
    indivi.encadr[indivi.encadr == 0] = 2

    assert indivi.encadr.notnull().all()
    assert indivi.encadr.isin([1, 2]).all()

    indivi['cadre'][indivi.prosa.isin([7, 8])] = 1
    indivi['cadre'][(indivi.prosa == 9) & (indivi.encadr == 1)] = 1

    assert indivi['cadre'].isin(range(2)).all()

    log.info(
        u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence")
    log.info(
        u"nb de doublons idfam/quifam {}".format(len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])))

    log.info(u"On crée les n° de personnes à charge")
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi.quifoy2[indivi.quifoy == 'vous'] = 0
    indivi.quifoy2[indivi.quifoy == 'conj'] = 1
    indivi.quifoy2[indivi.quifoy == 'pac'] = 2

    del indivi['quifoy']
    indivi['quifoy'] = indivi.quifoy2
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi[['quifoy', 'idfoy', 'noindiv']][indivi['quifoy'] == 2].copy()
    print_id(test2)

    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on = ['noindiv', 'idfoy'], how = "left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    log.info(
        u"nb de doublons idfam/quifam' {}".format(
            len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])
            )
        )
    print_id(indivi)

    log.info(u"Etape 8 : création des fichiers totaux")
    famille = temporary_store['famc_{}'.format(year)]

    log.info(u"    8.1 : création de tot2 & tot3")
    tot2 = indivi.merge(famille, on = 'noindiv', how = 'inner')
#     del famille # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug = True, verbose = True)
    assert tot2.quifam.notnull().all()

    temporary_store['tot2_{}'.format(year)] = tot2
    del indivi
    log.info(u"    tot2 saved")

    tot2.merge(foyer, how = 'left')

    tot2 = tot2[tot2.idmen.notnull()].copy()

    print_id(tot2)
    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(subset = 'noindiv')
    log.info("{}".format(len(tot3)))

    # Block to remove any unwanted duplicated pair
    control(tot3, debug = True, verbose = True)
    tot3 = tot3.drop_duplicates(subset = ['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(subset = ['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(subset = ['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(subset = ['noindiv'])
    control(tot3)

    log.info(u"    8.2 : On ajoute les variables individualisables")

    allvars = temporary_store['ind_vars_to_remove_{}'.format(year)]
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    log.info("{}".format(len(tot3)))

    assert not(tot3.duplicated(subset = ['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(lg_dup)

    temporary_store['tot3_{}'.format(year)] = tot3
    control(tot3)

    del tot2, allvars, tot3, vars2
    log.info(u"tot3 sauvegardé")
    gc.collect()
Ejemplo n.º 7
0
def final(year=None, filename="test", check=True):

    assert year is not None
    temporary_store = TemporaryStore.create(file_name="erfs")

    ##***********************************************************************/
    log.info(u'08_final: derniers réglages')
    ##***********************************************************************/
    #
    # loadTmp("final.Rdata")
    # # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration
    # final$statmarit[is.na(final$statmarit)] <- 2
    # table(final$statmarit, useNA='ifany')
    #
    import gc
    gc.collect()
    final = temporary_store['final_{}'.format(year)]
    log.info('check doublons'.format(len(final[final.duplicated(['noindiv'
                                                                 ])])))
    final.statmarit = where(final.statmarit.isnull(), 2, final.statmarit)
    #

    # # activite des fip
    # table(final[final$quelfic=="FIP","activite"],useNA="ifany")
    # summary(final[final$quelfic=="FIP",c("activite","choi","sali","alr","rsti","age")] )
    # # activite      # actif occup? 0, ch?meur 1, ?tudiant/?l?ve 2, retrait? 3, autre inactif 4
    #
    # final_fip <- final[final$quelfic=="FIP",]
    # final_fip <- within(final_fip,{
    #   choi <- ifelse(is.na(choi),0,choi)
    #   sali <- ifelse(is.na(sali),0,sali)
    #   alr <- ifelse(is.na(alr),0,alr)
    #   rsti <- ifelse(is.na(rsti),0,rsti)
    #   activite <- 2 # TODO comment choisr la valeur par d?faut ?
    #   activite <- ifelse(choi > 0,1,activite)
    #   activite <- ifelse(sali > 0,0,activite)
    #   activite <- ifelse(age  >= 21, 2,activite) # ne peuvent être rattachés que les étudiants
    # })
    # final[final$quelfic=="FIP",]<- final_fip
    # table(final_fip[,c("age","activite")])
    # rm(final_fip)
    #
    # print_id(final)
    # saveTmp(final, file= "final.Rdata")
    #
    log.info('    gestion des FIP de final')
    final_fip = final[["choi", "sali", "alr", "rsti",
                       "age"]][final.quelfic == "FIP"].copy()

    log.info(
        set(["choi", "sali", "alr",
             "rsti"]).difference(set(final_fip.columns)))
    for var in ["choi", "sali", "alr", "rsti"]:
        final_fip[var].fillna(0, inplace=True)
        assert final_fip[var].notnull().all(
        ), "Some NaN are remaining in column {}".format(var)

    final_fip["activite"] = 2  # TODO comment choisr la valeur par défaut ?
    final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite)
    final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite)
    final_fip.activite = where(
        final_fip.age > 21, 2,
        final_fip.activite)  # ne peuvent être rattachés que les étudiants

    final.update(final_fip)
    temporary_store['final_{}'.format(year)] = final
    log.info("final has been updated with fip")

    menage_en_mois = temporary_store['menage_en_mois_{}'.format(year)]
    menage_en_mois.rename(columns=dict(ident="idmen", loym="loyer"),
                          inplace=True)
    menage_en_mois["cstotpragr"] = np.floor(menage_en_mois["cstotpr"] / 10)

    # 2008 tau99 removed TODO: check ! and check incidence
    vars = [
        "champm",
        "cstotpragr",
        "ddipl",
        "idmen",
        "loyer",
        "nbinde",
        "pol99",
        "reg",
        "so",
        "tau99",
        "tu99",
        "typmen15",
        "wprm",
        "zthabm",
    ]
    if year == 2008:
        vars.remove("tau99")
    famille_vars = [
        "m_afeamam", "m_agedm", "m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm'
    ]
    # if ("naf16pr" %in% names(menage_en_mois)) {
    #   naf16pr <- factor(menage_en_mois$naf16pr)
    #   levels(naf16pr) <-  0:16
    #   menage_en_mois$naf16pr <- as.character(naf16pr)
    #   menage_en_mois[is.na(menage_en_mois$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
    #   vars <- c(vars,"naf16pr")
    # } else if ("nafg17npr" %in% names(menage_en_mois)) {
    #   # TODO: pb in 2008 with xx
    #   if (year == "2008"){
    #     menage_en_mois[ menage_en_mois$nafg17npr == "xx" & !is.na(menage_en_mois$nafg17npr), "nafg17npr"] <- "00"
    #   }
    #   nafg17npr <- factor(menage_en_mois$nafg17npr)
    #   levels(nafg17npr) <-  0:17
    #   menage_en_mois$nafg17npr <- as.character(nafg17npr)
    #   menage_en_mois[is.na(menage_en_mois$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
    # }
    #
    #TODO: TODO: pytohn translation needed
    #    if "naf16pr" in menage_en_mois.columns:
    #        naf16pr <- factor(menage_en_mois$naf16pr)
    #   levels(naf16pr) <-  0:16
    #   menage_en_mois$naf16pr <- as.character(naf16pr)
    #   menage_en_mois[is.na(menage_en_mois$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
    #   vars <- c(vars,"naf16pr")
    # } else if ("nafg17npr" %in% names(menage_en_mois)) {
    #   # TODO: pb in 2008 with xx
    #   if (year == "2008"){
    #     menage_en_mois[ menage_en_mois$nafg17npr == "xx" & !is.na(menage_en_mois$nafg17npr), "nafg17npr"] <- "00"
    #   }
    #   nafg17npr <- factor(menage_en_mois$nafg17npr)
    #   levels(nafg17npr) <-  0:17
    #   menage_en_mois$nafg17npr <- as.character(nafg17npr)
    #   menage_en_mois[is.na(menage_en_mois$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
    # }
    # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed
    all_vars = vars + famille_vars

    log.info("liste de toutes les variables : {}".format(all_vars))
    log.info(menage_en_mois.info())
    available_vars = list(
        set(all_vars).intersection(set(menage_en_mois.columns)))
    loyersMenages = menage_en_mois.xs(available_vars, axis=1)
    #
    # # Recodage de typmen15: modalités de 1:15
    # table(loyersMenages$typmen15, useNA="ifany")
    # loyersMenages <- within(loyersMenages, {
    #   typmen15[typmen15==10 ] <- 1
    #   typmen15[typmen15==11 ] <- 2
    #   typmen15[typmen15==21 ] <- 3
    #   typmen15[typmen15==22 ] <- 4
    #   typmen15[typmen15==23 ] <- 5
    #   typmen15[typmen15==31 ] <- 6
    #   typmen15[typmen15==32 ] <- 7
    #   typmen15[typmen15==33 ] <- 8
    #   typmen15[typmen15==41 ] <- 9
    #   typmen15[typmen15==42 ] <- 10
    #   typmen15[typmen15==43 ] <- 11
    #   typmen15[typmen15==44 ] <- 12
    #   typmen15[typmen15==51 ] <- 13
    #   typmen15[typmen15==52 ] <- 14
    #   typmen15[typmen15==53 ] <- 15
    # })
    #
    #
    # TODO: MBJ UNNECESSARY ?
    #
    # # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3
    # # Cependant on fait cela après avoir fait les traitement suivants
    # table(loyersMenages$ddipl, useNA="ifany")
    # # On convertit les ddipl en numeric
    # loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl)
    # table(loyersMenages$ddipl, useNA="ifany")
    # #   On met les non renseignés ie, NA et "" à sans diplome (modalité 7)
    # loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7
    #
    # loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1
    #
    log.info("{}".format(loyersMenages.info()))
    loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7,
                                loyersMenages.ddipl)
    loyersMenages.ddipl = where(loyersMenages.ddipl > 1,
                                loyersMenages.ddipl - 1, loyersMenages.ddipl)
    loyersMenages.ddipl.astype("int32")

    final['act5'] = NaN
    final.act5 = where(final.actrec == 1, 2, final.act5)  # indépendants
    final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5)  # salariés

    final.act5 = where(final.actrec == 4, 3, final.act5)  # chômeur
    final.act5 = where(final.actrec == 7, 4, final.act5)  # retraité
    final.act5 = where(final.actrec == 8, 5, final.act5)  # autres inactifs

    final.act5 = where(final.actrec == 1, 2, final.act5)  # indépendants
    final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5)  # salariés

    final.act5 = where(final.actrec == 4, 3, final.act5)  # chômeur
    final.act5 = where(final.actrec == 7, 4, final.act5)  # retraité
    final.act5 = where(final.actrec == 8, 5, final.act5)  # autres inactifs

    log.info("{}".format(final.act5.value_counts()))

    #     assert final.act5.notnull().all(), 'there are NaN inside final.act5'
    # final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs
    # final$taxe_habitation <- final$zthabm # rename zthabm to taxe_habitation
    # final$zthabm <- NULL
    #
    # final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE)
    log.info('    création de final2')
    del final["wprm"]
    gc.collect()
    final.rename(columns=dict(zthabm="taxe_habitation"),
                 inplace=True)  # rename zthabm to taxe_habitation
    final2 = final.merge(loyersMenages, on="idmen", how="left")  # TODO: Check
    log.info("{}".format(loyersMenages.head()))
    gc.collect()
    print_id(final2)
    # # TODO: merging with patrimoine
    log.info('    traitement des zones apl')
    import pkg_resources
    openfisca_france_data_location = pkg_resources.get_distribution(
        'openfisca-france-data').location
    zone_apl_imputation_data_file_path = os.path.join(
        openfisca_france_data_location,
        'openfisca_france_data',
        'zone_apl_data',
        'zone_apl',
        'zone_apl_imputation_data.csv',
    )
    apl_imp = read_csv(zone_apl_imputation_data_file_path)

    log.info("{}".format(apl_imp.head(10)))
    if year == 2008:
        zone_apl = final2.xs(["tu99", "pol99", "reg"], axis=1)
    else:
        zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis=1)

    for i in range(len(apl_imp["TU99"])):
        tu = apl_imp["TU99"][i]
        pol = apl_imp["POL99"][i]
        tau = apl_imp["TAU99"][i]
        reg = apl_imp["REG"][i]

    if year == 2008:
        indices = (final2["tu99"] == tu) & (final2["pol99"]
                                            == pol) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (
            apl_imp["REG"] == reg)
    else:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (
            final2["tau99"] == tau) & (final2["reg"] == reg)
        selection = ((apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) &
                     (apl_imp["TAU99"] == tau) & (apl_imp["REG"] == reg))

    z = random.uniform(size=indices.sum())
    log.info(len(z))
    log.info(len(indices))
    log.info(len(indices) / len(z))
    probs = apl_imp[["proba_zone1", "proba_zone2"]][selection].copy()
    log.info(probs)
    log.info(probs['proba_zone1'].values)
    proba_zone_1 = probs['proba_zone1'].values[0]
    proba_zone_2 = probs['proba_zone2'].values[0]

    final2["zone_apl"] = 3
    final2["zone_apl"][indices] = (1 + (z > proba_zone_1) +
                                   (z > (proba_zone_1 + proba_zone_2)))
    del indices, probs

    log.info('    performing cleaning on final2')
    log.info('{} sali nuls'.format(len(final2[final2['sali'].isnull()])))
    log.info("{} individus d'âges nuls".format(len(
        final2[final2.age.isnull()])))
    log.info("longueur de final2 avant purge : {}".format(len(final2)))
    #     columns_w_nan = []
    #     for col in final2.columns:
    #         if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all():
    #             columns_w_nan.append(col)
    #     print columns_w_nan
    log.info('check doublons : {}'.format(
        len(final2[final2.duplicated(['noindiv'])])))
    log.info("{}".format(final2.age.isnull().sum()))

    #     print final2.loc[final2.duplicated('noindiv'), ['noindiv', 'quifam']].to_string()
    #TODO: JS: des chefs de famille et conjoints en double il faut trouver la source des ces doublons !
    #     final2 = final2.drop_duplicates(['noindiv'])

    final2 = final2[~(final2.age.isnull())]
    log.info(u"longueur de final2 après purge: {}".format(len(final2)))
    print_id(final2)

    #
    # # var <- names(foyer)
    # #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si',  'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da')
    # #a2 <- setdiff(a1,names(foyer))
    # #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc')
    # # hsup feuille d'impot
    # # boursier pas dispo
    # # inactif etc : extraire cela des donn?es clca etc
    #
    # # tester activit? car 0 vaut actif
    # table(is.na(final2$activite),useNA="ifany")
    #
    # saveTmp(final2, file= "final2.Rdata")

    control(final2, debug=True)
    log.info(final2.age.isnull().sum())
    final2 = final2.drop_duplicates(subset='noindiv')

    log.info('    Filter to manage the new 3-tables structures:')
    # On récupère les foyer, famille, ménages qui ont un chef :
    liste_men = unique(final2.loc[final2['quimen'] == 0, 'idmen'].values)
    liste_fam = unique(final2.loc[final2['quifam'] == 0, 'idfam'].values)
    liste_foy = unique(final2.loc[final2['quifoy'] == 0, 'idfoy'].values)

    #On ne conserve dans final2 que ces foyers là :
    log.info('final2 avant le filtrage {}'.format(len(final2)))
    final2 = final2.loc[final2.idmen.isin(liste_men), :]
    final2 = final2.loc[final2.idfam.isin(liste_fam), :]
    final2 = final2.loc[final2.idfoy.isin(liste_foy), :]
    log.info('final2 après le filtrage {}'.format(len(final2)))

    rectify_dtype(final2, verbose=False)
    #    home = os.path.expanduser("~")
    #    test_filename = os.path.join(home, filename + ".h5")
    #    if os.path.exists(test_filename):
    #        import warnings
    #        import datetime
    #        time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    #        renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5")
    #        warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file)
    #        test_filename = renamed_file
    data_frame = final2

    if year == 2006:  # Hack crade pur régler un problème rémanent
        data_frame = data_frame[data_frame.idfam != 602177906].copy()

    for id_variable in [
            'idfam', 'idfoy', 'idmen', 'noi', 'quifam', 'quifoy', 'quimen'
    ]:
        data_frame[id_variable] = data_frame[id_variable].astype('int')

    check = False
    if check:
        check_structure(data_frame)

    for entity_id in ['idmen', 'idfoy', 'idfam']:
        data_frame = id_formatter(data_frame, entity_id)

    set_variables_default_value(data_frame, year)
    return data_frame
def create_totals_second_pass(temporary_store = None, year = None):
    assert temporary_store is not None
    assert year is not None

    log.info(u"    5.1 : Elimination idfoy restant")
    # Voiture balai
    # On a plein d'idfoy vides, on fait 1 ménage = 1 foyer fiscal
    indivi = temporary_store['indivi_step_06_{}'.format(year)]
    idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].unique()
    indivi_without_idfoy = ~indivi.idfoy.isin(idfoyList)

    indivi.loc[indivi_without_idfoy, 'quifoy'] = "pac"

    indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) & (indivi.age >= 18), 'quifoy'] = "vous"
    indivi.loc[indivi_without_idfoy & (indivi.quimen == 0) & (indivi.age >= 18), 'idfoy'] = (
        indivi.loc[indivi_without_idfoy, "idmen"].astype('int') * 100 + 51
        )
    indivi.loc[indivi_without_idfoy & (indivi.quimen == 1) & (indivi.age >= 18), 'quifoy'] = "conj"

    del idfoyList
    print_id(indivi)

    # Sélectionne les variables à garder pour les steps suivants
    variables = [
        "actrec",
        "age",
        "age_en_mois",
        "chpub",
        "encadr",
        "idfoy",
        "idmen",
        "nbsala",
        "noi",
        "noindiv",
        "prosa",
        "quelfic",
        "quifoy",
        "quimen",
        "statut",
        "titc",
        "txtppb",
        "wprm",
        "rc1rev",
        "maahe",
        "sali",
        "rsti",
        "choi",
        "alr",
        "wprm",
        ]

    assert set(variables).issubset(set(indivi.columns)), \
        "Manquent les colonnes suivantes : {}".format(set(variables).difference(set(indivi.columns)))

    dropped_columns = [variable for variable in indivi.columns if variable not in variables]
    indivi.drop(dropped_columns, axis = 1, inplace = True)

    #  see http://stackoverflow.com/questions/11285613/selecting-columns
    indivi.reset_index(inplace = True)
    gc.collect()

    # TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
    # les infos provenant des déclarations)
    log.info(u"Etape 6 : Création des variables descriptives")
    log.info(u"    6.1 : Variable activité")
    log.info(u"Variables présentes; \n {}".format(indivi.columns))

    indivi['activite'] = np.nan
    indivi.loc[indivi.actrec <= 3, 'activite'] = 0
    indivi.loc[indivi.actrec == 4, 'activite'] = 1
    indivi.loc[indivi.actrec == 5, 'activite'] = 2
    indivi.loc[indivi.actrec == 7, 'activite'] = 3
    indivi.loc[indivi.actrec == 8, 'activite'] = 4
    indivi.loc[indivi.age <= 13, 'activite'] = 2  # ce sont en fait les actrec=9
    log.info("Valeurs prises par la variable activité \n {}".format(indivi['activite'].value_counts(dropna = False)))
    # TODO: MBJ problem avec les actrec
    # TODO: FIX AND REMOVE
    indivi.loc[indivi.actrec.isnull(), 'activite'] = 5
    indivi.loc[indivi.titc.isnull(), 'titc'] = 0
    assert indivi.titc.notnull().all(), \
        u"Problème avec les titc"  # On a 420 NaN pour les varaibels statut, titc etc

    log.info(u"    6.2 : Variable statut")
    indivi.loc[indivi.statut.isnull(), 'statut'] = 0
    indivi.statut = indivi.statut.astype('int')
    indivi.loc[indivi.statut == 11, 'statut'] = 1
    indivi.loc[indivi.statut == 12, 'statut'] = 2
    indivi.loc[indivi.statut == 13, 'statut'] = 3
    indivi.loc[indivi.statut == 21, 'statut'] = 4
    indivi.loc[indivi.statut == 22, 'statut'] = 5
    indivi.loc[indivi.statut == 33, 'statut'] = 6
    indivi.loc[indivi.statut == 34, 'statut'] = 7
    indivi.loc[indivi.statut == 35, 'statut'] = 8
    indivi.loc[indivi.statut == 43, 'statut'] = 9
    indivi.loc[indivi.statut == 44, 'statut'] = 10
    indivi.loc[indivi.statut == 45, 'statut'] = 11
    assert indivi.statut.isin(range(12)).all(), u"statut value over range"
    log.info("Valeurs prises par la variable statut \n {}".format(
        indivi['statut'].value_counts(dropna = False)))

    log.info(u"    6.3 : variable txtppb")
    indivi.loc[indivi.txtppb.isnull(), 'txtppb'] = 0
    assert indivi.txtppb.notnull().all()
    indivi.loc[indivi.nbsala.isnull(), 'nbsala'] = 0
    indivi.nbsala = indivi.nbsala.astype('int')
    indivi.loc[indivi.nbsala == 99, 'nbsala'] = 10
    assert indivi.nbsala.isin(range(11)).all()
    log.info("Valeurs prises par la variable txtppb \n {}".format(
        indivi['txtppb'].value_counts(dropna = False)))

    log.info(u"    6.4 : variable chpub et CSP")
    indivi.loc[indivi.chpub.isnull(), 'chpub'] = 0
    indivi.chpub = indivi.chpub.astype('int')
    assert indivi.chpub.isin(range(11)).all()

    indivi['cadre'] = 0
    indivi.loc[indivi.prosa.isnull(), 'prosa'] = 0
    assert indivi.prosa.notnull().all()
    log.info("Valeurs prises par la variable encadr \n {}".format(indivi['encadr'].value_counts(dropna = False)))

    # encadr : 1=oui, 2=non
    indivi.loc[indivi.encadr.isnull(), 'encadr'] = 2
    indivi.loc[indivi.encadr == 0, 'encadr'] = 2
    assert indivi.encadr.notnull().all()
    assert indivi.encadr.isin([1, 2]).all()

    indivi.loc[indivi.prosa.isin([7, 8]), 'cadre'] = 1
    indivi.loc[(indivi.prosa == 9) & (indivi.encadr == 1), 'cadre'] = 1
    assert indivi.cadre.isin(range(2)).all()

    log.info(
        u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence"
        )
    log.info(
        u"nb de doublons idfoy/quifoy {}".format(len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]))
        )

    log.info(u"On crée les n° de personnes à charge dans le foyer fiscal")
    assert indivi.idfoy.notnull().all()
    print_id(indivi)
    indivi['quifoy_bis'] = 2
    indivi.loc[indivi.quifoy == 'vous', 'quifoy_bis'] = 0
    indivi.loc[indivi.quifoy == 'conj', 'quifoy_bis'] = 1
    indivi.loc[indivi.quifoy == 'pac', 'quifoy_bis'] = 2
    del indivi['quifoy']
    indivi['quifoy'] = indivi.quifoy_bis.copy()
    del indivi['quifoy_bis']

    print_id(indivi)
    pac = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']].copy()
    print_id(pac)

    j = 2
    while pac.duplicated(['quifoy', 'idfoy']).any():
        pac.loc[pac.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(pac)
    indivi = indivi.merge(pac, on = ['noindiv', 'idfoy'], how = "left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)
    del pac
    assert len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])]) == 0, \
        u"Il y a {} doublons idfoy/quifoy".format(
            len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])
            )
    print_id(indivi)

    log.info(u"Etape 8 : création des fichiers totaux")
    famille = temporary_store['famc_{}'.format(year)]

    log.info(u"    8.1 : création de tot2 & tot3")
    tot2 = indivi.merge(famille, on = 'noindiv', how = 'inner')
    # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug = True, verbose = True)
    assert tot2.quifam.notnull().all()

    temporary_store['tot2_{}'.format(year)] = tot2
    del indivi
    log.info(u"    tot2 saved")
    tot2 = tot2[tot2.idmen.notnull()].copy()

    print_id(tot2)
    tot3 = tot2
    # TODO: check where they come from
    log.info("Avant élimination des doublons noindiv: {}".format(len(tot3)))

    tot3 = tot3.drop_duplicates(subset = 'noindiv')
    log.info("Après élimination des doublons noindiv: {}".format(len(tot3)))

    # Block to remove any unwanted duplicated pair

    control(tot3, debug = True, verbose = True)
    tot3 = tot3.drop_duplicates(subset = ['idfoy', 'quifoy'])
    log.info("Après élimination des doublons idfoy, quifoy: {}".format(len(tot3)))
    tot3 = tot3.drop_duplicates(subset = ['idfam', 'quifam'])
    log.info("Après élimination des doublons idfam, 'quifam: {}".format(len(tot3)))
    tot3 = tot3.drop_duplicates(subset = ['idmen', 'quimen'])
    log.info("Après élimination des doublons idmen, quimen: {}".format(len(tot3)))
    tot3 = tot3.drop_duplicates(subset = ['noindiv'])
    control(tot3)

    log.info(u"    8.2 : On ajoute les variables individualisables")

    allvars = temporary_store['ind_vars_to_remove_{}'.format(year)]
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    log.info("{}".format(len(tot3)))

    assert not(tot3.duplicated(subset = ['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(lg_dup)

    temporary_store['tot3_{}'.format(year)] = tot3
    control(tot3)

    del tot2, allvars, tot3, vars2
    gc.collect()
    log.info(u"tot3 sauvegardé")
Ejemplo n.º 9
0
def final(temporary_store = None, year = None, check = True):

    assert temporary_store is not None
    assert year is not None

    log.info(u'08_final: derniers réglages')

    # On définit comme célibataires les individus dont on n'a pas retrouvé la déclaration
    final = temporary_store['final_{}'.format(year)]

    if year == 2009:
        final = normalizes_roles_in_entity(final, 'foy')
        final = normalizes_roles_in_entity(final, 'men')

    if check:
        check_structure(final)

    final.statmarit.fillna(2, inplace = True)

    # activite des fip
    log.info('    gestion des FIP de final')
    final_fip = final.loc[final.quelfic.isin(["FIP", "FIP_IMP"]), ["choi", "sali", "alr", "rsti", "age"]].copy()

    log.info(set(["choi", "sali", "alr", "rsti"]).difference(set(final_fip.columns)))
    for var in ["choi", "sali", "alr", "rsti"]:
        final_fip[var].fillna(0, inplace = True)
        assert final_fip[var].notnull().all(), "Some NaN are remaining in column {}".format(var)

    final_fip["activite"] = 2  # TODO comment choisr la valeur par défaut ?
    final_fip.activite = where(final_fip.choi > 0, 1, final_fip.activite)
    final_fip.activite = where(final_fip.sali > 0, 0, final_fip.activite)
    final_fip.activite = where(final_fip.age > 21, 2, final_fip.activite)  # ne peuvent être rattachés que les étudiants

    final.update(final_fip)
    temporary_store['final_{}'.format(year)] = final
    log.info("final has been updated with fip")

    menagem = temporary_store['menagem_{}'.format(year)]
    assert 'ident' in menagem.columns
    assert 'so' in menagem.columns
    menagem.rename(
        columns = dict(ident = "idmen", so = "statut_occupation"),
        inplace = True
        )
    menagem["cstotpragr"] = np.floor(menagem["cstotpr"] / 10)

    # 2008 tau99 removed TODO: check ! and check incidence
    variables = [
        "champm",
        "cstotpragr",
        "ddipl",
        "idmen",
        "nbinde",
        "pol99",
        "reg",
        "statut_occupation",
        "tau99",
        "tu99",
        "typmen15",
        "wprm",
        "zthabm",
        ]
    if year == 2008:
        variables.remove("tau99")
    famille_variables = ["m_afeamam", "m_agedm", "m_clcam", "m_colcam", 'm_mgamm', 'm_mgdomm']

    if 'loyer' in menagem.columns:
        variables.append('loyer')
# if ("naf16pr" %in% names(menagem)) {
#   naf16pr <- factor(menagem$naf16pr)
#   levels(naf16pr) <-  0:16
#   menagem$naf16pr <- as.character(naf16pr)
#   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
#   variables <- c(variables,"naf16pr")
# } else if ("nafg17npr" %in% names(menagem)) {
#   # TODO: pb in 2008 with xx
#   if (year == "2008"){
#     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
#   }
#   nafg17npr <- factor(menagem$nafg17npr)
#   levels(nafg17npr) <-  0:17
#   menagem$nafg17npr <- as.character(nafg17npr)
#   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
# }
#
# TODO: pytohn translation needed
#    if "naf16pr" in menagem.columns:
#        naf16pr <- factor(menagem$naf16pr)
#   levels(naf16pr) <-  0:16
#   menagem$naf16pr <- as.character(naf16pr)
#   menagem[is.na(menagem$naf16pr), "naf16pr" ] <- "-1"  # Sans objet
#   variables <- c(variables,"naf16pr")
# } else if ("nafg17npr" %in% names(menagem)) {
#   # TODO: pb in 2008 with xx
#   if (year == "2008"){
#     menagem[ menagem$nafg17npr == "xx" & !is.na(menagem$nafg17npr), "nafg17npr"] <- "00"
#   }
#   nafg17npr <- factor(menagem$nafg17npr)
#   levels(nafg17npr) <-  0:17
#   menagem$nafg17npr <- as.character(nafg17npr)
#   menagem[is.na(menagem$nafg17npr), "nafg17npr" ] <- "-1"  # Sans objet
# }
    # TODO: 2008tau99 is not present should be provided by 02_loy.... is it really needed
    all_variables = variables + famille_variables

    log.info("liste de toutes les variables : {}".format(all_variables))
    log.info(menagem.info())
    available_variables = list(set(all_variables).intersection(set(menagem.columns)))
    log.info("liste des variables à extraire de menagem: {}".format(available_variables))
    loyersMenages = menagem[available_variables].copy()
#
# # Recodage de typmen15: modalités de 1:15
# table(loyersMenages$typmen15, useNA="ifany")
# loyersMenages <- within(loyersMenages, {
#   typmen15[typmen15==10 ] <- 1
#   typmen15[typmen15==11 ] <- 2
#   typmen15[typmen15==21 ] <- 3
#   typmen15[typmen15==22 ] <- 4
#   typmen15[typmen15==23 ] <- 5
#   typmen15[typmen15==31 ] <- 6
#   typmen15[typmen15==32 ] <- 7
#   typmen15[typmen15==33 ] <- 8
#   typmen15[typmen15==41 ] <- 9
#   typmen15[typmen15==42 ] <- 10
#   typmen15[typmen15==43 ] <- 11
#   typmen15[typmen15==44 ] <- 12
#   typmen15[typmen15==51 ] <- 13
#   typmen15[typmen15==52 ] <- 14
#   typmen15[typmen15==53 ] <- 15
# })
#
#
# TODO: MBJ UNNECESSARY ?
#
# # Pb avec ddipl, pas de modalités 2: on décale les chaps >=3
# # Cependant on fait cela après avoir fait les traitement suivants
# table(loyersMenages$ddipl, useNA="ifany")
# # On convertit les ddipl en numeric
# loyersMenages$ddipl <- as.numeric(loyersMenages$ddipl)
# table(loyersMenages$ddipl, useNA="ifany")
# #   On met les non renseignés ie, NA et "" à sans diplome (modalité 7)
# loyersMenages[is.na(loyersMenages$ddipl), "ddipl"] <- 7
#
# loyersMenages[loyersMenages$ddipl>1, "ddipl"] <- loyersMenages$ddipl[loyersMenages$ddipl>1]-1
#
    log.info("loyersMenages \n {}".format(loyersMenages.info()))
    loyersMenages.ddipl = where(loyersMenages.ddipl.isnull(), 7, loyersMenages.ddipl)
    loyersMenages.ddipl = where(loyersMenages.ddipl > 1,
                                loyersMenages.ddipl - 1,
                                loyersMenages.ddipl)
    loyersMenages.ddipl = loyersMenages.ddipl.astype("int32")

    final['act5'] = NaN
    final.act5 = where(final.actrec == 1, 2, final.act5)  # indépendants
    final.act5 = where(final.actrec.isin([2, 3]), 1, final.act5)  # salariés

    final.act5 = where(final.actrec == 4, 3, final.act5)  # chômeur
    final.act5 = where(final.actrec == 7, 4, final.act5)  # retraité
    final.act5 = where(final.actrec == 8, 5, final.act5)  # autres inactifs

    log.info("Valeurs prises par act5: \n {}".format(final.act5.value_counts()))

#     assert final.act5.notnull().all(), 'there are NaN inside final.act5'
# final$wprm <- NULL # with the intention to extract wprm from menage to deal with FIPs
# final$taxe_habitation <- final$zthabm # rename zthabm to taxe_habitation
# final$zthabm <- NULL
#
# final2 <- merge(final, loyersMenages, by="idmen", all.x=TRUE)
    log.info('    création de final2')
    del final["wprm"]
    gc.collect()
    final.rename(columns = dict(zthabm = "taxe_habitation"), inplace = True)  # rename zthabm to taxe_habitation
    final2 = final.merge(loyersMenages, on = "idmen", how = "left")  # TODO: Check
    log.info("{}".format(loyersMenages.head()))
    gc.collect()
    print_id(final2)
# # TODO: merging with patrimoine
    log.info('    traitement des zones apl')
    import pkg_resources
    openfisca_france_data_location = pkg_resources.get_distribution('openfisca-france-data').location
    zone_apl_imputation_data_file_path = os.path.join(
        openfisca_france_data_location,
        'openfisca_france_data',
        'zone_apl_data',
        'zone_apl',
        'zone_apl_imputation_data.csv',
        )
    apl_imp = read_csv(zone_apl_imputation_data_file_path)

    log.info("{}".format(apl_imp.head(10)))
    if year == 2008:
        zone_apl = final2.xs(["tu99", "pol99", "reg"], axis = 1)
    else:
        zone_apl = final2.xs(["tu99", "pol99", "tau99", "reg"], axis = 1)

    for i in range(len(apl_imp["TU99"])):
        tu = apl_imp["TU99"][i]
        pol = apl_imp["POL99"][i]
        tau = apl_imp["TAU99"][i]
        reg = apl_imp["REG"][i]

    if year == 2008:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["reg"] == reg)
        selection = (apl_imp["TU99"] == tu) & (apl_imp["POL99"] == pol) & (apl_imp["REG"] == reg)
    else:
        indices = (final2["tu99"] == tu) & (final2["pol99"] == pol) & (final2["tau99"] == tau) & (final2["reg"] == reg)
        selection = (
            (apl_imp["TU99"] == tu) &
            (apl_imp["POL99"] == pol) &
            (apl_imp["TAU99"] == tau) &
            (apl_imp["REG"] == reg)
            )

    z = random.uniform(size=indices.sum())
    log.info(len(z))
    log.info(len(indices))
    log.info(len(indices) / len(z))
    probs = apl_imp[["proba_zone1", "proba_zone2"]][selection].copy()
    log.info(probs)
    log.info(probs['proba_zone1'].values)
    proba_zone_1 = probs['proba_zone1'].values[0]
    proba_zone_2 = probs['proba_zone2'].values[0]

    final2["zone_apl"] = 3
    final2.zone_apl.loc[indices] = (
        1 + (z > proba_zone_1) + (z > (proba_zone_1 + proba_zone_2))
        )
    del indices, probs

    log.info('    performing cleaning on final2')
    log.info('{} sali nuls'.format(len(final2[final2['sali'].isnull()])))
    log.info("{} individus d'âges nuls".format(len(final2[final2.age.isnull()])))
    log.info("longueur de final2 avant purge : {}".format(len(final2)))
#     columns_w_nan = []
#     for col in final2.columns:
#         if final2[final2['idfoy'].notnull()][col].isnull().any() and not final2[col].isnull().all():
#             columns_w_nan.append(col)
#     print columns_w_nan
    log.info('check doublons : {}'.format(len(final2[final2.duplicated(['noindiv'])])))
    log.info("{}".format(final2.age.isnull().sum()))

    print_id(final2)

# # var <- names(foyer)
# #a1 <- c('f7rb', 'f7ra', 'f7gx', 'f2aa', 'f7gt', 'f2an', 'f2am', 'f7gw', 'f7gs', 'f8td', 'f7nz', 'f1br', 'f7jy', 'f7cu', 'f7xi', 'f7xo', 'f7xn', 'f7xw', 'f7xy', 'f6hj', 'f7qt', 'f7ql', 'f7qm', 'f7qd', 'f7qb', 'f7qc', 'f1ar', 'f7my', 'f3vv', 'f3vu', 'f3vt', 'f7gu', 'f3vd', 'f2al', 'f2bh', 'f7fm', 'f8uy', 'f7td', 'f7gv', 'f7is', 'f7iy', 'f7il', 'f7im', 'f7ij', 'f7ik', 'f1er', 'f7wl', 'f7wk', 'f7we', 'f6eh', 'f7la', 'f7uh', 'f7ly', 'f8wy', 'f8wx', 'f8wv', 'f7sb', 'f7sc', 'f7sd', 'f7se', 'f7sf', 'f7sh', 'f7si',  'f1dr', 'f7hs', 'f7hr', 'f7hy', 'f7hk', 'f7hj', 'f7hm', 'f7hl', 'f7ho', 'f7hn', 'f4gc', 'f4gb', 'f4ga', 'f4gg', 'f4gf', 'f4ge', 'f7vz', 'f7vy', 'f7vx', 'f7vw', 'f7xe', 'f6aa', 'f1cr', 'f7ka', 'f7ky', 'f7db', 'f7dq', 'f2da')
# #a2 <- setdiff(a1,names(foyer))
# #b1 <- c('pondfin', 'alt', 'hsup', 'ass_mat', 'zone_apl', 'inactif', 'ass', 'aer', 'code_postal', 'activite', 'type_sal', 'jour_xyz', 'boursier', 'etr', 'partiel1', 'partiel2', 'empl_dir', 'gar_dom', 'categ_inv', 'opt_colca', 'csg_taux_plein','coloc')
# # hsup feuille d'impot
# # boursier pas dispo
# # inactif etc : extraire cela des donn?es clca etc
#
# # tester activit? car 0 vaut actif
# table(is.na(final2$activite),useNA="ifany")
#
# saveTmp(final2, file= "final2.Rdata")

    control(final2, debug = True)
    temporary_store['final2'] = final2
    log.info("Nombre de personne d'âge NaN: {} ".format(final2.age.isnull().sum()))
    final2 = final2.drop_duplicates(subset = 'noindiv')

    log.info('    Filter to manage the new 3-tables structures:')
    # On récupère les foyer, famille, ménages qui ont un chef :
    # On ne conserve dans final2 que ces foyers là :
    log.info('final2 avant le filtrage {}'.format(len(final2)))
    print_id(final2)


    liste_fam = final2.loc[final2['quifam'] == 0, 'idfam'].unique()
    log.info("Dropping {} famille".format((~final2.idfam.isin(liste_fam)).sum()))
    final2 = final2.loc[final2.idfam.isin(liste_fam)].copy()

    if check:
        check_structure(final2)

    temporary_store['final2bis'] = final2

    liste_foy = final2.loc[final2['quifoy'] == 0, 'idfoy'].unique()
    log.info("Dropping {} foyers".format((~final2.idfoy.isin(liste_foy)).sum()))
    final2 = final2.loc[final2.idfoy.isin(liste_foy)].copy()


    liste_men = final2.loc[final2['quimen'] == 0, 'idmen'].unique()
    log.info(u"Dropping {} ménages".format((~final2.idmen.isin(liste_men)).sum()))
    final2 = final2.loc[final2.idmen.isin(liste_men)].copy()

    log.info('final2 après le filtrage {}'.format(len(final2)))
    print_id(final2)
    rectify_dtype(final2, verbose = False)
    #    home = os.path.expanduser("~")
    #    test_filename = os.path.join(home, filename + ".h5")
    #    if os.path.exists(test_filename):
    #        import warnings
    #        import datetime
    #        time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    #        renamed_file = os.path.join(DATA_SOURCES_DIR, filename + "_" + time_stamp + ".h5")
    #        warnings.warn("A file with the same name already exists \n Renaming current output and saving to " + renamed_file)
    #        test_filename = renamed_file
    data_frame = final2

    temporary_store['final3'] = data_frame

    assert not data_frame.duplicated(['idmen', 'quimen']).any(), 'bad ménages indexing'


    if year == 2006:  # Hack crade pur régler un problème rémanent
        data_frame = data_frame[data_frame.idfam != 602177906].copy()

    for id_variable in ['idfam', 'idfoy', 'idmen', 'noi', 'quifam', 'quifoy', 'quimen']:
        data_frame[id_variable] = data_frame[id_variable].astype('int')

    gc.collect()

    if check:
        check_structure(data_frame)
    temporary_store['final4'] = data_frame

    assert not data_frame.duplicated(['idmen', 'quimen']).any(), 'bad ménages indexing'

    gc.collect()
    for entity_id in ['idmen', 'idfoy', 'idfam']:
        log.info('Reformat ids: {}'.format(entity_id))
        data_frame = id_formatter(data_frame, entity_id)

    assert not data_frame.duplicated(['idmen', 'quimen']).any(), 'bad ménages indexing'

    log.info('Dealing with loyer')
    if 'loyer' in data_frame.columns:
        assert 'loyer' in data_frame.columns
        data_frame.loyer = data_frame.loyer * 12.0
    log.info('Set variables to their default values')
    set_variables_default_value(data_frame, year)
    print_id(data_frame)
    temporary_store['input_{}'.format(year)] = data_frame
    return data_frame
def invalide(temporary_store = None, year = None):

    assert temporary_store is not None
    assert year is not None

    log.info(u"Entering 07_invalides: construction de la variable invalide")
# # # Invalides
# # #invalide = caseP (vous), caseF (conj) ou case G, caseI, ou caseR (pac)

# # loadTmp("final.Rdata")
# # invalides <- final[,c("noindiv","idmen","caseP","caseF","idfoy","quifoy")]
# # invalides <- within(invalides,{
# #   caseP <- ifelse(is.na(caseP),0,caseP)
# #   caseF <- ifelse(is.na(caseF),0,caseF)
# #   invalide <- FALSE})
# # # Les "vous" invalides
# # table(invalides[,c("caseF","quifoy")],useNA="ifany")
# # invalides[(invalides$caseP==1) & (invalides$quifoy=="vous"),"invalide"] <- TRUE
# #

    log.info(u"Etape 1 : création de la df invalides")
    log.info(u"    1.1 : déclarants invalides")
    final = temporary_store['final_{}'.format(year)]
    if "invalide" in final:
        # on drop les colones inv et alt au cas ou on aurait déjà lancé le step07
        final.drop(["invalide", "alt"], axis = 1, inplace = True)

    invalides_vars = [
        "caseF",
        "caseP",
        "idfoy",
        "idmen",
        "noindiv",
        "quifoy",
        ]

    aah_eec_variables = ["rc1rev", "maahe"]
    aah_eec = False

    if set(aah_eec_variables) < set(final.columns):
        aah_eec = True
        invalides_vars += aah_eec_variables

    assert set(invalides_vars) < set(final.columns), \
        "Variables {} are missing".format(set(invalides_vars).difference(set(final.columns)))

    invalides = final.xs(invalides_vars, axis = 1).copy()

    invalides['invalide'] = False
    invalides['alt'] = False
    for var in ["caseP", "caseF"]:
        assert invalides[var].notnull().all(), 'NaN values in {}'.format(var)

    # Les déclarants invalides
    invalides.loc[(invalides['caseP'] == 1) & (invalides['quifoy'] == 0), 'invalide'] = True
    log.info(u"Il y a {} invalides déclarants".format(invalides["invalide"].sum()))

    # Les personnes qui touchent l'aah dans l'enquête emploi
    if aah_eec:
        log.info(u"Inspecting rc1rev")
        log.info(invalides['rc1rev'].value_counts())
        invalides.loc[invalides.maahe > 0, 'invalide'] = True
        invalides.loc[invalides.rc1rev == 4, 'invalide'] = True  # TODO: vérifier le format.
#  TODO:      invalides.rc1rev.astype("str") voir mai mahdi pour pendre en compte 14 24 etc

        log.info(u"Il y a {} invalides qui touchent des alloc".format(invalides["invalide"].sum()))

    print_id(invalides)

# # # Les conjoints invalides
# #
# # #men_inv_conj <- invalides[c("idmen","caseF","quifoy")]
# # #men_inv_conj <- rename(men_inv_conj, c("caseF"="invalide"))
# # #table(men_inv_conj[men_inv_conj$inv==1 ,c("invalide","quifoy")],useNA="ifany")
# # # Il y a des caseF suir des conjoints cela vint des doubles d?clarations TODO: shoumd clean this
# # #toto <- invalides[invalides$caseF==1 & invalides$quifoy=="conj","idmen"]
# # #load(indm)
# # #titi <- indivim[(indivim$ident %in% toto) & (indivim$persfip=="vous" |indivim$persfip=="conj") ,c("ident","noindiv","declar1","declar2","persfip","quelfic")]
# # #titi <- titi[order(titi$ident),]
# # foy_inv_conj <- invalides[,c("idfoy","caseF","quifoy")]
# # foy_inv_conj <- rename(foy_inv_conj, c("caseF"="invalide"))
# # table(foy_inv_conj[ ,c("invalide","quifoy")],useNA="ifany")
# # # On ne garde donc que les caseF des "vous"

# # foy_inv_conj   <- foy_inv_conj[foy_inv_conj$quifoy=="vous",c("idfoy","invalide")]
# # table(foy_inv_conj[ ,c("invalide")],useNA="ifany")
# # invalides_conj <- invalides[invalides$quifoy=="conj",c("idfoy","noindiv")]
# # invalides_conj <- merge(invalides_conj, foy_inv_conj, by="idfoy", all.x=TRUE)
# # table(invalides_conj$inv) # TODO en 2006 On en a 316 au lieu de 328 il doit y avoir de idfoy avec caseF qui n'ont pas de vous because double déclaration'
# # invalides[invalides$quifoy=="conj",c("idfoy","noindiv","invalide")] <- invalides_conj
# # table(invalides[,c("invalide","quifoy")],useNA="ifany")
# # rm(invalides_conj,foy_inv_conj)

    # On récupère les idfoy des foyers avec une caseF cochée
    log.info('    1.2 : Les conjoints invalides')
    idfoy_inv_conj = final.idfoy[final.caseF].copy()
    inv_conj_condition = (invalides.idfoy.isin(idfoy_inv_conj) & (invalides.quifoy == 1))
    invalides.loc[inv_conj_condition, "invalide"] = True

    log.info(u"Il y a {} invalides conjoints".format(len(invalides[inv_conj_condition])))
    log.info(u" Il y a {} invalides déclarants et invalides conjoints".format(invalides["invalide"].sum()))

    # Enfants invalides et garde alternée
    # #
    # # loadTmp("pacIndiv.Rdata")
    # # foy_inv_pac <- invalides[!(invalides$quifoy %in% c("vous","conj")),c("invalide","noindiv")]
    # # foy_inv_pac <- merge(foy_inv_pac, pacIndiv[,c("noindiv","typ","naia")], by="noindiv",all.x =TRUE)
    # # names(foy_inv_pac)
    # # table(foy_inv_pac[,c("typ","naia")],useNA="ifany")
    # # table(foy_inv_pac[,c("typ")],useNA="ifany")
    # # foy_inv_pac <- within(foy_inv_pac,{
    # #   invalide  <- (typ=="G") | (typ=="R") | (typ=="I") | (typ=="F" & (as.numeric(year)-naia>18))
    # #   alt  <- (typ=="H") | (typ=="I")
    # #   naia <- NULL
    # #   typ  <- NULL})
    # #
    # # table(foy_inv_pac[ ,c("invalide")],useNA="ifany")
    # # table(foy_inv_pac[ ,c("alt")],useNA="ifany")
    # # invalides$alt <- 0
    # # foy_inv_pac[is.na(foy_inv_pac$alt),"alt"] <- 0
    # # invalides[!(invalides$quifoy %in% c("vous","conj")),c("noindiv","invalide","alt")] <- foy_inv_pac
    log.info(u"    1.3 : enfants invalides et en garde alternée (variables inv et alt)")
    pacIndiv = temporary_store['pacIndiv_{}'.format(year)]
    # print pacIndiv.type_pac.value_counts()
    log.info(pacIndiv.type_pac.value_counts())

    foy_inv_pac = invalides[['noindiv', 'invalide']][~(invalides.quifoy.isin([0, 1]))].copy()
    # pac = pacIndiv.ix[:, ["noindiv", "type_pac", "naia"]]
    log.info("{}".format(len(foy_inv_pac)))

    log.info("{}".format(pacIndiv.columns))
    foy_inv_pac = foy_inv_pac.merge(
        pacIndiv[['noindiv', 'type_pac', 'naia']].copy(),
        on = 'noindiv',
        how = 'left',
        )
    foy_inv_pac['invalide'] = (
        foy_inv_pac['type_pac'].isin(['G', 'R', 'I']) |
        (
            (foy_inv_pac.type_pac == "F") & ((year - foy_inv_pac.naia) > 18)
            )
        )
    foy_inv_pac['alt'] = ((foy_inv_pac.type_pac == "H") | (foy_inv_pac.type_pac == "I"))
    del foy_inv_pac['naia']
    del foy_inv_pac['type_pac']
    foy_inv_pac['alt'] = foy_inv_pac['alt'].fillna(False)

    log.info("{}".format(foy_inv_pac['invalide'].describe()))
    invalides.loc[:, 'alt'] = False
    invalides.loc[~(invalides.quifoy.isin([0, 1])), ["alt", "invalide"]] = foy_inv_pac[["alt", "invalide"]].copy().values
    invalides = invalides[["noindiv", "invalide", 'alt']].copy()
    invalides['alt'].fillna(False, inplace = True)

    log.info(invalides.invalide.value_counts())
    # invalides = invalides.drop_duplicates(['noindiv', 'invalide', 'alt'], take_last = True)
    del foy_inv_pac, pacIndiv

    # Initialisation des NA sur alt et inv
    # invalides[is.na(invalides$inv), "invalide"] <- 0
    # table(invalides[,c("alt","invalide")],useNA="ifany")
    #
    # final <- merge(final, invalides[,c("noindiv","invalide","alt")], by="noindiv",all.x=TRUE)
    # table(final[, c("invalide","alt")],useNA="ifany")

    log.info('Etape 2 : Initialisation des NA sur alt et inv')
    assert invalides.invalide.notnull().all() & invalides.alt.notnull().all()
    final.set_index('noindiv', inplace = True, verify_integrity = True)
    invalides.set_index('noindiv', inplace = True, verify_integrity = True)
    final = final.join(invalides)
    final.reset_index(inplace = True)
    del invalides

    log.info("{}".format(final.invalide.value_counts()))
    control(final, debug = True)

    temporary_store['final_{}'.format(year)] = final
    log.info(u'final complétée et sauvegardée')