Example #1
0
def build_erf_aggregates(variables = None, year = 2006, unit = 1e6):
    """
    Fetch the relevant aggregates from erf data
    """

    erf = DataCollection(year=year)
    if variables is not None and "wprm" not in variables:
        variables.append("wprm")
    print 'Fetching aggregates from erf %s data' %str(year)
    df = erf.get_of_values(variables=variables, table = "erf_menage")

    of2erf = get_of2erf()
    erf2of = get_erf2of()

    df.rename(columns = erf2of, inplace = True)
    wprm = df["wprm"]
    for col in df.columns:
        try:
            df[col] = df[col].astype(np.float64)
        except:
            pass
    df = df.mul(wprm, axis = 0)
    for col in list(set(df.columns) - set(['ident', 'wprm'])):
        try:
            df[col] = df[col].sum()/1e6
        except:
            pass

    return df.ix[0:1] # Aggregate so we only need 1 row
Example #2
0
def build_erf_aggregates(variables=None, year=2006, unit=1e6):
    """
    Fetch the relevant aggregates from erf data
    """

    erf = DataCollection(year=year)
    if variables is not None and "wprm" not in variables:
        variables.append("wprm")
    print 'Fetching aggregates from erf %s data' % str(year)
    df = erf.get_of_values(variables=variables, table="erf_menage")

    of2erf = get_of2erf()
    erf2of = get_erf2of()

    df.rename(columns=erf2of, inplace=True)
    wprm = df["wprm"]
    for col in df.columns:
        try:
            df[col] = df[col].astype(np.float64)
        except:
            pass
    df = df.mul(wprm, axis=0)
    for col in list(set(df.columns) - set(['ident', 'wprm'])):
        try:
            df[col] = df[col].sum() / 1e6
        except:
            pass

    return df.ix[0:1]  # Aggregate so we only need 1 row
def create_enfnn(year=2006):
    '''
    '''
    #load
    data = DataCollection(year=year)
    ### Enfant à naître (NN pour nouveaux nés)
    individual_vars = ['noi', 'noicon', 'noindiv', 'noiper', 'noimer', 'ident', 'naia', 'naim', 'lien',
               'acteu','stc','contra','titc','mrec','forter','rstg','retrai','lpr','cohab','sexe',
               'agepr','rga']
    data = DataCollection(year=year)
    eeccmp1 = data.get_values(table="eec_cmp_1", variables=individual_vars)
    eeccmp2 = data.get_values(table="eec_cmp_2", variables=individual_vars)
    eeccmp3 = data.get_values(table="eec_cmp_3", variables=individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how="outer")
    enfnn = tmp.merge(eeccmp3, how="outer")

    # optimisation des types? Controle de l'existence en passant
    # pourquoi pas des int quand c'est possible
    #TODO: minimal dtype
    for var in individual_vars:
        print var
        enfnn[var] = enfnn[var].astype('float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars

    # création de variables
    print enfnn.describe()
    enfnn['declar1'] = ''
    enfnn['noidec'] = 0
    enfnn['ztsai'] = 0
    enfnn['year'] = year
    enfnn['year'] = enfnn['year'].astype("float32") # -> integer ?
    enfnn['agepf'] = enfnn['year'] - enfnn['naia']
    enfnn['agepf'][enfnn['naim'] >= 7] -= 1
    enfnn['actrec'] = 9
    enfnn['quelfic'] = 'ENF_NN'
    enfnn['persfip'] = ""

    #selection
    #enfnn <- enfnn[(enfnn$naia==enfnn$year & enfnn$naim>=10) | (enfnn$naia==enfnn$year+1 & enfnn$naim<=5),]
    enfnn = enfnn[((enfnn['naia'] == enfnn['year']) & (enfnn['naim'] >= 10)) |
                      ((enfnn['naia'] == enfnn['year'] + 1) & (enfnn['naim'] <= 5))]
    #save
    save_temp(enfnn, name="enfnn", year=year)
    del enfnn
    print "enfnnm saved"
    gc.collect()
def create_totals(year=2006):

    print "Creating Totals"
    print "Etape 1 : Chargement des données"

    data = DataCollection(year=year)
    indivim = load_temp(name="indivim", year=year)

    assert indivim.duplicated(['noindiv']).any() == False, "Présence de doublons"


    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.


    selection = Series()
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        varo = var[:-1]+"o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)


    indivi_i = indivim[selection]
    indivi_i.rename(columns={"ident" : "idmen",
                     "persfip":"quifoy",
                     "zsali" : "sali2", # Inclu les salaires non imposables des agents d'assurance
                     "zchoi" : "choi2",
                     "zrsti" : "rsti2",
                     "zalri" : "alr2"}, inplace=True)


    indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous", indivi_i["quifoy"])
    indivi_i["quelfic"] = "FIP_IMP"



## We merge them with the other individuals
#indivim <- rename(indivim, c(ident = "idmen",
#                             persfip = "quifoy",
#                             zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance
#                             zchoi = "choi2",
#                             zrsti = "rsti2",
#                             zalri = "alr2"))
#
#indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i)
#rm(indivim, indivi_i)
#gc()
#table(indivi$quelfic)
#

    indivim.rename( columns= dict(ident = "idmen",
                             persfip = "quifoy",
                             zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance
                             zchoi = "choi2",
                             zrsti = "rsti2",
                             zalri = "alr2"), inplace=True)

    if not (set(list(indivim.noindiv)) >  set(list(indivi_i.noindiv)) ):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace=True)
    indivi_i.set_index("noindiv", inplace=True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index( inplace=True)

    print ''
    print "Etape 2 : isolation des FIP"
    fip_imp = indivi.quelfic=="FIP_IMP"
    indivi["idfoy"] = (indivi["idmen"].astype("int64")*100 +
                       (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True))

    indivi.loc[fip_imp,"idfoy"] = nan

## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?)


    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

#    indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100
#                                        + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) )
    indivi["idfoy"] = where(fip_has_declar,
                            indivi["idmen"]*100 + indivi["declar1"].str[0:2].convert_objects(convert_numeric=True),
                            indivi["idfoy"])

    del fip_has_declar


    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar,
                            indivi["idmen"]*100 + 50,
                            indivi["idfoy"])

    indivi_fnd = indivi.loc[fip_no_declar, ["idfoy","noindiv"]]


    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]),
                                    indivi_fnd["idfoy"] + 1,
                                    indivi_fnd["idfoy"])

    assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(indivi_fnd["idfoy"]), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"


    indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd
    del indivi_fnd, fip_no_declar

    print ''
    print 'Etape 3 : Récupération des EE_NRT'

    nrt = indivi.quelfic=="EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen*100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt,"quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi['lpr'].isin([1,2])
    adults = (indivi.quelfic.isin(["EE","EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen*100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    assert indivi.loc[indivi['lpr'].isin([1,2]),"idfoy"].notnull().all()

    print ''
    print 'Etape 4 : Rattachement des enfants aux déclarations'

    assert indivi["noindiv"].duplicated().any() == False, "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3,4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE","EE_CAF"]))
    assert indivi.loc[enf_ee, "noindiv"].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert indivi.loc[enf_ee, "noindiv"].duplicated().any() == False, "Some noindiv appear twice"

    pere = DataFrame( {"noindiv_enf" : indivi.noindiv.loc[enf_ee], "noindiv" : 100*indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee] })
    mere = DataFrame( {"noindiv_enf" : indivi.noindiv.loc[enf_ee], "noindiv" : 100*indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee] })

    foyer = data.get_values(variables=["noindiv","zimpof"], table="foyer" )
    pere  = pere.merge(foyer, how="inner", on="noindiv")
    mere  = mere.merge(foyer, how="inner", on="noindiv")

#     print "Some pere et mere are duplicated because people have two foyers"
#     print pere[pere.duplicated()]
#     print mere[mere.duplicated()]

    df = pere.merge(mere, how="outer", on="noindiv_enf",  suffixes=('_p', '_m'))

#     print len(pere)
#     print len(mere)
#     print len(df)
#     ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"]
#     print df.loc[df["noindiv_enf"].isin(ll)]
#     print df[df.duplicated()]



    print '    4.1 : gestion des personnes dans 2 foyers'
    for col in ["noindiv_p","noindiv_m","noindiv_enf"]:
        df[col] = df[col].fillna(0,inplace=True) # beacause groupby drop groups with NA in index
    df = df.groupby(by=["noindiv_p","noindiv_m","noindiv_enf"]).sum()
    df.reset_index(inplace=True)

    df["which"] = ""
    df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "")
    df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df["which"] = where(both & (df.zimpof_p  > df.zimpof_m), "pere", "mere")
    df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")

    assert df["which"].notnull().all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns={"noindiv_enf" : "noindiv"}, inplace=True)
    df["idfoy"] = where( df.which=="pere", df.noindiv_p, df.noindiv_m)
    df["idfoy"] = where( df.which=="mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    for col in df.columns:
        if col not in ["idfoy", "noindiv"]:
            del df[col]

#     assert indivi.loc[enf_ee,"idfoy"].notnull().all()
    assert df.duplicated().any() == False

    df.set_index("noindiv",inplace=True, verify_integrity=True)
    indivi.set_index("noindiv", inplace=True, verify_integrity=True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() +
                                        indivi["idfoy"].isnull().sum())

    indivi.reset_index(inplace=True)
    assert indivi.duplicated().any() == False



# MBJ: issue delt with when moving from R code to python
## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs
## On suit guide méthodo erf 2003 page 135
## On supprime les conjoints FIP et les FIP de 25 ans et plus;
## On conserve les enfants FIP de 19 à 24 ans;
## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un
## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète
## EEC que l'on ne retrouvera pas dans les EEC suivantes.
#
    print '    4.2 : On enlève les individus pour lesquels il manque le déclarant'
    fip = load_temp(name="fipDat", year=year)
    fip["declar"] = nan
    fip["agepf"] = nan


    fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True)
    fip.naia = fip.naia.astype("int32")
    fip.rename( columns=dict(ident="idmen",
                     persfip="quifoy",
                     zsali="sali2", # Inclu les salaires non imposables des agents d'assurance
                     zchoi="choi2",
                     zrsti="rsti2",
                     zalri="alr2"),
                inplace=True)

    is_fip_19_25 = ((year-fip.naia-1)>=19) & ((year-fip.naia-1)<25)

## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
##fip[is_fip_19_25 ,"lpr"]  <- 1
#
#indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['agem'] = 12*indivi.age  + 12-indivi.naim

    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = nan
    indivi['not_pr_cpr'][indivi['lpr']<=2] = False
    indivi['not_pr_cpr'][indivi['lpr']>2] = True


    print "    4.3 : Creating non pr=0 and cpr=1 idmen's"
    indivi.reset_index(inplace=True)
    test1 = indivi.ix[indivi['not_pr_cpr']==True,['quimen', 'idmen']]
    test1['quimen'] = 2

    j=2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j+1
        j += 1

    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

#     indivi.set_index(['quiment']) #TODO: check relevance
#     TODO problème avec certains idfoy qui n'ont pas de vous
    print ''
    print "Etape 5 : Gestion des idfoy qui n'ont pas de vous"
    all = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi['quifoy']=='vous', 'idfoy']
    without = all[~(all.idfoy.isin(with_.values))]

    print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous'
    has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull())
    decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int')*100 +
                    indivi.loc[has_declar2, "declar2"].str[0:2].astype('int'))
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del all,with_,without, has_declar2

    print '    5.1 : Elimination idfoy restant'
    idfoyList = indivi.loc[indivi['quifoy']=="vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    myvars = ["noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm",
                            "age","agem","quelfic","actrec", "quimen",
                            "nbsala","titc","statut","txtppb","chpub","prosa","encadr"]

    if not(len(set(myvars).difference(set(indivi.columns))) == 0):
        print set(myvars).difference(set(indivi.columns))

    assert len(set(myvars).difference(set(indivi.columns))) == 0

    indivi = indivi.loc[:, myvars]

## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
## les infos provenant des déclarations)

    print ''
    print 'Etape 6 : Création des variables descriptives'
    print '    6.1 : variable activité'
    indivi['activite'] = None
    indivi['activite'][indivi['actrec']<=3] = 0
    indivi['activite'][indivi['actrec']==4] = 1
    indivi['activite'][indivi['actrec']==5] = 2
    indivi['activite'][indivi['actrec']==7] = 3
    indivi['activite'][indivi['actrec']==8] = 4
    indivi['activite'][indivi['age']<=13] = 2 # ce sont en fait les actrec=9
    print indivi['activite'].value_counts()
    # TODO: MBJ problem avec les actrec


    indivi['titc'][indivi['titc'].isnull()] = 0
    assert indivi['titc'].notnull().all() , Exception("Problème avec les titc")


    print '    6.2 : variable statut'
    indivi['statut'][indivi['statut'].isnull()] = 0
    indivi['statut'] = indivi['statut'].astype('int')
    indivi['statut'][indivi['statut']==11] = 1
    indivi['statut'][indivi['statut']==12] = 2
    indivi['statut'][indivi['statut']==13] = 3
    indivi['statut'][indivi['statut']==21] = 4
    indivi['statut'][indivi['statut']==22] = 5
    indivi['statut'][indivi['statut']==33] = 6
    indivi['statut'][indivi['statut']==34] = 7
    indivi['statut'][indivi['statut']==35] = 8
    indivi['statut'][indivi['statut']==43] = 9
    indivi['statut'][indivi['statut']==44] = 10
    indivi['statut'][indivi['statut']==45] = 11
    assert indivi['statut'].isin(range(12)).all(), Exception("statut value over range")


#indivi$nbsala <- as.numeric(indivi$nbsala)
#indivi <- within(indivi,{
#  nbsala[is.na(nbsala) ]    <- 0
#  nbsala[nbsala==99 ] <- 10  # TODO  418 fip à retracer qui sont NA
#})

    print '    6.3 : variable txtppb'
    indivi['txtppb'] = indivi['txtppb'].fillna(0)
    assert indivi['txtppb'].notnull().all()

    indivi['nbsala'] = indivi['nbsala'].fillna(0)
    indivi['nbsala'] = indivi['nbsala'].astype('int')
    indivi['nbsala'][indivi['nbsala']==99] = 10
    assert indivi['nbsala'].isin(range(11)).all()

    print '    6.4 : variable chpub et CSP'
    indivi['chpub'].fillna(0, inplace=True)
    indivi['chpub'] = indivi['chpub'].astype('int')
    indivi['chpub'][indivi['chpub'].isnull()] = 0
    print indivi['chpub'].value_counts()
    assert indivi['chpub'].isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'][indivi['prosa'].isnull()] = 0
    assert indivi['prosa'].notnull().all()
    print indivi['encadr'].value_counts()

    # encadr : 1=oui, 2=non
    indivi['encadr'].fillna(2, inplace=True)
    assert indivi['encadr'].notnull().all()
    indivi['cadre'][indivi['prosa'].isin([7,8])] = 1
    indivi['cadre'][(indivi['prosa']==9) & (indivi['encadr']==1)] = 1
    print "cadre"
    print indivi['cadre'].value_counts()
    assert indivi['cadre'].isin(range(2)).all()

    print ''
    print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence"

    print 'nb de doublons idfam/quifam', len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])

    print 'On crée les n° de personnes à charge'
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi['quifoy2'][indivi['quifoy']=='vous'] = 0
    indivi['quifoy2'][indivi['quifoy']=='conj'] = 1
    indivi['quifoy2'][indivi['quifoy']=='pac'] = 2


    del indivi['quifoy']
    indivi['quifoy'] = indivi['quifoy2']
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi.loc[indivi['quifoy']==2, ['quifoy', 'idfoy','noindiv']]
    print_id(test2)

    j=2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on=['noindiv','idfoy'], how="left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x']==2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    print 'nb de doublons idfam/quifam', len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])
    print_id(indivi)

#####################################################################################
## On ajoute les idfam et quifam
#load(famc)
#
#tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE)
#rm(famille)
#print_id(tot2)
#
### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls
#tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"]
#tot2[is.na(tot2$quifam), "quifam"] <- 0
#print_id(tot2)
#saveTmp(tot2, file = "tot2.Rdata")
#rm(indivi,tot2)
#
## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents
## load foyer
#loadTmp(file = "tot2.Rdata")
#loadTmp(file= "foyer_aggr.Rdata")
#
#tot3 <- merge(tot2, foyer, all.x = TRUE)
#print_id(tot3) # OK
#saveTmp(tot3, file= "tot3.Rdata")
#rm(tot3,tot2,foyer)
#
    print ''
    print 'Etape 8 : création des fichiers totaux'
    famille = load_temp(name='famc', year=year)



    print '    8.1 : création de tot2 & tot3'
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
#     del famille # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille


    control(tot2, debug=True, verbose=True)
    assert tot2['quifam'].notnull().all()

    save_temp(tot2, name='tot2', year=year)
    del indivi
    print '    tot2 saved'

#     #On combine les variables de revenu
#     foyer = load_temp(name='foy_ind', year=year)
#     print " INTERSERCT THE POOCHAY"
#     tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1
#     print "pingas"
#     print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10]
#     print "pocchay"
#     print sorted(foyer["idfoy"].unique())[0:10]
#     print "final flash"
#     print 602062550.0 in foyer["idfoy"].values
#     print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique())))
#     print tot2.quifoy.value_counts()
    #tot2.update(foyer)
    tot2.merge(foyer, how = 'left')

    tot2 = tot2[tot2.idmen.notnull()]
#     tot2['idfoy'] += 1

    print_id(tot2)

    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(cols='noindiv')
    print len(tot3)

    #Block to remove any unwanted duplicated pair
    print "    check tot3"
    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(cols='noindiv')
    control(tot3)

## On ajoute les variables individualisables
#loadTmp("foyer_individualise.Rdata") # foy_ind
#loadTmp("tot3.Rdata")
#loadTmp("allvars.Rdata")
#loadTmp("sif.Rdata")
#
#vars2 <- setdiff(names(tot3),  allvars)
#tot3 <- tot3[,vars2]
#
#print_id(tot3)
#final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE)
#
    print '    8.2 : On ajoute les variables individualisables'

    allvars = load_temp(name = 'ind_vars_to_remove', year=year)
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    print len(tot3)


    assert not(tot3.duplicated(cols=['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" %(lg_dup)

    save_temp(tot3, name='tot3', year=year)
    control(tot3)

    del tot2, allvars, tot3, vars2
    print 'tot3 sauvegardé'
    gc.collect()
Example #5
0
def create_fip(year = 2006): # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
# # anaisenf: année de naissance des PAC
# erfFoyVar <- c('anaisenf','declar')
# foyer <- LoadIn(erfFoyFil)
# foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
#    control(foyer, verbose=True, verbose_length=10, debug=True)


# #***********************************************************************************************************
# # print "Step 1 : on recupere les personnes à charge des foyers"
# #**********************************************************************************************************
# # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
#
#
# # On récupère toutes les pac des foyers
# L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
# fip <-data.frame(declar = foyer$declar)
# for (i in c(1:L)){
#   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
#   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
# }
# fip <- fip[!is.na(fip$typ.1),]
# fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
# fip <- fip[!is.na(fip$naia),]
# fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
# fip$N <- row(fip)[,1]
# str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len))/5
    print "il ya a au maximum %s pac par foyer" %nb_pac_max

# Separating the string coding the pac of each "déclaration".
# Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns)
    fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove
    for i in range(1,nb_pac_max+1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)]
        fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

#     print fip.describe()
#     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an')  & (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration','naia','type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration","pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #puis on retire les autres (à la fois F et G)
    print len(tyFG),'/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI

#    control(indivifip, debug=True)


# #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
# #************************************************************************************************************/

    indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES


# pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
# pac$key1 <- paste(pac$naia,pac$declar1)
# pac$key2 <- paste(pac$naia,pac$declar2)
# indivifip$key <- paste(indivifip$naia,indivifip$declar)

    #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull()
    import pdb
    pdb.set_trace()
    pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')]

    pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype)

# fip <- indivifip[!indivifip$key %in% pac$key1,]
# fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]


    print "    2.1 new fip created"
# We build a dataframe to link the pac to their type and noindiv
# table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

# pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
#                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
# pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
#                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    print 'longueur pacInd1' , len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

# table(duplicated(pacInd1))
# table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

# pacInd1 <-rename(pacInd1,c("key1" = "key"))
# pacInd2 <-rename(pacInd2,c("key2" = "key"))
# pacInd <- rbind(pacInd1,pacInd2)
# rm(pacInd1,pacInd2)

#     pacInd1.rename(columns={'key1':'key'}, inplace=True)
#     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.index == []:
        if pac_ind2.index == []:
                print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif pac_ind2.index == []:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

# table(duplicated(pacInd[,c("noindiv","typ")]))
# table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
#     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]]
    individec1 = individec1.rename(columns={'declar1':'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

# # TODO: On ne s'occupe pas des declar2 pour l'instant
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")]
    individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]]
    individec2.rename(columns={'declar2':'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'


    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

# #fip <- rbind(fip1,fip2)
# fip <- fip1
# table(fip$typ)

    fip = concat([fip1, fip2])
#     fip = fip1 #TODO: Pourquoi cette ligne ?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'] #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf']<=15, 9, 5)

## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    #TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi','ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100*fip['ident'] + fip['noidec']
    fip['noindiv'] = 100*fip['ident'] + fip['noi']
    fip['type_pac'] = 0 ; fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
def sif(year=2006):
    data = DataCollection(year=year)
    print u"05_foyer: extraction des données foyer"
    ## TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    ## mnrvka  Revenu TH n-2
    ## mnrvkh  revenu TH (revenu fiscal de référence)
    #
    ## On récupère les variables du code sif
    #vars <- c("noindiv", 'sif', "nbptr", "mnrvka")
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    #sif <- LoadIn(erfFoyFil, vars)
    sif = data.get_values(variables=vars, table="foyer")
    #sif$statmarit <- 0
    sif['statmarit'] = 0

    print sif

    ## for (index in 60:80){
    ##    print(index)
    ##    print(table(substr(sif$sif,index,index)))
    ## }

    #
    #
    ## Pb with some 2 sifs that are misaligned
    ## index = 61
    ## sif[substr(sif$sif,index,index)=="F", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]+1]
    #
    ## index = 62
    ## sif[substr(sif$sif,index,index)=="G", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="G", "noindiv"]]
    ## sif$sif[1]
    #
    #if (year==2009){
    #  # problem with one entry in 2009
    #  length  <- nchar(sif$sif[1])
    #  old_sif <- sif$sif[sif$noindiv == 901803201]
    #  new_sif <- paste(substr(old_sif,1,59), substr(old_sif,61,length),"0", sep="")
    #  sif$sif[sif$noindiv == 901803201] <- new_sif
    #  old_sif <- sif$sif[sif$noindiv == 900872201]
    #  new_sif <- paste(substr(old_sif,1,58), " ", substr(old_sif,59,length), sep="")
    #  sif$sif[sif$noindiv == 900872201] <- new_sif
    #  rm(old_sif,new_sif)
    #}
    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201]
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif
        del old_sif, new_sif
    #
    #
    ## for (index in 60:80){
    ##     print(index)
    ##     print(table(substr(sif$sif,index,index)))
    ## }
    #
    #
    #
    #sif <- within(sif,{
    ##  rbg = rbg*((tsrvbg =='+')-(tsrvbg =='-'))
    print sif["rbg"].describe()
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+') - (sif["tsrvbg"] == '-'))
    print sif["rbg"].describe()

    #  stamar <- substr(sif,5,5)

    sif["stamar"] = sif["sif"].str[4:5]

    # Converting marital status

    #  statmarit[stamar =="M"] <- 1
    #  statmarit[stamar =="C"] <- 2
    #  statmarit[stamar =="D"] <- 3
    #  statmarit[stamar =="V"] <- 4
    #  statmarit[stamar =="O"] <- 5

    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif["statmarit"][sif.stamar == key] = val

    #  birthvous <- as.numeric(substr(sif,6,9))
    #  birthconj <- as.numeric(substr(sif,11,14))
    #

    sif["birthvous"] = sif["sif"].str[5:9]
    sif["birthconj"] = sif["sif"].str[10:14]

    #  caseE <- as.numeric(substr(sif,16,16)=='E')
    #  caseF <- as.numeric(substr(sif,17,17)=='F')
    #  caseG <- as.numeric(substr(sif,18,18)=='G')
    #  caseK <- as.numeric(substr(sif,19,19)=='K')

    sif["caseE"] = sif["sif"].str[15:16] == "E"
    sif["caseF"] = sif["sif"].str[16:17] == "F"
    sif["caseG"] = sif["sif"].str[17:18] == "G"
    sif["caseK"] = sif["sif"].str[18:19] == "K"

    d = 0
    #
    #  if (year %in% c(2006,2007)){
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    caseH <- as.numeric(substr(sif,25,28))
    #    caseT <- as.numeric(substr(sif,29,29) == 'T')
    #  }

    if year in [2006, 2007]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        sif["caseH"] = sif["sif"].str[24:28]
        sif["caseT"] = sif["sif"].str[28:29] == "T"
    #
    #  if (year == 2008){
    #    d = - 1 # fin de la case L
    #    caseP <- as.numeric(substr(sif,21+d,21+d)=='P')
    #    caseS <- as.numeric(substr(sif,22+d,22+d)=='S')
    #    caseW <- as.numeric(substr(sif,23+d,23+d)=='W')
    #    caseN <- as.numeric(substr(sif,24+d,24+d)=='N')
    #    caseH <- as.numeric(substr(sif,25+d,28+d))
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }
    #
    if year in [2008]:
        d = -1  # fin de la case L
        sif["caseP"] = sif["sif"].str[20 + d:21 + d] == "P"
        sif["caseS"] = sif["sif"].str[21 + d:22 + d] == "S"
        sif["caseW"] = sif["sif"].str[22 + d:23 + d] == "W"
        sif["caseN"] = sif["sif"].str[23 + d:24 + d] == "N"
        sif["caseH"] = sif["sif"].str[24 + d:28 + d]
        sif["caseT"] = sif["sif"].str[28 + d:29 + d] == "T"

    #  if (year == 2009){
    #    # retour de la case L par rapport à 2008 (donc on retrouve 2006)
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    # caseH en moins par rapport à 2008 (mais case en L en plus)
    #    # donc décalage par rapport à 2006
    #    d = -4
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }

    if year in [2009]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif["sif"].str[28 + d:29 + d] == "T"

    #
    #  caseX <- as.numeric(substr(sif,34+d,34+d)=='X')
    #  dateX <- as.Date(substr(sif,35+d,42+d),'%d%m%Y')
    #  caseY <- as.numeric(substr(sif,43+d,43+d)== 'Y')
    #  dateY <- as.Date(substr(sif,44+d,51+d),'%d%m%Y')
    #  caseZ <- as.numeric(substr(sif,52+d,53+d)== 'Z')
    #  dateZ <- as.Date(substr(sif,53+d,60+d),'%d%m%Y')  # ERROR 54+d
    #  causeXYZ <- substr(sif,61+d,61+d)
    #

    sif["caseX"] = sif["sif"].str[33 + d:34 + d] == "X"
    sif["dateX"] = sif["sif"].str[34 + d:42 + d]
    sif["caseY"] = sif["sif"].str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif["sif"].str[43 + d:51 + d]
    sif["caseZ"] = sif["sif"].str[51 + d:53 + d] == "Z"
    sif["dateZ"] = sif["sif"].str[52 + d:60 + d]
    sif["causeXYZ"] = sif["sif"].str[60 + d:61 + d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    #  nbptr <- nbptr/100
    #  rfr_n_2 <- mnrvka

    sif["nbptr"] = sif["nbptr"] / 100
    sif["rfr_n_2"] = sif["mnrvka"]

    #  nbF <- as.numeric(substr(sif,65+d,66+d))
    #  nbG <- as.numeric(substr(sif,68+d,69+d))
    #  nbR <- as.numeric(substr(sif,71+d,72+d))
    #  nbJ <- as.numeric(substr(sif,74+d,75+d))
    #  nbN <- as.numeric(substr(sif,77+d,78+d))
    #  nbH <- as.numeric(substr(sif,80+d,81+d))
    #  nbI <- as.numeric(substr(sif,83+d,84+d))

    sif["nbF"] = sif["sif"].str[64 + d:66 + d]
    sif["nbG"] = sif["sif"].str[67 + d:69 + d]
    sif["nbR"] = sif["sif"].str[70 + d:72 + d]
    sif["nbJ"] = sif["sif"].str[73 + d:75 + d]
    sif["nbN"] = sif["sif"].str[76 + d:78 + d]
    sif["nbH"] = sif["sif"].str[79 + d:81 + d]
    sif["nbI"] = sif["sif"].str[82 + d:84 + d]

    #  if (year != 2009){
    #  nbP <- as.numeric(substr(sif,86+d,87+d))
    #  }
    #})

    if (year != 2009):
        sif["nbP"] = sif["sif"].str[85 + d:87 + d]

    #sif$sif <- NULL
    #sif$stamar <- NULL
    #

    del sif["sif"], sif["stamar"]

    #table(sif$statmarit)
    print sif["statmarit"].value_counts()

    #print(length(table(sif$noindiv)))
    print "Number of individuals :", len(sif["noindiv"])
    print "Number of distinct individuals :", len(
        sif["noindiv"].value_counts())

    #dup <- duplicated(sif$noindiv)
    #table(dup)
    #sif <- sif[!dup,]
    #print(length(table(sif$noindiv)))

    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts(
    )) == len(sif_drop_duplicated["noindiv"]), Exception(
        "Number of distinct individuals after removing duplicates is not correct"
    )

    print 'Saving sif'
    save_temp(sif, name='sif', year=year)
    del sif
    gc.collect()
Example #7
0
def foyer_all(year=2006):

    ## On ajoute les cases de la déclaration
    #foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer" )

    ## on ne garde que les cases de la déclaration ('fxzz')
    #vars <- names(foyer_all)
    #vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #

    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]

    #foyer <- foyer_all[vars]
    #rm(foyer_all)
    #gc()
    #noindiv <- list(foyer$noindiv)
    #

    foyer = foyer_all[vars + ["noindiv"]]

    del foyer_all
    gc.collect()

    #
    ## On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    #print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    #print foyer.describe()["f1aj"].to_string()
    #print foyer.describe()["noindiv"].to_string()
    #

    print_id(foyer)

    ## noindiv have been summed over original noindiv which are now in Group.1
    #foyer$noindiv <- NULL
    #foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problème avec les dummies ()
    #
    #saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #
    #############################################################################
    ## On récupère les variables individualisables
    #loadTmp("foyer_aggr.Rdata")
    #
    #individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    #}


    var_dict = {'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
                'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
               'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
               'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
               'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
               'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
               'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
               'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
               'f1tv': ['f1tv', 'f1uv'],
               'f1tw': ['f1tw', 'f1uw'],
               'f1tx': ['f1tx', 'f1ux'],
               'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
               'ppe_du_ns':  ['f5nv', 'f5ov', 'f5pv'],
               'frag_exon': ['f5hn', 'f5in', 'f5jn'],
               'frag_impo': ['f5ho', 'f5io', 'f5jo'],
               'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
               'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
               'arag_defi': ['f5hf', 'f5if', 'f5jf'],
               'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
               'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
               'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
               'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
               'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
               'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
               'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
               'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
               'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
               'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
               'abic_imps': ['f5kd', 'f5ld', 'f5md'],
               'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
               'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
               'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
               'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
               'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
               'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
               'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
               'macc_exon': ['f5nn', 'f5on', 'f5pn'],
               'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
               'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
               'macc_impv': ['f5no', 'f5oo', 'f5po'],
               'macc_imps': ['f5np', 'f5op', 'f5pp'],
               'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
               'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
               'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
               'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
               'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
               'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
               'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
               'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
               'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
               'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
               'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'], # TODO: check
               'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
               'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
               'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
               'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
               'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
               'abnc_defi': ['f5qe', 'f5re', 'f5se'],
               'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
               'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
#               'ebic_impv' : ['f5ta','f5ua', 'f5va'],
#               'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
               'mbic_mvct': ['f5hu'],
               'macc_mvct': ['f5iu'],
               'mncn_mvct': ['f5ju'],
               'mbnc_mvct': ['f5kz'],
               'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
               'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
               'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
               'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
               'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
               'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
               'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
               'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
               'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
               'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
               'arag_pvce': ['f5he', 'f5ie', 'f5je'],
               'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
               'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
               'abic_pvce': ['f5ke', 'f5le', 'f5me'],
               'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
               'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
               'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
               'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
               'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
               'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
               'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
               'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
               'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
               'demenage' : ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']}  # (déménagement) uniquement en 2006


#
#varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
#                list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
#               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
# ......
#               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
#               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
#               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
#               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (déménagement) uniquement en 2006
#
    vars_sets = [ set(var_list) for var_list in var_dict.values() ]
    eligible_vars = (set().union(*vars_sets)).intersection( set(list(foyer.columns)))

    print "From %i variables, we keep %i eligibles variables"   %( len(set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0
    err_vars = {}

    foy_ind = DataFrame()

    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [ x  in eligible_vars for x in foyer_vars ]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [var for var,present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index() # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] !=0]
#        print len(selection)

        if len(foy_ind) == 0:
            foy_ind = selection
        else:

            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    print "foy_ind"
    print foy_ind.describe().to_string()


#not_first <- FALSE
#allvars = c()
#for (v in varlist){
#  vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present
#  if (length(vars) > 0) {
#    allvars <-  c(allvars, vars)
#    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
#    n <- length(vars)
#    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
#    if (not_first) {
#      print('merge')
#      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
#      names(foy_ind)
#    }
#    else   {
#      print('init')
#      foy_ind <- temp
#      not_first <- TRUE
#    }
#  }
#}

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv" : "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind['quifoy']=='vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy']=='conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy']=='pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy']=='pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy']=='pac3'] = 4

    assert foy_ind['quifoy'].isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year = year)
    show_temp()
    return
Example #8
0
def test(year=2006, variables=['af']):
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

    #     of_aggregates = Aggregates()
    #     of_aggregates.set_simulation(simulation)
    #     of_aggregates.compute()
    #     print of_aggregates.aggr_frame
    #
    #     from openfisca_france.data.erf.aggregates import build_erf_aggregates
    #     temp = (build_erf_aggregates(variables=variables, year= year))
    #     print temp
    #     return
    variable = "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()

    def get_all_ancestors(varlist):
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]] + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]] +
                        get_all_ancestors(list(varlist[0]._parents)) +
                        get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(lambda x: simulation.output_table.column_by_name.get(x),
                  variables)
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in variables:
        children = set()
        varcol = simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        variables = [variable]
        return simulation.aggregated_by_entity(entity="men",
                                               variables=variables,
                                               all_output_vars=False,
                                               force_sum=True)[0]

    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]],
                                                  on='idmen',
                                                  how='outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo
    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom
    et parfois sont les variables utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables principales ( au moins les aggrégats
    que l'on compare )
    Si les variables ne sont pas directement dans la table,
    elles ont été calculées à partir d'autres variables de données erf/eec
    donc chercher dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''

    fetch_eec = [
        'statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim', 'naia',
        'noindiv'
    ]
    fetch_erf = ['zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic']
    erf_df = erf_data.get_of_values(variables=fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables=fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                        table="erf_menage")

    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns=erf2of, inplace=True)

    # We get the options from the simulation non aggregated tables:

    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table[
        'noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table[
        'noindiv'].astype(np.int64)
    s1 = [
        var for var in set(options).intersection(
            set(simulation.output_table.table.columns))
    ] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns={
        'idmen_ind': 'idmen',
        'quimen_ind': 'quimen'
    },
                               inplace=True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1) - set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [
            var for var in (set(options).intersection(
                set(simulation.input_table.table.columns)) - set(s1))
        ] + ['noindiv']
        #print s2
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp,
                                                        on='noindiv',
                                                        how='inner',
                                                        sort=False)

        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) +
                                              ['idmen', 'quimen', 'noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)

    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(
    ), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace=True)
    assert not simu_aggr_tables["idmen"].duplicated().any(
    ), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" % str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" % str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()

    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(
        set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace=True)
    simu_aggr_tables.reset_index(inplace=True)
    for col in colcom:
        temp = set(
            erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" % (
            col, str(len(temp)))
        del temp

    # Detect the biggest differences
    bigtable = merge(erf_menage,
                     simu_aggr_tables,
                     on='idmen',
                     how='inner',
                     suffixes=('_erf', '_of'))
    print 'Length of new dataframe is %s' % str(len(bigtable))
    #print bigtable.columns
    bigtable.set_index('idmen', drop=False, inplace=True)

    already_met = []
    options_met = []

    for col in colcom:
        bigtemp = None
        table = bigtable[and_(bigtable[col + '_erf'] != 0,
                              bigtable[col + '_of'] != 0)]
        table[col] = (table[col + '_erf'] - table[col + '_of']
                      ) / table[col + '_erf']  #Difference relative
        table[col] = table[col].apply(lambda x: abs(x))
        print 'Minimum difference between the two tables for %s is %s' % (
            col, str(table[col].min()))
        print 'Maximum difference between the two tables for %s is %s' % (
            col, str(table[col].max()))
        print table[col].describe()
        try:
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col],
                              np.arange(1, 11),
                              table['wprm_of'],
                              2,
                              return_quantiles=True)
            #print sorted(values)
            dec, values = mwp(table[col],
                              np.arange(1, 101),
                              table['wprm_erf'],
                              2,
                              return_quantiles=True)
            #print sorted(values)[90:]
            del dec, values
            gc.collect()
        except:
            #print 'Weighted percentile method didnt work for %s' %col
            pass
        print "\n"

        # Show the relevant information for the most deviant households
        table.sort(columns=col, ascending=False, inplace=True)
        #print table[col][0:10].to_string()
        if bigtemp is None:
            bigtemp = {
                'table': table[[col, col + '_of', col + '_erf',
                                'idmen']][0:10],
                'options': None
            }
        bigtemp['table'][col + 'div'] = bigtemp['table'][
            col + '_of'] / bigtemp['table'][col + '_erf']
        print bigtemp['table'].to_string()
        '''
        bigtemp is the table which will get filled little by little by the relevant variables.
        Up to the last rows of code 'table' refers to a table of aggregated values,
        while 'options is a table of individual variables.
        The reason we call it in a dictionnary is also because we modify it inside the recursive function 'iter_on parents',
        and it causes an error in Python unless for certain types like dictionnary values.
        '''
        #print "\n"

        # If variable is a Prestation, we show the dependancies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):
            '''
            For the direct children
            '''
            if not varcol._children is None:
                ch_to_fetch = list(varcol._children)
                ch_to_fetch = map(lambda x: x.name, ch_to_fetch)
                ch_fetched = []

                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print "Variables which need %s to be computed :\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(simu_aggr_tables.columns).union(
                        erf_menage.columns):
                    print "Variables which need %s to be computed (some missing picked in erf):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                            ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print "Variables which need %s to be computed (some missing):\n %s \n" % (
                        col, str(ch_to_fetch))
                    for var in ch_to_fetch:

                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')

                print table[[col] + ch_fetched][0:10]
                print "\n"
                del ch_to_fetch, ch_fetched
            '''
            For the parents
            '''
            def iter_on_parents(varcol):
                if (varcol._parents == set() and varcol._option
                        == {}) or varcol.name in already_met:
                    return
                else:
                    par_to_fetch = list(varcol._parents)
                    par_to_fetch = map(lambda x: x.name, par_to_fetch)
                    par_fetched = []

                    if set(par_fetched) <= set(simu_aggr_tables.columns):
                        #print "Variables the prestation %s depends of :\n %s \n" %(varcol.name, str(par_fetched))
                        for var in par_fetched:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                            else:
                                par_fetched.append(var)
                    elif set(par_fetched) <= set(
                            simu_aggr_tables.columns).union(
                                erf_menage.columns):
                        #print "Variables the prestation %s depends of (some missing picked in erf):\n %s \n" %(varcol.name,str(par_fetched))
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var + '_erf' in table.columns:
                                par_fetched.append(var + '_erf')
                            else:
                                par_fetched.append(var)
                    else:
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var in erf_menage.columns:
                                if var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                        if len(par_fetched) > 0:
                            #print "Variables the prestation %s depends of (some missing):\n %s \n" %(varcol.name, str(par_fetched))
                            pass
                        else:
                            #print "Variables the prestation %s depends of couldn't be found :\n %s \n" %(varcol.name, str(par_fetched))
                            pass

                    if len(par_fetched) > 0:
                        temp = table[[col, 'idmen'] + par_fetched][0:10]
                        bigtemp['table'] = pd.merge(temp,
                                                    bigtemp['table'],
                                                    how='inner')
                        #print temp.to_string(), "\n"
                    if varcol._option != {} and not set(
                            varcol._option.keys()) < set(options_met):
                        vars_to_fetch = list(
                            set(varcol._option.keys()) - set(options_met))
                        #print "and the options to current variable %s for the id's with strongest difference :\n %s \n" %(varcol.name, varcol._option.keys())
                        liste = [i for i in range(0, 10)]
                        liste = map(lambda x: table['idmen'].iloc[x], liste)
                        temp = simu_nonaggr_tables[
                            ['idmen', 'quimen', 'noindiv'] +
                            vars_to_fetch][simu_nonaggr_tables['idmen'].isin(
                                table['idmen'][0:10])]

                        temp_sorted = temp[temp['idmen'] == liste[0]]
                        for i in xrange(1, 10):
                            temp_sorted = temp_sorted.append(
                                temp[temp['idmen'] == liste[i]])
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(
                                erf_eec_indivi, on='noindiv', how='outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(
                                temp_sorted,
                                on=['noindiv', 'idmen', 'quimen'],
                                how='outer')


#                         temp_sorted.set_index(['idmen',  'quimen'], drop = True, inplace = True) # If we do that
                        del temp, temp_sorted
                        gc.collect()

                    already_met.append(varcol.name)
                    options_met.extend(varcol._option.keys())
                    for var in varcol._parents:
                        iter_on_parents(var)

            iter_on_parents(varcol)
            # We merge the aggregate table with the option table ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                      how='left',
                                                      on='idmen',
                                                      suffixes=('(agg)',
                                                                '(ind)'))

            # Reshaping the table to group by descending error on col, common entities
            bigtemp['table'].sort(columns=['af', 'quimen'],
                                  ascending=[False, True],
                                  inplace=True)
            bigtemp['table'] = bigtemp['table'].groupby(['idmen', 'quimen'],
                                                        sort=False).sum()
            print "Table of values for %s dependencies : \n" % col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()
Example #9
0
    def preproc(self):

        erf_menage = self.erf_menage
        erf_eec_indivi = self.erf_eec_indivi
        simu_aggr_tables = self.simu_aggr_tables
        simu_nonaggr_tables = self.simu_nonaggr_tables

        def get_all_ancestors(varlist):
            if len(varlist) == 0:
                return []
            else:
                if varlist[0]._parents == set():
                    return ([varlist[0]]
                          + get_all_ancestors(varlist[1:]))
                else:
                    return ([varlist[0]]
                     + get_all_ancestors(list(varlist[0]._parents))
                      + get_all_ancestors(varlist[1:]))

        # We want to get all ancestors + children + the options that we're going to encounter
        parents = map(lambda x: self.simulation.output_table.column_by_name.get(x), [self.variable])
        parents = get_all_ancestors(parents)
        options = []
        for varcol in parents:
            options.extend(varcol._option.keys())
        options = list(set(options))
        #print options
        parents = map(lambda x: x.name, parents)
        for var in [self.variable]:
            children = set()
            varcol = self.simulation.output_table.column_by_name.get(var)
            children = children.union(set(map(lambda x: x.name, varcol._children)))
        variables = list(set(parents + list(children)))
        #print variables
        del parents, children
        gc.collect()

        def get_var(variable):
            variables =[variable]
            return self.simulation.aggregated_by_entity(entity="men", variables=variables,
                                                    all_output_vars = False, force_sum=True)[0]

        simu_aggr_tables = get_var(variables[0])
        for var in variables[1:]:
            simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]], on = 'idmen', how = 'outer')
        # We load the data from erf table in case we have to pick data there
        erf_data = DataCollection(year= self.simulation.datesim.year)
        os.system('cls')
        todo = set(variables + ["ident", "wprm"]).union(set(options))
        print 'Variables or equivalents to fetch :'
        print todo

        '''
        Méthode générale pour aller chercher les variables de l'erf/eec
        ( qui n'ont pas forcément le même nom
        et parfois sont les variables utilisées pour créér l'of ):
        1 - essayer le get_of2erf, ça doit marcher pour les variables principales ( au moins les aggrégats
        que l'on compare )
        Si les variables ne sont pas directement dans la table,
        elles ont été calculées à partir d'autres variables de données erf/eec
        donc chercher dans :
        2 - build_survey
        3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
        Le 'print todo' vous indique quelles variables chercher
        ( attention à ne pas inclure les enfants directs )
        L'utilisation du Ctrl-H est profitable !
        '''

        fetch_eec = ['statut','titc','chpub','encadr','prosa','age','naim','naia','noindiv']
        fetch_erf = ['zsali','af','ident','wprm','noi','noindiv','quelfic']
        erf_df = erf_data.get_of_values(variables= fetch_erf, table="erf_indivi")
        eec_df = erf_data.get_of_values(variables= fetch_eec, table="eec_indivi")
        erf_eec_indivi = erf_df.merge(eec_df, on ='noindiv', how = 'inner' )
        assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
        del eec_df, erf_df

        # We then get the aggregate variables for the menage ( mainly to compare with of )
        print 'Loading data from erf_menage table'
        erf_menage = erf_data.get_of_values(variables= list(todo) + ['quelfic'], table="erf_menage")

        del todo
        gc.collect()
        assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

        from openfisca_france.data.erf import get_erf2of
        erf2of = get_erf2of()
        erf_menage.rename(columns = erf2of, inplace = True)

    # We get the options from the simulation non aggregated tables:

        # First from the output_table
        # We recreate the noindiv in output_table
        self.simulation.output_table.table['noindiv'] = 100 * self.simulation.output_table.table.idmen_ind + self.simulation.output_table.table.noi_ind
        self.simulation.output_table.table['noindiv'] = self.simulation.output_table.table['noindiv'].astype(np.int64)
        s1 = [var for var in set(options).intersection(set(self.simulation.output_table.table.columns))] + ['idmen_ind', 'quimen_ind', 'noindiv']
        simu_nonaggr_tables = (self.simulation.output_table.table)[s1]
        simu_nonaggr_tables.rename(columns = {'idmen_ind' : 'idmen', 'quimen_ind':'quimen'}, inplace = True)
        assert 'noindiv' in simu_nonaggr_tables.columns

        # If not found, we dwelve into the input_table
        if (set(s1)- set(['idmen_ind', 'quimen_ind','noindiv'])) < set(options):
            assert 'noindiv' in self.simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
            s2 = [var for var in (set(options).intersection(set(self.simulation.input_table.table.columns)) - set(s1))] + ['noindiv']
            #print s2
            temp = self.simulation.input_table.table[s2]
            simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on = 'noindiv', how = 'inner', sort = False)

            del s2, temp
        del s1
        gc.collect()

        simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen','noindiv']]
        #print options, variables
        assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

        # Check the idmens that are not common
        erf_menage.rename(columns = {'ident' : 'idmen'}, inplace = True)

        print "\n"
        print 'Checking if idmen is here...'
        print '\n ERF : '
        print 'idmen' in erf_menage.columns
        print "\n Simulation output"
        print 'idmen' in simu_aggr_tables.columns
        print "\n"

        #print 'Dropping duplicates of idmen for both tables...'
        assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
        #erf_menage.drop_duplicates('idmen', inplace = True)
        simu_aggr_tables.drop_duplicates('idmen', inplace = True)
        assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

        print 'Checking mismatching idmen... '
        s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
        if s1:
            print "idmen that aren't in simu_aggr_tables : %s" %str(len(s1))
            pass
        s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
        if s2:
            print "idmen that aren't in erf_menage : %s" %str(len(s2))
            pass
        del s1, s2

        # Restrict to common idmens and merge
        s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
        print "Restricting to %s common idmen... \n" %str(len(s3))
        erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
        simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
        del s3
        gc.collect()

        #print erf_menage.columns
        #print simu_aggr_tables.columns

        # Compare differences across of and erf dataframes
        print "Comparing differences between dataframes... \n"
        colcom = (set(erf_menage.columns).intersection(set(simu_aggr_tables.columns))) - set(['idmen','wprm'])
        print 'Common variables: '
        print colcom
        erf_menage.reset_index(inplace = True)
        simu_aggr_tables.reset_index(inplace = True)
        for col in colcom:
            temp = set(erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
            print "Numbers of idmen that aren't equal on variable %s : %s \n" %(col, str(len(temp)))
            del temp

        self.erf_menage = erf_menage
        self.erf_eec_indivi = erf_eec_indivi
        self.simu_aggr_tables = simu_aggr_tables
        self.simu_nonaggr_tables = simu_nonaggr_tables
Example #10
0
def test(year=2006, variables = ['af']):
    simulation = SurveySimulation()
    survey_filename = os.path.join(model.DATA_DIR, 'sources', 'test.h5')
    simulation.set_config(year=year, survey_filename=survey_filename)
    simulation.set_param()
    simulation.compute()

#     of_aggregates = Aggregates()
#     of_aggregates.set_simulation(simulation)
#     of_aggregates.compute()
#     print of_aggregates.aggr_frame
#
#     from openfisca_france.data.erf.aggregates import build_erf_aggregates
#     temp = (build_erf_aggregates(variables=variables, year= year))
#     print temp
#     return
    variable= "af"
    debugger = Debugger()
    debugger.set_simulation(simulation)
    debugger.set_variable(variable)
    debugger.show_aggregates()





    def get_all_ancestors(varlist):
        if len(varlist) == 0:
            return []
        else:
            if varlist[0]._parents == set():
                return ([varlist[0]]
                      + get_all_ancestors(varlist[1:]))
            else:
                return ([varlist[0]]
                 + get_all_ancestors(list(varlist[0]._parents))
                  + get_all_ancestors(varlist[1:]))

    # We want to get all ancestors + children + the options that we're going to encounter
    parents = map(lambda x: simulation.output_table.column_by_name.get(x), variables)
    parents = get_all_ancestors(parents)
    options = []
    for varcol in parents:
        options.extend(varcol._option.keys())
    options = list(set(options))
    #print options
    parents = map(lambda x: x.name, parents)
    for var in variables:
        children = set()
        varcol = simulation.output_table.column_by_name.get(var)
        children = children.union(set(map(lambda x: x.name, varcol._children)))
    variables = list(set(parents + list(children)))
    #print variables
    del parents, children
    gc.collect()

    def get_var(variable):
        variables =[variable]
        return simulation.aggregated_by_entity(entity="men", variables=variables,
                                                all_output_vars = False, force_sum=True)[0]

    simu_aggr_tables = get_var(variables[0])
    for var in variables[1:]:
        simu_aggr_tables = simu_aggr_tables.merge(get_var(var)[['idmen', var]], on = 'idmen', how = 'outer')

    # We load the data from erf table in case we have to pick data there
    erf_data = DataCollection(year=year)
    os.system('cls')
    todo = set(variables + ["ident", "wprm"]).union(set(options))
    print 'Variables or equivalents to fetch :'
    print todo

    '''
    Méthode générale pour aller chercher les variables de l'erf/eec
    ( qui n'ont pas forcément le même nom
    et parfois sont les variables utilisées pour créér l'of ):
    1 - essayer le get_of2erf, ça doit marcher pour les variables principales ( au moins les aggrégats
    que l'on compare )
    Si les variables ne sont pas directement dans la table,
    elles ont été calculées à partir d'autres variables de données erf/eec
    donc chercher dans :
    2 - build_survey
    3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
    Le 'print todo' vous indique quelles variables chercher
    ( attention à ne pas inclure les enfants directs )
    L'utilisation du Ctrl-H est profitable !
    '''

    fetch_eec = ['statut','titc','chpub','encadr','prosa','age','naim','naia','noindiv']
    fetch_erf = ['zsali','af','ident','wprm','noi','noindiv','quelfic']
    erf_df = erf_data.get_of_values(variables= fetch_erf, table="erf_indivi")
    eec_df = erf_data.get_of_values(variables= fetch_eec, table="eec_indivi")
    erf_eec_indivi = erf_df.merge(eec_df, on ='noindiv', how = 'inner' )
    assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
    del eec_df, erf_df

    # We then get the aggregate variables for the menage ( mainly to compare with of )
    print 'Loading data from erf_menage table'
    erf_menage = erf_data.get_of_values(variables= list(todo) + ['quelfic'], table="erf_menage")

    del todo
    gc.collect()
    assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

    from openfisca_france.data.erf import get_erf2of
    erf2of = get_erf2of()
    erf_menage.rename(columns = erf2of, inplace = True)

# We get the options from the simulation non aggregated tables:

    # First from the output_table
    # We recreate the noindiv in output_table
    simulation.output_table.table['noindiv'] = 100 * simulation.output_table.table.idmen_ind + simulation.output_table.table.noi_ind
    simulation.output_table.table['noindiv'] = simulation.output_table.table['noindiv'].astype(np.int64)
    s1 = [var for var in set(options).intersection(set(simulation.output_table.table.columns))] + ['idmen_ind', 'quimen_ind', 'noindiv']
    simu_nonaggr_tables = (simulation.output_table.table)[s1]
    simu_nonaggr_tables.rename(columns = {'idmen_ind' : 'idmen', 'quimen_ind':'quimen'}, inplace = True)
    assert 'noindiv' in simu_nonaggr_tables.columns

    # If not found, we dwelve into the input_table
    if (set(s1)- set(['idmen_ind', 'quimen_ind','noindiv'])) < set(options):
        assert 'noindiv' in simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
        s2 = [var for var in (set(options).intersection(set(simulation.input_table.table.columns)) - set(s1))] + ['noindiv']
        #print s2
        temp = simulation.input_table.table[s2]
        simu_nonaggr_tables = simu_nonaggr_tables.merge(temp, on = 'noindiv', how = 'inner', sort = False)

        del s2, temp
    del s1
    gc.collect()

    simu_nonaggr_tables = simu_nonaggr_tables[list(set(options)) + ['idmen', 'quimen','noindiv']]
    #print options, variables
    assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

    # Check the idmens that are not common
    erf_menage.rename(columns = {'ident' : 'idmen'}, inplace = True)

    print "\n"
    print 'Checking if idmen is here...'
    print '\n ERF : '
    print 'idmen' in erf_menage.columns
    print "\n Simulation output"
    print 'idmen' in simu_aggr_tables.columns
    print "\n"

    #print 'Dropping duplicates of idmen for both tables...'
    assert not erf_menage["idmen"].duplicated().any(), "Duplicated idmen in erf_menage"
    #erf_menage.drop_duplicates('idmen', inplace = True)
    simu_aggr_tables.drop_duplicates('idmen', inplace = True)
    assert not simu_aggr_tables["idmen"].duplicated().any(), "Duplicated idmen in of"

    print 'Checking mismatching idmen... '
    s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
    if s1:
        print "idmen that aren't in simu_aggr_tables : %s" %str(len(s1))
        pass
    s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
    if s2:
        print "idmen that aren't in erf_menage : %s" %str(len(s2))
        pass
    del s1, s2

    # Restrict to common idmens and merge
    s3 = set(erf_menage['idmen']).intersection(set(simu_aggr_tables['idmen']))
    print "Restricting to %s common idmen... \n" %str(len(s3))
    erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
    simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
    del s3
    gc.collect()

    #print erf_menage.columns
    #print simu_aggr_tables.columns

    # Compare differences across of and erf dataframes
    print "Comparing differences between dataframes... \n"
    colcom = (set(erf_menage.columns).intersection(set(simu_aggr_tables.columns))) - set(['idmen','wprm'])
    print 'Common variables: '
    print colcom
    erf_menage.reset_index(inplace = True)
    simu_aggr_tables.reset_index(inplace = True)
    for col in colcom:
        temp = set(erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
        print "Numbers of idmen that aren't equal on variable %s : %s \n" %(col, str(len(temp)))
        del temp


    # Detect the biggest differences
    bigtable = merge(erf_menage, simu_aggr_tables, on = 'idmen', how = 'inner', suffixes=('_erf','_of'))
    print 'Length of new dataframe is %s' %str(len(bigtable))
    #print bigtable.columns
    bigtable.set_index('idmen', drop = False, inplace = True)

    already_met = []
    options_met = []

    for col in colcom:
        bigtemp = None
        table = bigtable[and_(bigtable[col+'_erf']!=0,bigtable[col+'_of']!=0)]
        table[col] = (table[col+'_erf'] - table[col+'_of']) / table[col+'_erf'] #Difference relative
        table[col] = table[col].apply(lambda x: abs(x))
        print 'Minimum difference between the two tables for %s is %s' %(col, str(table[col].min()))
        print 'Maximum difference between the two tables for %s is %s' %(col, str(table[col].max()))
        print table[col].describe()
        try:
            assert len(table[col]) == len(table['wprm_of']), "PINAGS"
            dec, values = mwp(table[col], np.arange(1,11), table['wprm_of'], 2, return_quantiles=True)
            #print sorted(values)
            dec, values = mwp(table[col], np.arange(1,101), table['wprm_erf'], 2, return_quantiles=True)
            #print sorted(values)[90:]
            del dec, values
            gc.collect()
        except:
            #print 'Weighted percentile method didnt work for %s' %col
            pass
        print "\n"

    # Show the relevant information for the most deviant households
        table.sort(columns = col, ascending = False, inplace = True)
        #print table[col][0:10].to_string()
        if bigtemp is None:
            bigtemp = {'table' : table[[col, col+'_of', col+'_erf', 'idmen']][0:10],
                       'options' : None}
        bigtemp['table'][col+'div'] = bigtemp['table'][col+'_of'] / bigtemp['table'][col+'_erf']
        print bigtemp['table'].to_string()

        '''
        bigtemp is the table which will get filled little by little by the relevant variables.
        Up to the last rows of code 'table' refers to a table of aggregated values,
        while 'options is a table of individual variables.
        The reason we call it in a dictionnary is also because we modify it inside the recursive function 'iter_on parents',
        and it causes an error in Python unless for certain types like dictionnary values.
        '''
        #print "\n"

        # If variable is a Prestation, we show the dependancies
        varcol = simulation.output_table.column_by_name.get(col)
        if isinstance(varcol, Prestation):

            '''
            For the direct children
            '''
            if not varcol._children is None:
                ch_to_fetch = list(varcol._children)
                ch_to_fetch = map(lambda x: x.name, ch_to_fetch)
                ch_fetched = []

                if set(ch_to_fetch) <= set(simu_aggr_tables.columns):
                    print "Variables which need %s to be computed :\n %s \n" %(col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var + '_of' in table.columns:
                            ch_fetched.append(var + '_of')
                        else:
                            ch_fetched.append(var)
                elif set(ch_to_fetch) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                    print "Variables which need %s to be computed (some missing picked in erf):\n %s \n" %(col, str(ch_to_fetch))
                    for var in ch_to_fetch:
                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')
                        else:
                            ch_fetched.append(var)
                else:
                    print "Variables which need %s to be computed (some missing):\n %s \n" %(col, str(ch_to_fetch))
                    for var in ch_to_fetch:

                        if var in simu_aggr_tables.columns:
                            if var + '_of' in table.columns:
                                ch_fetched.append(var + '_of')
                        elif var in erf_menage.columns:
                            if var + '_erf' in table.columns:
                                ch_fetched.append(var + '_erf')

                print table[[col] + ch_fetched][0:10]
                print "\n"
                del ch_to_fetch, ch_fetched

            '''
            For the parents
            '''
            def iter_on_parents(varcol):
                if (varcol._parents == set() and varcol._option == {}) or varcol.name in already_met:
                    return
                else:
                    par_to_fetch = list(varcol._parents)
                    par_to_fetch = map(lambda x: x.name, par_to_fetch)
                    par_fetched = []

                    if set(par_fetched) <= set(simu_aggr_tables.columns):
                        #print "Variables the prestation %s depends of :\n %s \n" %(varcol.name, str(par_fetched))
                        for var in par_fetched:
                            if var + '_of' in table.columns:
                                par_fetched.append(var + '_of')
                            else:
                                par_fetched.append(var)
                    elif set(par_fetched) <= set(simu_aggr_tables.columns).union(erf_menage.columns):
                        #print "Variables the prestation %s depends of (some missing picked in erf):\n %s \n" %(varcol.name,str(par_fetched))
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var + '_erf' in table.columns:
                                par_fetched.append(var + '_erf')
                            else:
                                par_fetched.append(var)
                    else:
                        for var in par_fetched:
                            if var in simu_aggr_tables.columns:
                                if var + '_of' in table.columns:
                                    par_fetched.append(var + '_of')
                            elif var in erf_menage.columns:
                                if var + '_erf' in table.columns:
                                    par_fetched.append(var + '_erf')
                        if len(par_fetched) > 0:
                            #print "Variables the prestation %s depends of (some missing):\n %s \n" %(varcol.name, str(par_fetched))
                            pass
                        else:
                            #print "Variables the prestation %s depends of couldn't be found :\n %s \n" %(varcol.name, str(par_fetched))
                            pass

                    if len(par_fetched) > 0:
                        temp = table[[col, 'idmen'] + par_fetched][0:10]
                        bigtemp['table'] = pd.merge(temp, bigtemp['table'], how = 'inner')
                        #print temp.to_string(), "\n"
                    if varcol._option != {} and not set(varcol._option.keys()) < set(options_met):
                        vars_to_fetch = list(set(varcol._option.keys())-set(options_met))
                        #print "and the options to current variable %s for the id's with strongest difference :\n %s \n" %(varcol.name, varcol._option.keys())
                        liste = [i for i in range(0,10)]
                        liste = map(lambda x: table['idmen'].iloc[x], liste)
                        temp = simu_nonaggr_tables[['idmen', 'quimen','noindiv']
                                                  + vars_to_fetch][simu_nonaggr_tables['idmen'].isin(table['idmen'][0:10])]

                        temp_sorted = temp[temp['idmen'] == liste[0]]
                        for i in xrange(1,10):
                            temp_sorted = temp_sorted.append(temp[temp['idmen'] == liste[i]])
                        if bigtemp['options'] is None:
                            bigtemp['options'] = temp_sorted
                            bigtemp['options'] = bigtemp['options'].merge(erf_eec_indivi, on = 'noindiv', how = 'outer')
                        else:
                            bigtemp['options'] = bigtemp['options'].merge(temp_sorted, on = ['noindiv','idmen','quimen'], how = 'outer')
#                         temp_sorted.set_index(['idmen',  'quimen'], drop = True, inplace = True) # If we do that
                        del temp, temp_sorted
                        gc.collect()

                    already_met.append(varcol.name)
                    options_met.extend(varcol._option.keys())
                    for var in varcol._parents:
                        iter_on_parents(var)

            iter_on_parents(varcol)
            # We merge the aggregate table with the option table ( for each individual in entity )
            bigtemp['table'] = bigtemp['table'].merge(bigtemp['options'],
                                                       how = 'left',
                                                        on = 'idmen',
                                                         suffixes = ('(agg)', '(ind)'))

            # Reshaping the table to group by descending error on col, common entities
            bigtemp['table'].sort(columns = ['af','quimen'], ascending = [False,True], inplace = True)
            bigtemp['table'] = bigtemp['table'].groupby(['idmen','quimen'], sort = False).sum()
            print "Table of values for %s dependencies : \n" %col
            print bigtemp['table'].to_string()
            del bigtemp['table'], bigtemp['options']
            gc.collect()
def create_imput_loyer(year):
    '''
    Impute les loyers à partir de ???
    '''

    #Variables used for imputation
    df = DataCollection(year=year)
    print 'Démarrer 02_imput_loyer'

    menm_vars = [
        "ztsam", "zperm", "zragm", "zricm", "zrncm", "zracm", "nb_uci", "wprm",
        "so", "nbpiec", "typmen5", "spr", "nbenfc", "agpr", "cstotpr",
        "nat28pr", "tu99", "aai1", 'ident', "pol99", "reg", "tau99"
    ]
    if year == 2008:  # Tau99 not present
        menm_vars = menm_vars.pop('tau99')

    indm_vars = ["noi", 'ident', "lpr", "dip11"]
    LgtAdrVars = ["gzc2"]
    LgtMenVars = [
        "sec1", "mrcho", "mrret", "mrsal", "mrtns", "mdiplo", "mtybd", "magtr",
        "mcs8", "maa1at", "qex", "muc1"
    ]

    if year == 2003:
        LgtMenVars.extend(["typse", "lmlm", "hnph2", "mnatior", "ident"])
        LgtAdrVars.extend(["iaat", "tu99", "ident"])
    if year < 2010 and year > 2005:
        LgtMenVars.extend(["mnatio", "idlog"])
        LgtAdrVars.extend(["idlog"])  # pas de typse en 2006
        LgtLgtVars = ["lmlm", "iaat", "tu99", "hnph2",
                      "idlog"]  # pas de typse en 2006

    ## Travail sur la base ERF
    #Preparing ERF menages tables


#     print show_temp()
# TODO : data.get_values
    erfmenm = load_temp(name="menagem", year=year)
    #     erfmenm = df.get_values(table="erf_menage",variables=menm_vars)
    erfmenm['revtot'] = (erfmenm['ztsam'] + erfmenm['zperm'] +
                         erfmenm['zragm'] + erfmenm['zricm'] +
                         erfmenm['zrncm'] + erfmenm['zracm'])
    erfmenm['nvpr'] = erfmenm['revtot'].astype(
        np.float64) / erfmenm['nb_uci'].astype(np.float64)
    # On donne la valeur 0 aux nvpr négatifs
    tmp = np.zeros(erfmenm['nvpr'].shape, dtype=int)
    erfmenm['nvpr'] = max_(tmp, erfmenm['nvpr'])
    for v in erfmenm['nvpr']:  # On vérifie qu'il n'y a plus de nvpr négatifs
        assert v >= 0, Exception('Some nvpr are negatives')
    erfmenm['logt'] = erfmenm['so']
    l = erfmenm.columns.tolist()
    #     print l
    #Preparing ERF individuals table
    erfindm = load_temp(name="indivim", year=year)
    #     erfindm = df.get_values(table = "eec_indivi", variables = indm_vars)

    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm)
    #     erfindm['dip11'] = 99
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1]
    # erf <- merge(erfmenm, erfindm, by ="ident")
    print('merging erf menage and individu')
    erf = erfmenm.merge(erfindm, on='ident', how='inner')
    erf = erf.drop_duplicates('ident')

    # control(erf) La colonne existe mais est vide,
    # on a du confondre cette colonne avec dip11 ?

    dec, values = mark_weighted_percentiles(erf['nvpr'],
                                            arange(1, 11),
                                            erf['wprm'],
                                            2,
                                            return_quantiles=True)
    values.sort()
    erf['deci'] = (1 + (erf['nvpr'] > values[1]) + (erf['nvpr'] > values[2]) +
                   (erf['nvpr'] > values[3]) + (erf['nvpr'] > values[4]) +
                   (erf['nvpr'] > values[5]) + (erf['nvpr'] > values[6]) +
                   (erf['nvpr'] > values[7]) + (erf['nvpr'] > values[8]) +
                   (erf['nvpr'] > values[9]))
    # Problème : tous les individus sont soit dans le premier, soit dans le dernier décile. WTF
    assert_variable_inrange('deci', [1, 11], erf)
    count_NA('deci', erf)
    del dec, values
    gc.collect()

    #TODO: faire le lien avec men_vars, il manque "pol99","reg","tau99" et ici on a en plus logt, 'nvpr','revtot','dip11','deci'
    erf = erf[[
        'ident', 'ztsam', 'zperm', 'zragm', 'zricm', 'zrncm', 'zracm',
        'nb_uci', 'logt', 'nbpiec', 'typmen5', 'spr', 'nbenfc', 'agpr',
        'cstotpr', 'nat28pr', 'tu99', 'aai1', 'wprm', 'nvpr', 'revtot',
        'dip11', 'deci'
    ]][erf['so'].isin(range(3, 6))]

    erf.rename(columns={
        'nbpiec': 'hnph2',
        'nat28pr': 'mnatio',
        'aai1': 'iaat',
        'dip11': 'mdiplo'
    },
               inplace=True)

    # TODO: ne traite pas les types comme dans R teste-les pour voir comment pandas les gère

    count_NA('agpr', erf)
    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: moche, pourquoi créer deux variables quand une suffit ?
    erf['tmp'] = 3
    erf['tmp'][erf['agpr'] < 65] = 2
    erf['tmp'][erf['agpr'] < 40] = 1
    erf['magtr'] = erf['tmp']
    count_NA('magtr', erf)
    assert_variable_inrange('magtr', [1, 4], erf)

    count_NA('cstotpr', erf)
    erf['tmp'] = erf['cstotpr'].astype('float') / 10.0
    erf['tmp'] = map(math.floor, erf['tmp'])
    erf['mcs8'] = erf['tmp']
    erf['mcs8'][erf['mcs8'] == 0] = NaN
    # assert isinstance(erf['mcs8'], (int, long)).all(), Exception('Some mcs8 are not integers')
    count_NA('mcs8', erf)

    # TODO il reste 41 NA's 2003
    erf['mtybd'] = NaN
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] != 2)] = 1
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] == 2)] = 2
    erf['mtybd'][erf['typmen5'] == 5] = 3
    erf['mtybd'][erf['typmen5'] == 3] = 7
    erf['mtybd'][erf['nbenfc'] == 1] = 4
    erf['mtybd'][erf['nbenfc'] == 2] = 5
    erf['mtybd'][erf['nbenfc'] >= 3] = 6
    count_NA('mtybd', erf)

    #     print erf['mtybd'].dtype.fields
    #assert_variable_inrange('mtybd', [1,7], erf) # bug, on trouve 7.0 qui fait assert

    # TODO : 3 logements ont 0 pièces !!
    erf['hnph2'][erf['hnph2'] < 1] = 1
    erf['hnph2'][erf['hnph2'] >= 6] = 6
    count_NA('hnph2', erf)
    assert_variable_inrange('hnph2', [1, 7], erf)

    # # TODO: il reste un NA 2003
    # #       il rest un NA en 2008

    tmp = erf['mnatio']
    tmp[erf['mnatio'] == 10] = 1
    tmp[erf['mnatio'].isin([
        11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 41, 42,
        43, 44, 45, 46, 47, 48, 51, 52, 62, 60
    ])] = 2
    erf['mnatio'] = tmp
    count_NA('mnatio', erf)
    assert_variable_inrange('mnatio', [1, 3], erf)

    tmp = erf['iaat']
    tmp[erf['mnatio'].isin([1, 2, 3])] = 1
    tmp[erf['mnatio'] == 4] = 2
    tmp[erf['mnatio'] == 5] = 3
    tmp[erf['mnatio'] == 6] = 4
    tmp[erf['mnatio'] == 7] = 5
    tmp[erf['mnatio'] == 8] = 6
    erf['iaat'] = tmp
    count_NA('iaat', erf)
    assert_variable_inrange('iaat', [1, 7], erf)

    # # Il reste un NA en 2003
    # #    reste un NA en 2008
    # table(erf$iaat, useNA="ifany")
    # TODO: comparer logement et erf pour ?tre sur que cela colle

    tmp = erf['mdiplo']
    tmp[erf['mdiplo'].isin([71, ""])] = 1
    tmp[erf['mdiplo'].isin([70, 60, 50])] = 2
    tmp[erf['mdiplo'].isin([41, 42, 31, 33])] = 3
    tmp[erf['mdiplo'].isin([10, 11, 30])] = 4
    erf['mdiplo'] = tmp
    count_NA('mdiplo', erf)
    #assert_variable_inrange('mdiplo', [1,5], erf) # On a un 99 qui se balade

    tmp = erf['tu99']
    tmp[erf['tu99'] == 0] = 1
    tmp[erf['tu99'].isin([1, 2, 3])] = 2
    tmp[erf['tu99'].isin([4, 5, 6])] = 3
    tmp[erf['tu99'] == 7] = 4
    tmp[erf['tu99'] == 8] = 5
    erf['tu99_recoded'] = tmp
    count_NA('tu99_recoded', erf)
    assert_variable_inrange('tu99_recoded', [1, 6], erf)

    # TODO : 0 ? Rajouetr 2003 !
    tmp = erf['mcs8']
    tmp[erf['mcs8'] == 1] = 1
    tmp[erf['mcs8'] == 2] = 2
    tmp[erf['mcs8'] == 3] = 3
    tmp[erf['mcs8'].isin([4, 8])] = 4
    tmp[erf['mcs8'].isin([5, 6, 7])] = 5
    erf['mcs8'] = tmp
    count_NA('mcs8', erf)
    assert_variable_inrange('mcs8', [1, 6], erf)

    erf['wprm'] = erf['wprm'].astype('int64')
    count_NA('wprm', erf)

    del (erf['cstotpr'], erf['agpr'], erf['typmen5'], erf['nbenfc'],
         erf['spr'], erf['tmp'], erf['tu99'])
    gc.collect()

    erf = erf.dropna(subset=[
        'logt', 'magtr', 'mcs8', 'mtybd', 'hnph2', 'mnatio', 'iaat', 'mdiplo',
        'tu99_recoded'
    ])
    #On vérifie au final que l'on n'a pas de doublons d'individus
    assert erf['ident'].value_counts().max() == 1, Exception(
        'Number of distinct individuals after removing duplicates is not correct'
    )

    ## Travail sur la table logement

    # Table menage
    if year == 2003:
        year_lgt = 2003
    if year > 2005 and year < 2010:
        year_lgt = 2006

    print "preparing logement menage table"

    #     Lgtmen = load_temp(name = "indivim",year = year) # Je rajoute une étape bidon
    Lgtmen = df.get_values(table="lgt_menage", variables=LgtMenVars)
    Lgtmen.rename(columns={'idlog': 'ident'}, inplace=True)

    count_NA('mrcho', Lgtmen)
    Lgtmen['mrcho'].fillna(0, inplace=True)
    Lgtmen['mrret'].fillna(0, inplace=True)
    Lgtmen['mrsal'].fillna(0, inplace=True)
    Lgtmen['mrtns'].fillna(0, inplace=True)
    count_NA('mrcho', Lgtmen)
    Lgtmen['revtot'] = Lgtmen['mrcho'] + Lgtmen['mrret'] + Lgtmen[
        'mrsal'] + Lgtmen['mrtns']  # Virer les revenus négatifs ?
    count_NA('revtot', Lgtmen)
    Lgtmen['nvpr'] = 10.0 * Lgtmen['revtot'] / Lgtmen['muc1']

    count_NA('qex', Lgtmen)
    dec, values = mark_weighted_percentiles(Lgtmen['nvpr'],
                                            arange(1, 11),
                                            Lgtmen['qex'],
                                            2,
                                            return_quantiles=True)
    values.sort()
    Lgtmen['deci'] = (
        1 + (Lgtmen['nvpr'] > values[1]) + (Lgtmen['nvpr'] > values[2]) +
        (Lgtmen['nvpr'] > values[3]) + (Lgtmen['nvpr'] > values[4]) +
        (Lgtmen['nvpr'] > values[5]) + (Lgtmen['nvpr'] > values[6]) +
        (Lgtmen['nvpr'] > values[7]) + (Lgtmen['nvpr'] > values[8]) +
        (Lgtmen['nvpr'] > values[9]))
    del dec, values
    print Lgtmen['deci'].describe()
    gc.collect()

    ##Table logement (pas en 2003 mais en 2006)
    # str(lgtmen)
    # if (year_lgt=="2006"){
    #   message("preparing logement logement table")
    #   lgtlgt <- LoadIn(lgtLgtFil,lgtLgtVars)
    #   lgtlgt <- upData(lgtlgt, rename=renameidlgt)
    #   lgtmen <- merge(lgtmen, lgtlgt, by.x="ident", by.y="ident")

    if year_lgt == 2006:
        print 'preparing logement logement table'
        lgtlgt = df.get_values(table="lgt_logt", variables=LgtLgtVars)
        lgtlgt.rename(columns={'idlog': 'ident'}, inplace=True)
        Lgtmen = Lgtmen.merge(lgtlgt,
                              left_on='ident',
                              right_on='ident',
                              how='inner')
        del lgtlgt

    data = Lgtmen[Lgtmen['sec1'].isin([21, 22, 23, 24, 30])]
    del Lgtmen
    gc.collect()

    if year_lgt == 2006:
        data.rename(columns={'mnatio': 'mnatior'}, inplace=True)

    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    data['tmp'] = data['sec1'].astype(np.int64)
    data['tmp'][data['sec1'].isin([21, 22, 23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['logt'] = data['tmp']
    count_NA('logt', data)
    data = (data[data['logt'].notnull()])
    Lgtmen = data

    # ## Table adresse
    print "preparing logement adresse table"
    # lgtadr <- LoadIn(lgtAdrFil,lgtAdrVars)
    # lgtadr <- upData(lgtadr, rename=renameidlgt)
    # Je rajoute une étae bidon
    Lgtadr = df.get_values(table="adresse", variables=LgtAdrVars)
    Lgtadr.rename(columns={'idlog': 'ident'}, inplace=True)

    print('Merging logement and menage tables')
    Logement = Lgtmen.merge(Lgtadr, on='ident', how='inner')
    #     control(Logement) # Pas de idfoy, etc. dans la table logement ?

    Logement['hnph2'][Logement['hnph2'] >= 6] = 6
    Logement['hnph2'][Logement['hnph2'] < 1] = 1
    count_NA('hnph2', Logement)
    assert not Logement['hnph2'].isnull().any(), "Some hnph2 are null"
    #     Logement=(Logement[Logement['hnph2'].notnull()]) # Mis en comment car 0 NA pour hnph2

    # On est dans la même étape within ici et par la suite ( cf code R )
    # TODO : ici problème je transforme les 07 en 7
    # car Python considère les 0n comme des nombres octaux ( < 08 ).
    # J'espère que ce n'est pas important.
    Logement['tmp'] = Logement['mnatior']
    Logement['tmp'][Logement['mnatior'].isin([0, 1])] = 1
    Logement['tmp'][Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10,
                                              11])] = 2
    Logement['mnatior'] = Logement['tmp']
    count_NA('mnatior', Logement)
    assert_variable_inrange('mnatior', [1, 3], Logement)

    Logement['tmp'] = Logement['iaat']
    Logement['tmp'][Logement['iaat'].isin([1, 2, 3, 4, 5])] = 1
    Logement['tmp'][Logement['iaat'] == 6] = 2
    Logement['tmp'][Logement['iaat'] == 7] = 3
    Logement['tmp'][Logement['iaat'] == 8] = 4
    Logement['tmp'][Logement['iaat'] == 9] = 5
    Logement['tmp'][Logement['iaat'] ==
                    10] = 6  # TODO question Clément : et le 9 et le 10 ?
    Logement['iaat'] = Logement['tmp']
    count_NA('iaat', Logement)
    assert_variable_inrange('iaat', [1, 7], Logement)

    Logement['tmp'] = Logement['mdiplo']
    Logement['tmp'][Logement['mdiplo'] == 1] = 1
    Logement['tmp'][Logement['mdiplo'].isin([2, 3, 4])] = 2
    Logement['tmp'][Logement['mdiplo'].isin([5, 6, 7, 8])] = 3
    Logement['tmp'][Logement['mdiplo'] == 9] = 4
    Logement['mdiplo'] = Logement['tmp']
    count_NA('mdiplo', Logement)
    assert_variable_inrange('mdiplo', [1, 5], Logement)

    Logement['tmp'] = Logement['mtybd']
    Logement['tmp'][Logement['mtybd'] == 110] = 1
    Logement['tmp'][Logement['mtybd'] == 120] = 2
    Logement['tmp'][Logement['mtybd'] == 200] = 3
    Logement['tmp'][Logement['mtybd'].isin([311, 321, 401])] = 4
    Logement['tmp'][Logement['mtybd'].isin([312, 322, 402])] = 5
    Logement['tmp'][Logement['mtybd'].isin([313, 323, 403])] = 6
    Logement['tmp'][Logement['mtybd'] == 400] = 7
    Logement['mtybd'] = Logement['tmp']
    count_NA('mtybd', Logement)
    assert_variable_inrange('mtybd', [1, 8], Logement)

    Logement['tmp'] = Logement['tu99']
    count_NA('tu99', Logement)
    Logement['tmp'][Logement['tu99'] == 0] = 1
    Logement['tmp'][Logement['tu99'].isin([1, 2, 3])] = 2
    Logement['tmp'][Logement['tu99'].isin([4, 5, 6])] = 3
    Logement['tmp'][Logement['tu99'] == 7] = 4
    Logement['tmp'][Logement['tu99'] == 8] = 5
    Logement['tu99_recoded'] = Logement['tmp']
    count_NA('tu99_recoded', Logement)
    assert_variable_inrange('tu99_recoded', [1, 6], Logement)

    Logement['tmp'] = Logement['gzc2']
    Logement['tmp'][Logement['gzc2'] == 1] = 1
    Logement['tmp'][Logement['gzc2'].isin([2, 3, 4, 5, 6])] = 2
    Logement['tmp'][Logement['gzc2'] == 7] = 3
    Logement['gzc2'] = Logement['tmp']
    count_NA('gzc2', Logement)
    assert_variable_inrange('gzc2', [1, 4], Logement)

    Logement['tmp'] = Logement['magtr']
    Logement['tmp'][Logement['magtr'].isin([1, 2])] = 1
    Logement['tmp'][Logement['magtr'].isin([3, 4])] = 2
    Logement['tmp'][Logement['magtr'] == 5] = 3
    Logement['magtr'] = Logement['tmp']
    count_NA('magtr', Logement)
    assert_variable_inrange('magtr', [1, 4], Logement)

    Logement['tmp'] = Logement['mcs8']
    Logement['tmp'][Logement['mcs8'] == 1] = 1
    Logement['tmp'][Logement['mcs8'] == 2] = 2
    Logement['tmp'][Logement['mcs8'] == 3] = 3
    Logement['tmp'][Logement['mcs8'].isin([4, 8])] = 4
    Logement['tmp'][Logement['mcs8'].isin([5, 6, 7])] = 5
    Logement['mcs8'] = Logement['tmp']
    count_NA('mcs8', Logement)
    assert_variable_inrange('mcs8', [1, 6], Logement)

    Logement['logloy'] = Logement['lmlm'].apply(lambda x: math.log(x))

    Logement = (Logement[Logement['mdiplo'].notnull()])
    Logement = (Logement[Logement['mtybd'].notnull()])
    Logement = (Logement[Logement['magtr'].notnull()])
    Logement = (Logement[Logement['mcs8'].notnull()])
    Logement = (Logement[Logement['maa1at'].notnull()])

    ## Imputation des loyers proprement dite

    # library(StatMatch) # loads StatMatch
    # # library(mice) use md.pattern to locate missing data
    # TODO : à supprimer ?

    # logt <- subset(logement,select=c(lmlm,logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, ident))
    # logt$wprm <- logement$qex
    # erf <- subset(erf,select=c( logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, wprm, ident))
    print('Compute imputed rents')
    Logt = Logement[[
        'lmlm', 'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded',
        'magtr', 'mcs8', 'deci', 'ident'
    ]]
    Logt['wprm'] = Logement['qex']
    erf = erf[[
        'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr',
        'mcs8', 'deci', 'wprm', 'ident'
    ]]

    # # debug
    # # derf  <- describe(erf, weights=as.numeric(erf$wprm))
    # # dlogt <- describe(logt, weights=logt$wprm)
    # #
    # # for (var in as.list(names(derf))){
    # #   print("erf")
    # #   print(derf[[var]])
    # #   print("logt")
    # #   print(dlogt[[var]])
    # #   print("================")
    # # }

    # TODO add md.pattern

    # erf1 <- na.omit(erf)
    # logt <- na.omit(logt)
    from pandas import DataFrame
    erf = erf.dropna(
        how='any'
    )  # Si j'ai bien compris ce que l'on fait en R : dropper les lignes avec des NA
    #erf1 = erf # A-t-on toujours besoin de changer le nom du coup ?
    Logt = Logt.dropna(how='any')

    # allvars <- c("logt", "hnph2", "iaat", "mdiplo", "mtybd", "tu99_recoded", "magtr", "mcs8", "deci")
    # classes <- c("magtr","tu99_recoded")
    # matchvars <- setdiff(allvars,classes)
    allvars = [
        'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr',
        'mcs8', 'deci'
    ]
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars) - set(classes))
    erf['mcs8'] = erf['mcs8'].astype(int)
    # out.nnd <- NND.hotdeck(data.rec=erf1,data.don=logt,match.vars=matchvars,don.class=classes,gdist.fun="Gower")
    # fill.erf.nnd <- create.fused(data.rec=erf1, data.don=logt,mtc.ids=out.nnd$mtc.ids, z.vars="lmlm")
    from rpy2.robjects.packages import importr
    import rpy2.robjects.pandas2ri
    import rpy2.robjects.vectors as vectors
    rpy2.robjects.pandas2ri.activate(
    )  # Permet à rpy2 de convertir les dataframes
    sm = importr(
        "StatMatch")  #, lib_loc = "C:\Program Files\R\R-2.15.2\library")
    print 'TEST 2'
    out_nnd = sm.NND_hotdeck(data_rec=erf,
                             data_don=Logt,
                             match_vars=vectors.StrVector(matchvars),
                             don_class=vectors.StrVector(classes),
                             dist_fun="Gower")
    print 'TEST 3'
    fill_erf_nnd = sm.create_fused(data_rec=erf,
                                   data_don=Logt,
                                   mtc_ids=out_nnd[0],
                                   z_vars=vectors.StrVector(["lmlm"]))
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    # fill.erf.nnd <- upData(fill.erf.nnd, rename=c(lmlm='loym'))
    import pandas.rpy.common as com
    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    (fill_erf_nnd).rename(columns={'lmlm': 'loym'}, inplace=True)

    # loy_imput = fill.erf.nnd[c('ident','loym')]
    loy_imput = (fill_erf_nnd)[['ident', 'loym']]

    # load(menm)
    # menagem$loym <- NULL
    # menagem <- merge(menagem,loy_imput,by='ident',all.x = TRUE)
    # save(menagem,file=menm)
    #     Mis en comment block, car à manipuler avec précaution je suppose ( ne souhaite pas faire de conneries )

    erfmenm = load_temp(name="menagem", year=year)
    #     del erfmenm['loym']
    erfmenm = erfmenm.merge(loy_imput, on='ident', how='left')
    assert 'loym' in erfmenm.columns, 'No loym in erfmenm columns'
    save_temp(erfmenm, name="menagem", year=year)
def create_imput_loyer(year):
    '''
    Impute les loyers à partir de ???
    '''

    #Variables used for imputation
    df = DataCollection(year=year)
    print 'Démarrer 02_imput_loyer'

    menm_vars = ["ztsam","zperm","zragm","zricm","zrncm","zracm","nb_uci","wprm",
             "so","nbpiec","typmen5","spr","nbenfc","agpr","cstotpr","nat28pr","tu99","aai1",'ident',"pol99","reg","tau99"]
    if year == 2008: # Tau99 not present
        menm_vars = menm_vars.pop('tau99')


    indm_vars = ["noi",'ident',"lpr","dip11"]
    LgtAdrVars = ["gzc2"]
    LgtMenVars = ["sec1","mrcho","mrret","mrsal","mrtns","mdiplo","mtybd","magtr","mcs8","maa1at","qex","muc1"]

    if year == 2003:
        LgtMenVars.extend(["typse","lmlm","hnph2","mnatior","ident"])
        LgtAdrVars.extend(["iaat","tu99","ident"])
    if year < 2010 and year > 2005:
        LgtMenVars.extend(["mnatio","idlog"])
        LgtAdrVars.extend(["idlog"]) # pas de typse en 2006
        LgtLgtVars=["lmlm","iaat","tu99","hnph2","idlog"] # pas de typse en 2006

    ## Travail sur la base ERF
    #Preparing ERF menages tables
#     print show_temp()
    # TODO : data.get_values
    erfmenm = load_temp(name="menagem", year=year)
#     erfmenm = df.get_values(table="erf_menage",variables=menm_vars)
    erfmenm['revtot'] = (erfmenm['ztsam'] + erfmenm['zperm'] + erfmenm['zragm'] +
                         erfmenm['zricm'] + erfmenm['zrncm'] + erfmenm['zracm'])
    erfmenm['nvpr'] = erfmenm['revtot'].astype(np.float64) / erfmenm['nb_uci'].astype(np.float64)
    # On donne la valeur 0 aux nvpr négatifs
    tmp = np.zeros(erfmenm['nvpr'].shape, dtype = int)
    erfmenm['nvpr'] = max_(tmp, erfmenm['nvpr'])
    for v in erfmenm['nvpr']: # On vérifie qu'il n'y a plus de nvpr négatifs
        assert v >= 0, Exception('Some nvpr are negatives')
    erfmenm['logt'] = erfmenm['so']
    l = erfmenm.columns.tolist()
#     print l
    #Preparing ERF individuals table
    erfindm = load_temp(name = "indivim",year=year)
#     erfindm = df.get_values(table = "eec_indivi", variables = indm_vars)

    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm)
#     erfindm['dip11'] = 99
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1]
# erf <- merge(erfmenm, erfindm, by ="ident")
    print('merging erf menage and individu')
    erf = erfmenm.merge(erfindm, on ='ident', how='inner')
    erf=erf.drop_duplicates('ident')

    # control(erf) La colonne existe mais est vide,
    # on a du confondre cette colonne avec dip11 ?

    dec, values = mark_weighted_percentiles(erf['nvpr'], arange(1,11), erf['wprm'], 2, return_quantiles=True)
    values.sort()
    erf['deci'] = (1 + (erf['nvpr']>values[1]) + (erf['nvpr']>values[2]) + (erf['nvpr']>values[3])
                   + (erf['nvpr']>values[4]) + (erf['nvpr']>values[5]) + (erf['nvpr']>values[6])
                   + (erf['nvpr']>values[7]) + (erf['nvpr']>values[8]) + (erf['nvpr']>values[9]))
    # Problème : tous les individus sont soit dans le premier, soit dans le dernier décile. WTF
    assert_variable_inrange('deci',[1,11], erf)
    count_NA('deci',erf)
    del dec, values
    gc.collect()

    #TODO: faire le lien avec men_vars, il manque "pol99","reg","tau99" et ici on a en plus logt, 'nvpr','revtot','dip11','deci'
    erf = erf[['ident','ztsam','zperm','zragm','zricm','zrncm','zracm',
                 'nb_uci', 'logt' ,'nbpiec','typmen5','spr','nbenfc','agpr','cstotpr',
                 'nat28pr','tu99','aai1','wprm', 'nvpr','revtot','dip11','deci']][erf['so'].isin(range(3,6))]

    erf.rename(columns = {'nbpiec':'hnph2','nat28pr':'mnatio','aai1':'iaat','dip11':'mdiplo'}, inplace = True)

    # TODO: ne traite pas les types comme dans R teste-les pour voir comment pandas les gère

    count_NA('agpr', erf)
    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: moche, pourquoi créer deux variables quand une suffit ?
    erf['tmp'] = 3
    erf['tmp'][erf['agpr'] < 65] = 2
    erf['tmp'][erf['agpr'] < 40] = 1
    erf['magtr'] = erf['tmp']
    count_NA('magtr',erf)
    assert_variable_inrange('magtr',[1,4],erf)

    count_NA('cstotpr',erf)
    erf['tmp'] = erf['cstotpr'].astype('float')/10.0
    erf['tmp']=map(math.floor, erf['tmp'])
    erf['mcs8'] = erf['tmp']
    erf['mcs8'][erf['mcs8'] == 0] = NaN
    # assert isinstance(erf['mcs8'], (int, long)).all(), Exception('Some mcs8 are not integers')
    count_NA('mcs8',erf)

    # TODO il reste 41 NA's 2003
    erf['mtybd'] = NaN
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] != 2)] = 1
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] == 2)] = 2
    erf['mtybd'][erf['typmen5'] == 5] = 3
    erf['mtybd'][erf['typmen5'] == 3] = 7
    erf['mtybd'][erf['nbenfc'] == 1] = 4
    erf['mtybd'][erf['nbenfc'] == 2] = 5
    erf['mtybd'][erf['nbenfc'] >= 3] = 6
    count_NA('mtybd',erf)

#     print erf['mtybd'].dtype.fields
    #assert_variable_inrange('mtybd', [1,7], erf) # bug, on trouve 7.0 qui fait assert

    # TODO : 3 logements ont 0 pièces !!
    erf['hnph2'][erf['hnph2'] < 1] = 1
    erf['hnph2'][erf['hnph2'] >= 6] = 6
    count_NA('hnph2', erf)
    assert_variable_inrange('hnph2', [1,7], erf)

# # TODO: il reste un NA 2003
# #       il rest un NA en 2008

    tmp = erf['mnatio']
    tmp[erf['mnatio'] == 10] = 1
    tmp[erf['mnatio'].isin([11,12,13,14,15,21,22,23,24,25,26,27,28,29,31,32,41,42,43,44,45,46,47,48,51,52,62,60])] = 2
    erf['mnatio'] = tmp
    count_NA('mnatio', erf)
    assert_variable_inrange('mnatio', [1,3], erf)

    tmp = erf['iaat']
    tmp[erf['mnatio'].isin([1,2,3])] = 1
    tmp[erf['mnatio'] == 4] = 2
    tmp[erf['mnatio'] == 5] = 3
    tmp[erf['mnatio'] == 6] = 4
    tmp[erf['mnatio'] == 7] = 5
    tmp[erf['mnatio'] == 8] = 6
    erf['iaat'] = tmp
    count_NA('iaat', erf)
    assert_variable_inrange('iaat', [1,7], erf)

# # Il reste un NA en 2003
# #    reste un NA en 2008
# table(erf$iaat, useNA="ifany")
    # TODO: comparer logement et erf pour ?tre sur que cela colle

    tmp = erf['mdiplo']
    tmp[erf['mdiplo'].isin([71,""])] = 1
    tmp[erf['mdiplo'].isin([70,60,50])] = 2
    tmp[erf['mdiplo'].isin([41,42,31,33])] = 3
    tmp[erf['mdiplo'].isin([10,11,30])] = 4
    erf['mdiplo'] = tmp
    count_NA('mdiplo', erf)
    #assert_variable_inrange('mdiplo', [1,5], erf) # On a un 99 qui se balade

    tmp = erf['tu99']
    tmp[erf['tu99'] == 0] = 1
    tmp[erf['tu99'].isin([1,2,3])] = 2
    tmp[erf['tu99'].isin([4,5,6])] = 3
    tmp[erf['tu99'] == 7] = 4
    tmp[erf['tu99'] == 8] = 5
    erf['tu99_recoded'] = tmp
    count_NA('tu99_recoded', erf)
    assert_variable_inrange('tu99_recoded', [1,6], erf)

    # TODO : 0 ? Rajouetr 2003 !
    tmp = erf['mcs8']
    tmp[erf['mcs8'] == 1] = 1
    tmp[erf['mcs8'] == 2] = 2
    tmp[erf['mcs8'] == 3] = 3
    tmp[erf['mcs8'].isin([4,8])] = 4
    tmp[erf['mcs8'].isin([5,6,7])] = 5
    erf['mcs8'] = tmp
    count_NA('mcs8', erf)
    assert_variable_inrange('mcs8', [1,6], erf)

    erf['wprm'] = erf['wprm'].astype('int64')
    count_NA('wprm', erf)

    del (erf['cstotpr'] ,erf['agpr'], erf['typmen5'],
    erf['nbenfc'], erf['spr'], erf['tmp'], erf['tu99'])
    gc.collect()


    erf = erf.dropna(subset=['logt','magtr','mcs8','mtybd','hnph2','mnatio','iaat','mdiplo','tu99_recoded'])
    #On vérifie au final que l'on n'a pas de doublons d'individus
    assert erf['ident'].value_counts().max() == 1, Exception('Number of distinct individuals after removing duplicates is not correct')


    ## Travail sur la table logement

    # Table menage
    if year == 2003:
        year_lgt = 2003
    if year > 2005 and year < 2010:
        year_lgt = 2006


    print "preparing logement menage table"

#     Lgtmen = load_temp(name = "indivim",year = year) # Je rajoute une étape bidon
    Lgtmen = df.get_values(table = "lgt_menage", variables = LgtMenVars)
    Lgtmen.rename(columns = {'idlog':'ident'}, inplace = True)

    count_NA('mrcho', Lgtmen)
    Lgtmen['mrcho'].fillna(0, inplace = True)
    Lgtmen['mrret'].fillna(0, inplace = True)
    Lgtmen['mrsal'].fillna(0, inplace = True)
    Lgtmen['mrtns'].fillna(0, inplace = True)
    count_NA('mrcho', Lgtmen)
    Lgtmen['revtot'] = Lgtmen['mrcho']+Lgtmen['mrret']+Lgtmen['mrsal']+Lgtmen['mrtns'] # Virer les revenus négatifs ?
    count_NA('revtot', Lgtmen)
    Lgtmen['nvpr']=10.0*Lgtmen['revtot']/Lgtmen['muc1']

    count_NA('qex', Lgtmen)
    dec, values = mark_weighted_percentiles(Lgtmen['nvpr'],arange(1,11), Lgtmen['qex'],2,return_quantiles=True)
    values.sort()
    Lgtmen['deci'] = (1+(Lgtmen['nvpr']>values[1])+(Lgtmen['nvpr']>values[2])+(Lgtmen['nvpr']>values[3])
                      +(Lgtmen['nvpr']>values[4])+(Lgtmen['nvpr']>values[5])+(Lgtmen['nvpr']>values[6])
                      +(Lgtmen['nvpr']>values[7])+(Lgtmen['nvpr']>values[8])+(Lgtmen['nvpr']>values[9]))
    del dec, values
    print Lgtmen['deci'].describe()
    gc.collect()

    ##Table logement (pas en 2003 mais en 2006)
# str(lgtmen)
# if (year_lgt=="2006"){
#   message("preparing logement logement table")
#   lgtlgt <- LoadIn(lgtLgtFil,lgtLgtVars)
#   lgtlgt <- upData(lgtlgt, rename=renameidlgt)
#   lgtmen <- merge(lgtmen, lgtlgt, by.x="ident", by.y="ident")

    if year_lgt == 2006:
        print 'preparing logement logement table'
        lgtlgt = df.get_values(table = "lgt_logt", variables = LgtLgtVars)
        lgtlgt.rename(columns = {'idlog':'ident'}, inplace = True)
        Lgtmen = Lgtmen.merge(lgtlgt, left_on = 'ident', right_on = 'ident', how = 'inner')
        del lgtlgt

    data = Lgtmen[Lgtmen['sec1'].isin([21,22,23,24,30])]
    del Lgtmen
    gc.collect()

    if year_lgt == 2006:
        data.rename(columns = {'mnatio':'mnatior'}, inplace = True)

    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    data['tmp'] = data['sec1'].astype(np.int64)
    data['tmp'][data['sec1'].isin([21,22,23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['logt'] = data['tmp']
    count_NA('logt', data)
    data = (data[data['logt'].notnull()])
    Lgtmen = data

# ## Table adresse
    print "preparing logement adresse table"
# lgtadr <- LoadIn(lgtAdrFil,lgtAdrVars)
# lgtadr <- upData(lgtadr, rename=renameidlgt)
    # Je rajoute une étae bidon
    Lgtadr = df.get_values(table = "adresse", variables = LgtAdrVars)
    Lgtadr.rename(columns = {'idlog':'ident'}, inplace = True)

    print('Merging logement and menage tables')
    Logement = Lgtmen.merge(Lgtadr, on = 'ident', how = 'inner')
#     control(Logement) # Pas de idfoy, etc. dans la table logement ?

    Logement['hnph2'][Logement['hnph2'] >= 6] = 6
    Logement['hnph2'][Logement['hnph2'] < 1] = 1
    count_NA('hnph2', Logement)
    assert not Logement['hnph2'].isnull().any(), "Some hnph2 are null"
#     Logement=(Logement[Logement['hnph2'].notnull()]) # Mis en comment car 0 NA pour hnph2

    # On est dans la même étape within ici et par la suite ( cf code R )
    # TODO : ici problème je transforme les 07 en 7
    # car Python considère les 0n comme des nombres octaux ( < 08 ).
    # J'espère que ce n'est pas important.
    Logement['tmp'] = Logement['mnatior']
    Logement['tmp'][Logement['mnatior'].isin([0, 1])] = 1
    Logement['tmp'][Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11])] = 2
    Logement['mnatior'] = Logement['tmp']
    count_NA('mnatior', Logement)
    assert_variable_inrange('mnatior', [1,3], Logement)

    Logement['tmp'] = Logement['iaat']
    Logement['tmp'][Logement['iaat'].isin([1,2,3,4,5])] = 1
    Logement['tmp'][Logement['iaat'] == 6] = 2
    Logement['tmp'][Logement['iaat'] == 7] = 3
    Logement['tmp'][Logement['iaat'] == 8] = 4
    Logement['tmp'][Logement['iaat'] == 9] = 5
    Logement['tmp'][Logement['iaat'] == 10] = 6 # TODO question Clément : et le 9 et le 10 ?
    Logement['iaat'] = Logement['tmp']
    count_NA('iaat', Logement)
    assert_variable_inrange('iaat', [1,7], Logement)

    Logement['tmp'] = Logement['mdiplo']
    Logement['tmp'][Logement['mdiplo'] == 1] = 1
    Logement['tmp'][Logement['mdiplo'].isin([2,3,4])] = 2
    Logement['tmp'][Logement['mdiplo'].isin([5,6,7,8])] = 3
    Logement['tmp'][Logement['mdiplo'] == 9] = 4
    Logement['mdiplo'] = Logement['tmp']
    count_NA('mdiplo', Logement)
    assert_variable_inrange('mdiplo', [1,5], Logement)

    Logement['tmp'] = Logement['mtybd']
    Logement['tmp'][Logement['mtybd'] == 110] = 1
    Logement['tmp'][Logement['mtybd'] == 120] = 2
    Logement['tmp'][Logement['mtybd'] == 200] = 3
    Logement['tmp'][Logement['mtybd'].isin([311,321,401])] = 4
    Logement['tmp'][Logement['mtybd'].isin([312,322,402])] = 5
    Logement['tmp'][Logement['mtybd'].isin([313,323,403])] = 6
    Logement['tmp'][Logement['mtybd'] == 400] = 7
    Logement['mtybd'] = Logement['tmp']
    count_NA('mtybd', Logement)
    assert_variable_inrange('mtybd', [1,8], Logement)

    Logement['tmp'] = Logement['tu99']
    count_NA('tu99', Logement)
    Logement['tmp'][Logement['tu99'] == 0] = 1
    Logement['tmp'][Logement['tu99'].isin([1,2,3])] = 2
    Logement['tmp'][Logement['tu99'].isin([4,5,6])] = 3
    Logement['tmp'][Logement['tu99'] == 7] = 4
    Logement['tmp'][Logement['tu99'] == 8] = 5
    Logement['tu99_recoded'] = Logement['tmp']
    count_NA('tu99_recoded', Logement)
    assert_variable_inrange('tu99_recoded', [1,6], Logement)

    Logement['tmp'] = Logement['gzc2']
    Logement['tmp'][Logement['gzc2'] == 1] = 1
    Logement['tmp'][Logement['gzc2'].isin([2,3,4,5,6])] = 2
    Logement['tmp'][Logement['gzc2'] == 7] = 3
    Logement['gzc2'] = Logement['tmp']
    count_NA('gzc2', Logement)
    assert_variable_inrange('gzc2', [1,4], Logement)

    Logement['tmp'] = Logement['magtr']
    Logement['tmp'][Logement['magtr'].isin([1,2])] = 1
    Logement['tmp'][Logement['magtr'].isin([3,4])] = 2
    Logement['tmp'][Logement['magtr'] == 5] = 3
    Logement['magtr'] = Logement['tmp']
    count_NA('magtr', Logement)
    assert_variable_inrange('magtr', [1,4], Logement)

    Logement['tmp'] = Logement['mcs8']
    Logement['tmp'][Logement['mcs8'] == 1] = 1
    Logement['tmp'][Logement['mcs8'] == 2] = 2
    Logement['tmp'][Logement['mcs8'] == 3] = 3
    Logement['tmp'][Logement['mcs8'].isin([4,8])] = 4
    Logement['tmp'][Logement['mcs8'].isin([5,6,7])] = 5
    Logement['mcs8'] = Logement['tmp']
    count_NA('mcs8', Logement)
    assert_variable_inrange('mcs8', [1,6], Logement)

    Logement['logloy'] = Logement['lmlm'].apply(lambda x: math.log(x))

    Logement=(Logement[Logement['mdiplo'].notnull()])
    Logement=(Logement[Logement['mtybd'].notnull()])
    Logement=(Logement[Logement['magtr'].notnull()])
    Logement=(Logement[Logement['mcs8'].notnull()])
    Logement=(Logement[Logement['maa1at'].notnull()])

    ## Imputation des loyers proprement dite

# library(StatMatch) # loads StatMatch
# # library(mice) use md.pattern to locate missing data
    # TODO : à supprimer ?

# logt <- subset(logement,select=c(lmlm,logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, ident))
# logt$wprm <- logement$qex
# erf <- subset(erf,select=c( logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, wprm, ident))
    print ('Compute imputed rents')
    Logt = Logement[['lmlm','logt' , 'hnph2' , 'iaat' , 'mdiplo' , 'mtybd' , 'tu99_recoded' , 'magtr' , 'mcs8' , 'deci', 'ident']]
    Logt['wprm'] = Logement['qex']
    erf = erf[['logt' , 'hnph2' , 'iaat' , 'mdiplo' , 'mtybd' , 'tu99_recoded' , 'magtr' , 'mcs8' , 'deci', 'wprm' , 'ident']]

# # debug
# # derf  <- describe(erf, weights=as.numeric(erf$wprm))
# # dlogt <- describe(logt, weights=logt$wprm)
# #
# # for (var in as.list(names(derf))){
# #   print("erf")
# #   print(derf[[var]])
# #   print("logt")
# #   print(dlogt[[var]])
# #   print("================")
# # }


    # TODO add md.pattern

# erf1 <- na.omit(erf)
# logt <- na.omit(logt)
    from pandas import DataFrame
    erf = erf.dropna(how = 'any') # Si j'ai bien compris ce que l'on fait en R : dropper les lignes avec des NA
    #erf1 = erf # A-t-on toujours besoin de changer le nom du coup ?
    Logt = Logt.dropna(how = 'any')

# allvars <- c("logt", "hnph2", "iaat", "mdiplo", "mtybd", "tu99_recoded", "magtr", "mcs8", "deci")
# classes <- c("magtr","tu99_recoded")
# matchvars <- setdiff(allvars,classes)
    allvars = ['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci']
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars)-set(classes))
    erf['mcs8'] = erf['mcs8'].astype(int)
# out.nnd <- NND.hotdeck(data.rec=erf1,data.don=logt,match.vars=matchvars,don.class=classes,gdist.fun="Gower")
# fill.erf.nnd <- create.fused(data.rec=erf1, data.don=logt,mtc.ids=out.nnd$mtc.ids, z.vars="lmlm")
    from rpy2.robjects.packages import importr
    import rpy2.robjects.pandas2ri
    import rpy2.robjects.vectors as vectors
    rpy2.robjects.pandas2ri.activate() # Permet à rpy2 de convertir les dataframes
    sm = importr("StatMatch")#, lib_loc = "C:\Program Files\R\R-2.15.2\library")
    print 'TEST 2'
    out_nnd = sm.NND_hotdeck(data_rec = erf,
                              data_don = Logt,
                               match_vars = vectors.StrVector(matchvars),
                                don_class = vectors.StrVector(classes),
                                 dist_fun = "Gower")
    print 'TEST 3'
    fill_erf_nnd = sm.create_fused(data_rec = erf,
                                    data_don = Logt,
                                     mtc_ids = out_nnd[0],
                                      z_vars = vectors.StrVector(["lmlm"]))
    del allvars, matchvars, classes, out_nnd
    gc.collect()

# fill.erf.nnd <- upData(fill.erf.nnd, rename=c(lmlm='loym'))
    import pandas.rpy.common as com
    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    (fill_erf_nnd).rename(columns={'lmlm':'loym'}, inplace = True)

# loy_imput = fill.erf.nnd[c('ident','loym')]
    loy_imput = (fill_erf_nnd)[['ident','loym']]

# load(menm)
# menagem$loym <- NULL
# menagem <- merge(menagem,loy_imput,by='ident',all.x = TRUE)
# save(menagem,file=menm)
#     Mis en comment block, car à manipuler avec précaution je suppose ( ne souhaite pas faire de conneries )

    erfmenm = load_temp(name="menagem", year=year)
#     del erfmenm['loym']
    erfmenm = erfmenm.merge(loy_imput,on='ident',how='left')
    assert 'loym' in erfmenm.columns, 'No loym in erfmenm columns'
    save_temp(erfmenm, name = "menagem", year=year)
Example #13
0
def check_converted():
    #Retrieving the input and output files for analysis :
    store = HDFStore(survey_test)
    input_df = store['survey_2006']

    output = HDFStore(survey3_test)

    df_fam = output['survey_2006/fam']
    df_foy = output['survey_2006/foy']
    df_men = output['survey_2006/men']
    df_ind = output['survey_2006/ind']

    year = 2006
    erf = DataCollection(year=year)
    df = erf.get_of_values(table = "erf_indivi")
    df2 = erf.get_of_values(table = "eec_indivi")
    print '\n'
    print df.loc[df.ident==6030189, :].to_string()
    print df2.loc[df2.ident==6030189, :].to_string()

    print len(np.unique(input_df['idfoy'].values))
    print len(np.unique(input_df.loc[input_df['quifoy']==0,'idfoy'].values))

    liste = [601228002, 602671302, 602016402, 603069602, 601365902, 602679402, 602680905, 603074902, 600848302,
             602684902, 601508802, 601427302, 601774602, 600466102, 603448202, 603091202, 602437502, 603224003,
             603093102, 601261802, 601000002, 601789602, 601660602, 600350102, 601927802, 601797902, 601667902,
             601537502, 600227602, 602854502, 602071902, 600144702, 602205702, 600769302, 601096602, 602609202,
             601301302, 602220302, 602486102, 601376802, 601570902, 600654802, 601443202, 603412402, 603412902,
             601055502, 602893001, 601189902, 601850602, 600539902, 602507002, 601460902, 602511602, 601200902,
             601601802, 600946903, 600428502, 600953502, 601084802, 601350102, 600829602, 600174402]
    liste_men = np.unique(input_df.loc[input_df.idfoy.isin(liste), 'idmen'].values)
    print liste_men
    print df.loc[df.ident.isin(liste_men), ['noi', 'noindiv', 'ident', 'declar1', 'declar2', 'persfip', 'persfipd', 'quelfic']].head(30).to_string()
    print input_df.loc[input_df.idfoy.isin(liste), :].head(30).to_string()

#     print input_df.loc[input_df.idfoy==603018901,
#                        ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
#
#     print input_df.loc[input_df.idfam==603018902,
#                    ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
    return
#     df_foy['noindiv'] = df_foy['noi'] ; del df_foy['noi']
#     df_fam['noindiv'] = df_fam['noi'] ; del df_fam['noi']
#     df_men['noindiv'] = df_men['noi'] ; del df_men['noi']
#     print df_fam, df_foy, df_men

#     check_structure(store['survey_2006'])
#     control(input_df, verbose=True, verbose_columns=['noindiv'])
#     control(df_foy, verbose=True, verbose_columns=['noindiv'])

#     print input_df.duplicated('noindiv').sum(), len(input_df)
#     print df_foy.duplicated('noindiv').sum(), len(df_foy)
#     print df_fam.duplicated('noindiv').sum(), len(df_fam)
#     print df_men.duplicated('noindiv').sum(), len(df_men)
#     print df_ind.head(10).to_string()
    print '    FAM'
    print sorted(df_fam.columns)
    print '    FOY'
    print sorted(df_foy.columns)
    print '    MEN'
    print sorted(df_men.columns)
    print '    IND'
    print sorted(df_ind.columns)

#     print df_fam.columns
    print '    INPUT'
    print sorted(input_df.columns)
def create_totals(year=2006):

    print "Creating Totals"
    print "Etape 1 : Chargement des données"

    data = DataCollection(year=year)
    indivim = load_temp(name="indivim", year=year)

    assert indivim.duplicated(['noindiv'
                               ]).any() == False, "Présence de doublons"

    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.

    selection = Series()
    for var in [
            "zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici",
            "zrnci"
    ]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)

    indivi_i = indivim[selection]
    indivi_i.rename(
        columns={
            "ident": "idmen",
            "persfip": "quifoy",
            "zsali":
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            "zchoi": "choi2",
            "zrsti": "rsti2",
            "zalri": "alr2"
        },
        inplace=True)

    indivi_i["quifoy"] = where(indivi_i["quifoy"].isnull(), "vous",
                               indivi_i["quifoy"])
    indivi_i["quelfic"] = "FIP_IMP"

    ## We merge them with the other individuals
    #indivim <- rename(indivim, c(ident = "idmen",
    #                             persfip = "quifoy",
    #                             zsali = "sali2", # Inclu les salaires non imposables des agents d'assurance
    #                             zchoi = "choi2",
    #                             zrsti = "rsti2",
    #                             zalri = "alr2"))
    #
    #indivi <- rbind(indivim[!(indivim$noindiv %in% indivi_i$noindiv),], indivi_i)
    #rm(indivim, indivi_i)
    #gc()
    #table(indivi$quelfic)
    #

    indivim.rename(
        columns=dict(
            ident="idmen",
            persfip="quifoy",
            zsali=
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi="choi2",
            zrsti="rsti2",
            zalri="alr2"),
        inplace=True)

    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace=True)
    indivi_i.set_index("noindiv", inplace=True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index(inplace=True)

    print ''
    print "Etape 2 : isolation des FIP"
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (
        indivi["idmen"].astype("int64") * 100 +
        (indivi["declar1"].str[0:2]).convert_objects(convert_numeric=True))

    indivi.loc[fip_imp, "idfoy"] = nan

    ## Certains FIP (ou du moins avec revenus imputés) ont un num?ro de déclaration d'impôt ( pourquoi ?)

    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

    #    indivi.ix[fip_has_declar, "idfoy"] = ( indivi.ix[fip_has_declar, "idmen"]*100
    #                                        + (indivi.ix[fip_has_declar, "declar1"].str[0:1]).convert_objects(convert_numeric=True) )
    indivi["idfoy"] = where(
        fip_has_declar, indivi["idmen"] * 100 +
        indivi["declar1"].str[0:2].convert_objects(convert_numeric=True),
        indivi["idfoy"])

    del fip_has_declar

    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50,
                            indivi["idfoy"])

    indivi_fnd = indivi.loc[fip_no_declar, ["idfoy", "noindiv"]]

    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(indivi_fnd.duplicated(cols=["idfoy"]),
                                    indivi_fnd["idfoy"] + 1,
                                    indivi_fnd["idfoy"])

    assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(
        indivi_fnd["idfoy"]), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    indivi.loc[fip_no_declar, ["idfoy"]] = indivi_fnd
    del indivi_fnd, fip_no_declar

    print ''
    print 'Etape 3 : Récupération des EE_NRT'

    nrt = indivi.quelfic == "EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt, "quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi['lpr'].isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    assert indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().all()

    print ''
    print 'Etape 4 : Rattachement des enfants aux déclarations'

    assert indivi["noindiv"].duplicated().any(
    ) == False, "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.loc[enf_ee, "noindiv"].notnull().all(
    ), " Some noindiv are not set, which will ruin next stage"
    assert indivi.loc[
        enf_ee,
        "noindiv"].duplicated().any() == False, "Some noindiv appear twice"

    pere = DataFrame({
        "noindiv_enf":
        indivi.noindiv.loc[enf_ee],
        "noindiv":
        100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]
    })
    mere = DataFrame({
        "noindiv_enf":
        indivi.noindiv.loc[enf_ee],
        "noindiv":
        100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]
    })

    foyer = data.get_values(variables=["noindiv", "zimpof"], table="foyer")
    pere = pere.merge(foyer, how="inner", on="noindiv")
    mere = mere.merge(foyer, how="inner", on="noindiv")

    #     print "Some pere et mere are duplicated because people have two foyers"
    #     print pere[pere.duplicated()]
    #     print mere[mere.duplicated()]

    df = pere.merge(mere, how="outer", on="noindiv_enf", suffixes=('_p', '_m'))

    #     print len(pere)
    #     print len(mere)
    #     print len(df)
    #     ll = df.loc[df["noindiv_enf"].duplicated(), "noindiv_enf"]
    #     print df.loc[df["noindiv_enf"].isin(ll)]
    #     print df[df.duplicated()]

    print '    4.1 : gestion des personnes dans 2 foyers'
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        df[col] = df[col].fillna(
            0, inplace=True)  # beacause groupby drop groups with NA in index
    df = df.groupby(by=["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace=True)

    df["which"] = ""
    df["which"] = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()),
                        "mere", "")
    df["which"] = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()),
                        "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df["which"] = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere")
    df["which"] = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")

    assert df["which"].notnull().all(
    ), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns={"noindiv_enf": "noindiv"}, inplace=True)
    df["idfoy"] = where(df.which == "pere", df.noindiv_p, df.noindiv_m)
    df["idfoy"] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    for col in df.columns:
        if col not in ["idfoy", "noindiv"]:
            del df[col]


#     assert indivi.loc[enf_ee,"idfoy"].notnull().all()
    assert df.duplicated().any() == False

    df.set_index("noindiv", inplace=True, verify_integrity=True)
    indivi.set_index("noindiv", inplace=True, verify_integrity=True)

    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (indivi["idfoy"].notnull().sum() +
                                        indivi["idfoy"].isnull().sum())

    indivi.reset_index(inplace=True)
    assert indivi.duplicated().any() == False

    # MBJ: issue delt with when moving from R code to python
    ## TODO il faut rajouterles enfants_fip et créer un ménage pour les majeurs
    ## On suit guide méthodo erf 2003 page 135
    ## On supprime les conjoints FIP et les FIP de 25 ans et plus;
    ## On conserve les enfants FIP de 19 à 24 ans;
    ## On supprime les FIP de 18 ans et moins, exceptés les FIP nés en 2002 dans un
    ## ménage en 6ème interrogation car ce sont des enfants nés aprés la date d'enquète
    ## EEC que l'on ne retrouvera pas dans les EEC suivantes.
    #
    print '    4.2 : On enlève les individus pour lesquels il manque le déclarant'
    fip = load_temp(name="fipDat", year=year)
    fip["declar"] = nan
    fip["agepf"] = nan

    fip.drop(["actrec", "year", "noidec"], axis=1, inplace=True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(
        columns=dict(
            ident="idmen",
            persfip="quifoy",
            zsali=
            "sali2",  # Inclu les salaires non imposables des agents d'assurance
            zchoi="choi2",
            zrsti="rsti2",
            zalri="alr2"),
        inplace=True)

    is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25)

    ## TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
    ## The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
    ##fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
    ##fip[is_fip_19_25 ,"lpr"]  <- 1
    #
    #indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['agem'] = 12 * indivi.age + 12 - indivi.naim

    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = nan
    indivi['not_pr_cpr'][indivi['lpr'] <= 2] = False
    indivi['not_pr_cpr'][indivi['lpr'] > 2] = True

    print "    4.3 : Creating non pr=0 and cpr=1 idmen's"
    indivi.reset_index(inplace=True)
    test1 = indivi.ix[indivi['not_pr_cpr'] == True, ['quimen', 'idmen']]
    test1['quimen'] = 2

    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1

    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

    #     indivi.set_index(['quiment']) #TODO: check relevance
    #     TODO problème avec certains idfoy qui n'ont pas de vous
    print ''
    print "Etape 5 : Gestion des idfoy qui n'ont pas de vous"
    all = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi['quifoy'] == 'vous', 'idfoy']
    without = all[~(all.idfoy.isin(with_.values))]

    print 'On cherche si le déclarant donné par la deuxième déclaration est bien un vous'
    has_declar2 = (indivi.idfoy.isin(
        without.idfoy.values)) & (indivi.declar2.notnull())
    decl2_idfoy = (indivi.loc[has_declar2, 'idmen'].astype('int') * 100 +
                   indivi.loc[has_declar2, "declar2"].str[0:2].astype('int'))
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values),
                                             decl2_idfoy, None)
    del all, with_, without, has_declar2

    print '    5.1 : Elimination idfoy restant'
    idfoyList = indivi.loc[indivi['quifoy'] == "vous",
                           'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    myvars = [
        "noindiv", "noi", "idmen", "idfoy", "quifoy", "wprm", "age", "agem",
        "quelfic", "actrec", "quimen", "nbsala", "titc", "statut", "txtppb",
        "chpub", "prosa", "encadr"
    ]

    if not (len(set(myvars).difference(set(indivi.columns))) == 0):
        print set(myvars).difference(set(indivi.columns))

    assert len(set(myvars).difference(set(indivi.columns))) == 0

    indivi = indivi.loc[:, myvars]

    ## TODO les actrec des fip ne sont pas codées (on le fera à la fin quand on aura rassemblé
    ## les infos provenant des déclarations)

    print ''
    print 'Etape 6 : Création des variables descriptives'
    print '    6.1 : variable activité'
    indivi['activite'] = None
    indivi['activite'][indivi['actrec'] <= 3] = 0
    indivi['activite'][indivi['actrec'] == 4] = 1
    indivi['activite'][indivi['actrec'] == 5] = 2
    indivi['activite'][indivi['actrec'] == 7] = 3
    indivi['activite'][indivi['actrec'] == 8] = 4
    indivi['activite'][indivi['age'] <= 13] = 2  # ce sont en fait les actrec=9
    print indivi['activite'].value_counts()
    # TODO: MBJ problem avec les actrec

    indivi['titc'][indivi['titc'].isnull()] = 0
    assert indivi['titc'].notnull().all(), Exception("Problème avec les titc")

    print '    6.2 : variable statut'
    indivi['statut'][indivi['statut'].isnull()] = 0
    indivi['statut'] = indivi['statut'].astype('int')
    indivi['statut'][indivi['statut'] == 11] = 1
    indivi['statut'][indivi['statut'] == 12] = 2
    indivi['statut'][indivi['statut'] == 13] = 3
    indivi['statut'][indivi['statut'] == 21] = 4
    indivi['statut'][indivi['statut'] == 22] = 5
    indivi['statut'][indivi['statut'] == 33] = 6
    indivi['statut'][indivi['statut'] == 34] = 7
    indivi['statut'][indivi['statut'] == 35] = 8
    indivi['statut'][indivi['statut'] == 43] = 9
    indivi['statut'][indivi['statut'] == 44] = 10
    indivi['statut'][indivi['statut'] == 45] = 11
    assert indivi['statut'].isin(
        range(12)).all(), Exception("statut value over range")

    #indivi$nbsala <- as.numeric(indivi$nbsala)
    #indivi <- within(indivi,{
    #  nbsala[is.na(nbsala) ]    <- 0
    #  nbsala[nbsala==99 ] <- 10  # TODO  418 fip à retracer qui sont NA
    #})

    print '    6.3 : variable txtppb'
    indivi['txtppb'] = indivi['txtppb'].fillna(0)
    assert indivi['txtppb'].notnull().all()

    indivi['nbsala'] = indivi['nbsala'].fillna(0)
    indivi['nbsala'] = indivi['nbsala'].astype('int')
    indivi['nbsala'][indivi['nbsala'] == 99] = 10
    assert indivi['nbsala'].isin(range(11)).all()

    print '    6.4 : variable chpub et CSP'
    indivi['chpub'].fillna(0, inplace=True)
    indivi['chpub'] = indivi['chpub'].astype('int')
    indivi['chpub'][indivi['chpub'].isnull()] = 0
    print indivi['chpub'].value_counts()
    assert indivi['chpub'].isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'][indivi['prosa'].isnull()] = 0
    assert indivi['prosa'].notnull().all()
    print indivi['encadr'].value_counts()

    # encadr : 1=oui, 2=non
    indivi['encadr'].fillna(2, inplace=True)
    assert indivi['encadr'].notnull().all()
    indivi['cadre'][indivi['prosa'].isin([7, 8])] = 1
    indivi['cadre'][(indivi['prosa'] == 9) & (indivi['encadr'] == 1)] = 1
    print "cadre"
    print indivi['cadre'].value_counts()
    assert indivi['cadre'].isin(range(2)).all()

    print ''
    print "Etape 7 : on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence"

    print 'nb de doublons idfam/quifam', len(
        indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])

    print 'On crée les n° de personnes à charge'
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    indivi['quifoy2'] = 2
    indivi['quifoy2'][indivi['quifoy'] == 'vous'] = 0
    indivi['quifoy2'][indivi['quifoy'] == 'conj'] = 1
    indivi['quifoy2'][indivi['quifoy'] == 'pac'] = 2

    del indivi['quifoy']
    indivi['quifoy'] = indivi['quifoy2']
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi.loc[indivi['quifoy'] == 2, ['quifoy', 'idfoy', 'noindiv']]
    print_id(test2)

    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on=['noindiv', 'idfoy'], how="left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'],
                             indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    print 'nb de doublons idfam/quifam', len(
        indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])
    print_id(indivi)

    #####################################################################################
    ## On ajoute les idfam et quifam
    #load(famc)
    #
    #tot2 <- merge(indivi, famille, by = c('noindiv'), all.x = TRUE)
    #rm(famille)
    #print_id(tot2)
    #
    ### Les idfam des enfants FIP qui ne font plus partie des familles forment des famille seuls
    #tot2[is.na(tot2$quifam), "idfam"] <- tot2[is.na(tot2$quifam), "noindiv"]
    #tot2[is.na(tot2$quifam), "quifam"] <- 0
    #print_id(tot2)
    #saveTmp(tot2, file = "tot2.Rdata")
    #rm(indivi,tot2)
    #
    ## on merge les variables de revenus (foyer_aggr) avec les identifiants précédents
    ## load foyer
    #loadTmp(file = "tot2.Rdata")
    #loadTmp(file= "foyer_aggr.Rdata")
    #
    #tot3 <- merge(tot2, foyer, all.x = TRUE)
    #print_id(tot3) # OK
    #saveTmp(tot3, file= "tot3.Rdata")
    #rm(tot3,tot2,foyer)
    #
    print ''
    print 'Etape 8 : création des fichiers totaux'
    famille = load_temp(name='famc', year=year)

    print '    8.1 : création de tot2 & tot3'
    tot2 = indivi.merge(famille, on='noindiv', how='inner')
    #     del famille # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille

    control(tot2, debug=True, verbose=True)
    assert tot2['quifam'].notnull().all()

    save_temp(tot2, name='tot2', year=year)
    del indivi
    print '    tot2 saved'

    #     #On combine les variables de revenu
    #     foyer = load_temp(name='foy_ind', year=year)
    #     print " INTERSERCT THE POOCHAY"
    #     tot2["idfoy"] = tot2["idfoy"][tot2["idfoy"].notnull()] +1
    #     print "pingas"
    #     print sorted(tot2.loc[tot2.idfoy.notnull(),"idfoy"].astype('int').unique())[0:10]
    #     print "pocchay"
    #     print sorted(foyer["idfoy"].unique())[0:10]
    #     print "final flash"
    #     print 602062550.0 in foyer["idfoy"].values
    #     print len(list(set(tot2["idfoy"].unique()) & set(foyer["idfoy"].unique())))
    #     print tot2.quifoy.value_counts()
    #tot2.update(foyer)
    tot2.merge(foyer, how='left')

    tot2 = tot2[tot2.idmen.notnull()]
    #     tot2['idfoy'] += 1

    print_id(tot2)

    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(cols='noindiv')
    print len(tot3)

    #Block to remove any unwanted duplicated pair
    print "    check tot3"
    control(tot3, debug=True, verbose=True)
    tot3 = tot3.drop_duplicates(cols=['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(cols=['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(cols=['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(cols='noindiv')
    control(tot3)

    ## On ajoute les variables individualisables
    #loadTmp("foyer_individualise.Rdata") # foy_ind
    #loadTmp("tot3.Rdata")
    #loadTmp("allvars.Rdata")
    #loadTmp("sif.Rdata")
    #
    #vars2 <- setdiff(names(tot3),  allvars)
    #tot3 <- tot3[,vars2]
    #
    #print_id(tot3)
    #final <- merge(tot3, foy_ind, by = c('idfoy', 'quifoy'), all.x = TRUE)
    #
    print '    8.2 : On ajoute les variables individualisables'

    allvars = load_temp(name='ind_vars_to_remove', year=year)
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    print len(tot3)

    assert not (tot3.duplicated(
        cols=['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "%i pairs of idfoy/quifoy in tot3 are duplicated" % (
        lg_dup)

    save_temp(tot3, name='tot3', year=year)
    control(tot3)

    del tot2, allvars, tot3, vars2
    print 'tot3 sauvegardé'
    gc.collect()
Example #15
0
def sif(year=2006):
    data = DataCollection(year=year)
    print u"05_foyer: extraction des données foyer"
    ## TODO Comment choisir le rfr n -2 pour éxonération de TH ?
    ## mnrvka  Revenu TH n-2
    ## mnrvkh  revenu TH (revenu fiscal de référence)
    #
    ## On récupère les variables du code sif
    #vars <- c("noindiv", 'sif', "nbptr", "mnrvka")
    vars = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg"]
    #sif <- LoadIn(erfFoyFil, vars)
    sif = data.get_values(variables=vars, table="foyer" )
    #sif$statmarit <- 0
    sif['statmarit'] = 0

    print sif

    ## for (index in 60:80){
    ##    print(index)
    ##    print(table(substr(sif$sif,index,index)))
    ## }


    #
    #
    ## Pb with some 2 sifs that are misaligned
    ## index = 61
    ## sif[substr(sif$sif,index,index)=="F", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="F", "noindiv"]+1]
    #
    ## index = 62
    ## sif[substr(sif$sif,index,index)=="G", "noindiv"]
    ## sif$sif[sif$noindiv == sif[substr(sif$sif,index,index)=="G", "noindiv"]]
    ## sif$sif[1]
    #
    #if (year==2009){
    #  # problem with one entry in 2009
    #  length  <- nchar(sif$sif[1])
    #  old_sif <- sif$sif[sif$noindiv == 901803201]
    #  new_sif <- paste(substr(old_sif,1,59), substr(old_sif,61,length),"0", sep="")
    #  sif$sif[sif$noindiv == 901803201] <- new_sif
    #  old_sif <- sif$sif[sif$noindiv == 900872201]
    #  new_sif <- paste(substr(old_sif,1,58), " ", substr(old_sif,59,length), sep="")
    #  sif$sif[sif$noindiv == 900872201] <- new_sif
    #  rm(old_sif,new_sif)
    #}
    if year == 2009:
        old_sif = sif['sif'][sif['noindiv'] == 901803201]
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif['sif'][sif['noindiv'] == 901803201] = new_sif
        old_sif = sif['sif'][sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif['sif'][sif['noindiv'] == 900872201] = new_sif
        del old_sif, new_sif
    #
    #
    ## for (index in 60:80){
    ##     print(index)
    ##     print(table(substr(sif$sif,index,index)))
    ## }
    #
    #
    #
    #sif <- within(sif,{
    ##  rbg = rbg*((tsrvbg =='+')-(tsrvbg =='-'))
    print sif["rbg"].describe()
    sif["rbg"] = sif["rbg"]*( (sif["tsrvbg"]=='+')-(sif["tsrvbg"] =='-'))
    print sif["rbg"].describe()

    #  stamar <- substr(sif,5,5)

    sif["stamar"] = sif["sif"].str[4:5]

    # Converting marital status

    #  statmarit[stamar =="M"] <- 1
    #  statmarit[stamar =="C"] <- 2
    #  statmarit[stamar =="D"] <- 3
    #  statmarit[stamar =="V"] <- 4
    #  statmarit[stamar =="O"] <- 5

    statmarit_dict = {"M": 1, "C" : 2, "D" : 3, "V" : 4, "O": 5}
    for key , val in statmarit_dict.iteritems():
        sif["statmarit"][ sif.stamar == key] = val

    #  birthvous <- as.numeric(substr(sif,6,9))
    #  birthconj <- as.numeric(substr(sif,11,14))
    #


    sif["birthvous"] = sif["sif"].str[5:9]
    sif["birthconj"] = sif["sif"].str[10:14]

    #  caseE <- as.numeric(substr(sif,16,16)=='E')
    #  caseF <- as.numeric(substr(sif,17,17)=='F')
    #  caseG <- as.numeric(substr(sif,18,18)=='G')
    #  caseK <- as.numeric(substr(sif,19,19)=='K')

    sif["caseE"] = sif["sif"].str[15:16] == "E"
    sif["caseF"] = sif["sif"].str[16:17] == "F"
    sif["caseG"] = sif["sif"].str[17:18] == "G"
    sif["caseK"] = sif["sif"].str[18:19] == "K"


    d = 0
    #
    #  if (year %in% c(2006,2007)){
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    caseH <- as.numeric(substr(sif,25,28))
    #    caseT <- as.numeric(substr(sif,29,29) == 'T')
    #  }

    if year in [2006,2007]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        sif["caseH"] = sif["sif"].str[24:28]
        sif["caseT"] = sif["sif"].str[28:29] == "T"
    #
    #  if (year == 2008){
    #    d = - 1 # fin de la case L
    #    caseP <- as.numeric(substr(sif,21+d,21+d)=='P')
    #    caseS <- as.numeric(substr(sif,22+d,22+d)=='S')
    #    caseW <- as.numeric(substr(sif,23+d,23+d)=='W')
    #    caseN <- as.numeric(substr(sif,24+d,24+d)=='N')
    #    caseH <- as.numeric(substr(sif,25+d,28+d))
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }
    #
    if year in [2008]:
        d = - 1 # fin de la case L
        sif["caseP"] = sif["sif"].str[20+d:21+d] == "P"
        sif["caseS"] = sif["sif"].str[21+d:22+d] == "S"
        sif["caseW"] = sif["sif"].str[22+d:23+d] == "W"
        sif["caseN"] = sif["sif"].str[23+d:24+d] == "N"
        sif["caseH"] = sif["sif"].str[24+d:28+d]
        sif["caseT"] = sif["sif"].str[28+d:29+d] == "T"

    #  if (year == 2009){
    #    # retour de la case L par rapport à 2008 (donc on retrouve 2006)
    #    caseL <- as.numeric(substr(sif,20,20)=='L')
    #    caseP <- as.numeric(substr(sif,21,21)=='P')
    #    caseS <- as.numeric(substr(sif,22,22)=='S')
    #    caseW <- as.numeric(substr(sif,23,23)=='W')
    #    caseN <- as.numeric(substr(sif,24,24)=='N')
    #    # caseH en moins par rapport à 2008 (mais case en L en plus)
    #    # donc décalage par rapport à 2006
    #    d = -4
    #    caseT <- as.numeric(substr(sif,29+d,29+d)=='T')
    #  }

    if year in [2009]:
        sif["caseL"] = sif["sif"].str[19:20] == "L"
        sif["caseP"] = sif["sif"].str[20:21] == "P"
        sif["caseS"] = sif["sif"].str[21:22] == "S"
        sif["caseW"] = sif["sif"].str[22:23] == "W"
        sif["caseN"] = sif["sif"].str[23:24] == "N"
        # caseH en moins par rapport à 2008 (mais case en L en plus)
        # donc décalage par rapport à 2006
        d = -4
        sif["caseT"] = sif["sif"].str[28+d:29+d] == "T"


    #
    #  caseX <- as.numeric(substr(sif,34+d,34+d)=='X')
    #  dateX <- as.Date(substr(sif,35+d,42+d),'%d%m%Y')
    #  caseY <- as.numeric(substr(sif,43+d,43+d)== 'Y')
    #  dateY <- as.Date(substr(sif,44+d,51+d),'%d%m%Y')
    #  caseZ <- as.numeric(substr(sif,52+d,53+d)== 'Z')
    #  dateZ <- as.Date(substr(sif,53+d,60+d),'%d%m%Y')  # ERROR 54+d
    #  causeXYZ <- substr(sif,61+d,61+d)
    #

    sif["caseX"] = sif["sif"].str[33+d:34+d] == "X"
    sif["dateX"] = sif["sif"].str[34+d:42+d]
    sif["caseY"] = sif["sif"].str[42+d:43+d] == "Y"
    sif["dateY"] = sif["sif"].str[43+d:51+d]
    sif["caseZ"] = sif["sif"].str[51+d:53+d] == "Z"
    sif["dateZ"] = sif["sif"].str[52+d:60+d]
    sif["causeXYZ"] = sif["sif"].str[60+d:61+d]

    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()


    #  nbptr <- nbptr/100
    #  rfr_n_2 <- mnrvka

    sif["nbptr"] =  sif["nbptr"]/100
    sif["rfr_n_2"] = sif["mnrvka"]

    #  nbF <- as.numeric(substr(sif,65+d,66+d))
    #  nbG <- as.numeric(substr(sif,68+d,69+d))
    #  nbR <- as.numeric(substr(sif,71+d,72+d))
    #  nbJ <- as.numeric(substr(sif,74+d,75+d))
    #  nbN <- as.numeric(substr(sif,77+d,78+d))
    #  nbH <- as.numeric(substr(sif,80+d,81+d))
    #  nbI <- as.numeric(substr(sif,83+d,84+d))

    sif["nbF"] = sif["sif"].str[64+d:66+d]
    sif["nbG"] = sif["sif"].str[67+d:69+d]
    sif["nbR"] = sif["sif"].str[70+d:72+d]
    sif["nbJ"] = sif["sif"].str[73+d:75+d]
    sif["nbN"] = sif["sif"].str[76+d:78+d]
    sif["nbH"] = sif["sif"].str[79+d:81+d]
    sif["nbI"] = sif["sif"].str[82+d:84+d]

    #  if (year != 2009){
    #  nbP <- as.numeric(substr(sif,86+d,87+d))
    #  }
    #})

    if (year != 2009):
        sif["nbP"] = sif["sif"].str[85+d:87+d]

    #sif$sif <- NULL
    #sif$stamar <- NULL
    #

    del sif["sif"], sif["stamar"]


    #table(sif$statmarit)
    print sif["statmarit"].value_counts()

    #print(length(table(sif$noindiv)))
    print "Number of individuals :", len(sif["noindiv"])
    print "Number of distinct individuals :", len(sif["noindiv"].value_counts())

    #dup <- duplicated(sif$noindiv)
    #table(dup)
    #sif <- sif[!dup,]
    #print(length(table(sif$noindiv)))

    sif_drop_duplicated = sif.drop_duplicates("noindiv")
    assert len(sif["noindiv"].value_counts()) == len(sif_drop_duplicated["noindiv"]), Exception("Number of distinct individuals after removing duplicates is not correct")

    print 'Saving sif'
    save_temp(sif, name='sif', year=year)
    del sif
    gc.collect()
def create_fip(year=2006):  # message('03_fip')
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """

    df = DataCollection(year=year)

    print 'Démarrer 03_fip'
    # # anaisenf: année de naissance des PAC
    # erfFoyVar <- c('anaisenf','declar')
    # foyer <- LoadIn(erfFoyFil)
    # foyer <- LoadIn(erfFoyFil,erfFoyVar)

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = df.get_values(table="foyer", variables=erfFoyVar)
    print_id(foyer)
    #    control(foyer, verbose=True, verbose_length=10, debug=True)

    # #***********************************************************************************************************
    # # print "Step 1 : on recupere les personnes à charge des foyers"
    # #**********************************************************************************************************
    # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac
    #
    #
    # # On récupère toutes les pac des foyers
    # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal
    # fip <-data.frame(declar = foyer$declar)
    # for (i in c(1:L)){
    #   eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = '')))
    #   eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = '')))
    # }
    # fip <- fip[!is.na(fip$typ.1),]
    # fip <- reshape(fip,direction ='long', varying=2:17, sep=".")
    # fip <- fip[!is.na(fip$naia),]
    # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')]
    # fip$N <- row(fip)[,1]
    # str(fip$N)

    print "Etape 1 : on recupere les personnes à charge des foyers"
    print "    1.1 : Création des codes des enfants"
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    print "il ya a au maximum %s pac par foyer" % nb_pac_max

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(multi_index_columns,
                                     names=['pac_number', 'variable'])
    fip = DataFrame(randn(len(foyer), 3 * nb_pac_max), columns=columns)
    fip.fillna(
        NaN, inplace=True)  # inutile a cause de la ligne précédente, to remove
    for i in range(1, nb_pac_max + 1):
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)]
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * (i)]

    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    del fip["level_0"]

    #     print fip.describe()
    #     print fip.head().to_string()
    print "    1.2 : elimination des foyers fiscaux sans pac"
    #Clearing missing values and changing data format
    fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') &
              (fip['naia'] != '')]
    fip = fip.sort(columns=['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace=True)
    fip = fip.reset_index()

    del fip['pac_number']
    #    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    print "    1.3 : on enlève les individus F pour lesquels il existe un individu G"
    tyFG = fip[fip.type_pac.isin(['F', 'G'
                                  ])]  #Filtre pour ne travailler que sur F & G

    tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'],
                                        take_last=True)
    tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin']))
    #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux
    #puis on retire les autres (à la fois F et G)
    print len(tyFG), '/', len(tyFG[tyFG['to_keep']])
    print 'longueur fip', len(fip)

    fip['to_keep'] = NaN
    fip.update(tyFG)
    print 'enfants F & G traités'

    print "    1.4 : on enlève les H pour lesquels il y a un I"
    tyHI = fip[fip.type_pac.isin(['H', 'I'])]
    tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'],
                                        take_last=True)
    tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac'])
    tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin'])

    fip.update(tyHI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    print 'nb lines to keep/nb initial lines'
    print len(fip[fip['to_keep']]), '/', len(fip)

    indivifip = fip[fip['to_keep']]
    del indivifip['to_keep'], fip, tyFG, tyHI

    #    control(indivifip, debug=True)

    # #************************************************************************************************************/
    print ''
    print 'Step 2 : matching indivifip with eec file'
    # #************************************************************************************************************/

    indivi = load_temp(name="indivim",
                       year=year)  #TODO: USE THIS INSTEAD OF PREVIOUS LINES

    # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',]
    # pac$key1 <- paste(pac$naia,pac$declar1)
    # pac$key2 <- paste(pac$naia,pac$declar2)
    # indivifip$key <- paste(indivifip$naia,indivifip$declar)

    #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull()
    import pdb
    pdb.set_trace()
    pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip'] == 'pac')]

    pac['naia'] = pac['naia'].astype(
        'int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip['naia'].astype('int32')
    pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29])
    pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip['naia'],
                           indivifip['declaration'].str[:29])
    assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' % (
        pac.naia.dtype, indivifip.naia.dtype)

    # fip <- indivifip[!indivifip$key %in% pac$key1,]
    # fip <- fip[!fip$key %in% pac$key2,]
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))]
    fip = fip[~(fip.key.isin(pac.key2.values))]

    print "    2.1 new fip created"
    # We build a dataframe to link the pac to their type and noindiv
    # table(duplicated(pac[,c("noindiv")]))
    countInd = pac.noindiv.value_counts()

    # pacInd1 <- merge(pac[,c("noindiv","key1","naia")],
    #                 indivifip[,c("key","typ")], by.x="key1", by.y="key")
    # pacInd2 <- merge(pac[,c("noindiv","key2","naia")],
    #                 indivifip[,c("key","typ")], by.x="key2", by.y="key")

    tmp_pac1 = pac[['noindiv', 'key1']]
    tmp_pac2 = pac[['noindiv', 'key2']]
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']]

    pac_ind1 = tmp_pac1.merge(tmp_indivifip,
                              left_on='key1',
                              right_on='key',
                              how='inner')
    print 'longueur pacInd1', len(pac_ind1)
    pac_ind2 = tmp_pac2.merge(tmp_indivifip,
                              left_on='key2',
                              right_on='key',
                              how='inner')
    print 'longueur pacInd2', len(pac_ind2)
    print "pacInd1&2 créés"

    # table(duplicated(pacInd1))
    # table(duplicated(pacInd2))

    print pac_ind1.duplicated().sum()
    print pac_ind2.duplicated().sum()

    # pacInd1 <-rename(pacInd1,c("key1" = "key"))
    # pacInd2 <-rename(pacInd2,c("key2" = "key"))
    # pacInd <- rbind(pacInd1,pacInd2)
    # rm(pacInd1,pacInd2)

    #     pacInd1.rename(columns={'key1':'key'}, inplace=True)
    #     pacInd2.rename(columns={'key2':'key'}, inplace=True)
    del pac_ind1['key1'], pac_ind2['key2']
    print pac_ind1.columns
    print pac_ind2.columns

    if pac_ind1.index == []:
        if pac_ind2.index == []:
            print "Warning : no link between pac and noindiv for both pacInd1&2"
        else:
            print "Warning : pacInd1 is an empty data frame"
            pacInd = pac_ind2
    elif pac_ind2.index == []:
        print "Warning : pacInd2 is an empty data frame"
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    print len(pac_ind1), len(pac_ind2), len(pacInd)
    print pac_ind2.type_pac.isnull().sum()
    print pacInd.type_pac.value_counts()

    print '    2.2 : pacInd created'

    # table(duplicated(pacInd[,c("noindiv","typ")]))
    # table(duplicated(pacInd$noindiv))

    print 'doublons noindiv, type_pac', pacInd.duplicated(
        ['noindiv', 'type_pac']).sum()
    print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum()
    print 'nb de NaN', pacInd.type_pac.isnull().sum()

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))]
    #     pacIndiv.reset_index(inplace=True)
    print pacIndiv.columns

    save_temp(pacIndiv, name="pacIndiv", year=year)

    print pacIndiv.type_pac.value_counts()
    gc.collect()

    # # We keep the fip in the menage of their parents because it is used in to
    # # build the famille. We should build an individual ident for the fip that are
    # # older than 18 since they are not in their parents' menage according to the eec

    # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
    # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
    # individec1 <- upData(individec1,rename=c(declar1="declar"))
    # fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    indivi['noidec'] = indivi['declar1'].str[0:2].astype(
        'float16')  # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values))
                        & (indivi['persfip'] == "vous")]
    individec1 = individec1.loc[:, [
        "declar1", "noidec", "ident", "rga", "ztsai", "ztsao"
    ]]
    individec1 = individec1.rename(columns={'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    print '    2.3 : fip1 created'

    # # TODO: On ne s'occupe pas des declar2 pour l'instant
    # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
    # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
    # # individec2 <- upData(individec2,rename=c(declar2="declar"))
    # # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values))
                        & (indivi['persfip'] == "vous")]
    individec2 = individec2.loc[:, [
        "declar2", "noidec", "ident", "rga", "ztsai", "ztsao"
    ]]
    individec2.rename(columns={'declar2': 'declaration'}, inplace=True)
    print individec2.head()
    fip2 = fip.merge(individec2)
    print '    2.4 : fip2 created'

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    # #fip <- rbind(fip1,fip2)
    # fip <- fip1
    # table(fip$typ)

    fip = concat([fip1, fip2])
    #     fip = fip1 #TODO: Pourquoi cette ligne ?
    fip.type_pac.value_counts()

    print fip.columns
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype(
        'float')  # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  #TODO declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip['naia'].astype('float')
    fip['lpr'] = where(fip['agepf'] <= 20, 3,
                       4)  # TODO pas très propre d'après Mahdi/Clément
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = where(fip['agepf'] <= 15, 9, 5)

    ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
    ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    # while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
    #   dup <- duplicated( fip[, c("noi","ident")])
    #   tmp <- fip[dup,"noi"]
    #   fip[dup, "noi"] <- (tmp-1)
    # }
    #TODO: Le vecteur dup est-il correct
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        print len(tmp)
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    print fip.duplicated('noindiv').value_counts()
    save_temp(fip, name="fipDat", year=year)
    del fip, fip1, individec1, indivifip, indivi, pac
    print 'fip sauvegardé'
def create_indivim(year=2006):
    '''
    '''
    # load
    data = DataCollection(year=year)
    erfmen = data.get_values(table="erf_menage")
    eecmen = data.get_values(table="eec_menage")
    print sorted(eecmen.columns)

    erfind = data.get_values(table="erf_indivi")
    eecind = data.get_values(table="eec_indivi")
    print eecind.columns
    print erfind.columns

    # travail sur la cohérence entre les bases
    noappar_m = eecmen[ ~(eecmen.ident.isin( erfmen.ident.values))]
    print 'describe noappar_m'
    print noappar_m.describe()

    noappar_i = eecmen[ ~(eecmen.ident.isin( erfmen.ident.values))]
    noappar_i = noappar_i.drop_duplicates(cols='ident', take_last = True)
    #TODO: vérifier qu'il n'y a théoriquement pas de doublon

    dif = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    int = set(noappar_i.ident) & set(noappar_m.ident)
    print "dif, int --------------------------------"
    print dif, int
    del noappar_i, noappar_m, dif, int
    gc.collect()

    #fusion enquete emploi et source fiscale
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how="inner")

    # optimisation des types? Controle de l'existence en passant
    #TODO: minimal dtype
    var_list = (['acteu', 'stc', 'contra', 'titc', 'forter', 'mrec', 'rstg', 'retrai', 'lien', 'noicon',
                 'noiper', 'noimer', 'naia', 'cohab', 'agepr', 'statut', 'txtppb', 'encadr', 'prosa'])
    for var in var_list:
        try:
            indivim[var] = indivim[var].astype("float32")
        except:
            print "%s is missing" %var

    # création de variables
    ## actrec
    indivim['actrec'] = 0
    #TODO: pas de 6 ?!!
    filter1 = (indivim['acteu'] == 1) & (indivim['stc'].isin([1,3]))
    indivim['actrec'][filter1] = 1
    filter2 = (indivim['acteu'] == 1) & (((indivim['stc'] == 2) & (indivim['contra'] == 1)) | (indivim['titc'] == 2))
    indivim['actrec'][filter2] = 2
    indivim['actrec'][indivim['acteu'] == 1] =  3
    filter4 = (indivim['acteu'] == 2) | ((indivim['acteu'] == 3) & (indivim['mrec'] == 1))
    indivim['actrec'][filter4] = 4
    filter5 = (indivim['acteu'] == 3) & ((indivim['forter'] == 2) | (indivim['rstg'] == 1))
    indivim['actrec'][filter5] = 5
    filter7 = (indivim['acteu'] == 3) & ((indivim['retrai'] == 1) | (indivim['retrai'] == 2))
    indivim['actrec'][filter7] = 7
    indivim['actrec'][indivim['acteu'] == 3] =  8
    indivim['actrec'][indivim['acteu'].isnull()] =  9
    print indivim['actrec'].value_counts()
    # tu99
    if year == 2009:
        #erfind['tu99'] = None
        #eecind['tu99'] = float(eecind['tu99'])
        erfind['tu99'] = NaN

    ## locataire
    menagem["locataire"] = menagem["so"].isin([3,4,5])
    menagem["locataire"] = menagem["locataire"].astype("int32")
    ## ?? c'est bizarre d'avoir besoin du diplome de la personne de référence,
    ## ce serait mieux de faire le merge quand on a besoin seulement
    ## laissons à la table individuel ce qui doit l'être

#    NOTE pas de ddipl en year=2006 visiblement
#    transfert = indivim.ix[indivim['lpr'] == 1, ['ident', 'ddipl']]
#    print transfert
#    #TODO: Forget not to uncomment 'dat
##     #menagem <- merge(erfmen,eecmen)
##     #menagem <- merge(menagem,transfert)
#    menagem  = menagem.merge(transfert)

    # correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        GL,HF
        '''

        if year==2006:
            indivim.lien[indivim.noindiv==603018905] = 2
            indivim.noimer[indivim.noindiv==603018905] = 1
            print indivim[indivim.noindiv==603018905].to_string()

    _manually_remove_errors()

    # save
    save_temp(menagem, name="menagem", year=year)
    del erfmen, eecmen, menagem #, transfert
    print 'menagem saved'
    gc.collect()
    save_temp(indivim, name="indivim", year=year)
    del erfind, eecind
    print 'indivim saved'
    gc.collect()
def foyer_all(year=2006):

    ## On ajoute les cases de la déclaration
    #foyer_all <- LoadIn(erfFoyFil)
    data = DataCollection(year=year)
    foyer_all = data.get_values(table="foyer")

    ## on ne garde que les cases de la déclaration ('fxzz')
    #vars <- names(foyer_all)
    #vars <- c("noindiv", vars[grep("^f[0-9]", vars)])
    #

    vars = foyer_all.columns
    regex = re.compile("^f[0-9]")
    vars = [x for x in vars if regex.match(x)]

    #foyer <- foyer_all[vars]
    #rm(foyer_all)
    #gc()
    #noindiv <- list(foyer$noindiv)
    #

    foyer = foyer_all[vars + ["noindiv"]]

    del foyer_all
    gc.collect()

    #
    ## On aggrège les déclarations dans le cas où un individu a fait plusieurs déclarations
    #foyer <- aggregate(foyer, by = noindiv, FUN = 'sum')
    #print foyer.describe()["f1aj"].to_string()
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    #
    #print foyer.describe()["f1aj"].to_string()
    #print foyer.describe()["noindiv"].to_string()
    #

    print_id(foyer)

    ## noindiv have been summed over original noindiv which are now in Group.1
    #foyer$noindiv <- NULL
    #foyer <- rename(foyer, c(Group.1 = 'noindiv'))
    ## problème avec les dummies ()
    #
    #saveTmp(foyer, file= "foyer_aggr.Rdata")
    #
    #
    #############################################################################
    ## On récupère les variables individualisables
    #loadTmp("foyer_aggr.Rdata")
    #
    #individualisable <- function(table, var, vars, qui){
    #  print(var)
    #  print(vars)
    #  temp <- table[c('noindiv', vars)]
    #  n = length(qui)
    #  names(temp)[2:(n+1)] <- qui
    #  temp$newvar <- NULL
    #  temp2 <- melt(temp, id = 'noindiv', variable_name = 'quifoy')
    #  temp2 <- transform(temp2, quifoy = as.character(quifoy))
    #  temp2 <- transform(temp2, noindiv = as.character(noindiv))
    #  str(temp2)
    #  rename(temp2, c(value = var))
    #}

    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        #               'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        #               'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er']
    }  # (déménagement) uniquement en 2006

    #
    #varlist = list(list('sali', c('f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej')),
    #                list('choi', c('f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep')),
    #               list('fra', c('f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek')),
    # ......
    #               list('mbnc_pvce', c('f5hr', 'f5ir', 'f5jr')),
    #               list('abnc_pvce', c('f5qd', 'f5rd', 'f5sd')),
    #               list('nbnc_pvce', c('f5qj', 'f5rj', 'f5sj')),
    #               list('demenage' , c('f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'))) # (déménagement) uniquement en 2006
    #
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))

    print "From %i variables, we keep %i eligibles variables" % (len(
        set().union(*vars_sets)), len(eligible_vars))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    err = 0
    err_vars = {}

    foy_ind = DataFrame()

    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]]
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            var_present = any(presence)
            if not var_present:
                print individual_var + " is not present"
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence)
                    if present is True
                ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]]

        # Reshape the dataframe
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"

        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index(
        )  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0]
        #        print len(selection)

        if len(foy_ind) == 0:
            foy_ind = selection
        else:

            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    print "foy_ind"
    print foy_ind.describe().to_string()

    #not_first <- FALSE
    #allvars = c()
    #for (v in varlist){
    #  vars = intersect(v[[2]],names(foyer)) # to deal with variabes that are not present
    #  if (length(vars) > 0) {
    #    allvars <-  c(allvars, vars)
    #    qui <- c('vous', 'conj', 'pac1', 'pac2', 'pac3')
    #    n <- length(vars)
    #    temp <- individualisable(foyer, v[[1]], vars, qui[1:n])
    #    if (not_first) {
    #      print('merge')
    #      foy_ind <- merge(temp, foy_ind, by = c('noindiv', 'quifoy'), all = TRUE)
    #      names(foy_ind)
    #    }
    #    else   {
    #      print('init')
    #      foy_ind <- temp
    #      not_first <- TRUE
    #    }
    #  }
    #}

    ind_vars_to_remove = Series(list(eligible_vars))
    save_temp(ind_vars_to_remove, name='ind_vars_to_remove', year=year)
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)

    print_id(foy_ind)
    foy_ind['quifoy'][foy_ind['quifoy'] == 'vous'] = 0
    foy_ind['quifoy'][foy_ind['quifoy'] == 'conj'] = 1
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac1'] = 2
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac2'] = 3
    foy_ind['quifoy'][foy_ind['quifoy'] == 'pac3'] = 4

    assert foy_ind['quifoy'].isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'

    print 'saving foy_ind'
    print_id(foy_ind)
    save_temp(foy_ind, name="foy_ind", year=year)
    show_temp()
    return
Example #19
0
    def preproc(self):

        erf_menage = self.erf_menage
        erf_eec_indivi = self.erf_eec_indivi
        simu_aggr_tables = self.simu_aggr_tables
        simu_nonaggr_tables = self.simu_nonaggr_tables

        def get_all_ancestors(varlist):
            if len(varlist) == 0:
                return []
            else:
                if varlist[0]._parents == set():
                    return ([varlist[0]] + get_all_ancestors(varlist[1:]))
                else:
                    return ([varlist[0]] +
                            get_all_ancestors(list(varlist[0]._parents)) +
                            get_all_ancestors(varlist[1:]))

        # We want to get all ancestors + children + the options that we're going to encounter
        parents = map(
            lambda x: self.simulation.output_table.column_by_name.get(x),
            [self.variable])
        parents = get_all_ancestors(parents)
        options = []
        for varcol in parents:
            options.extend(varcol._option.keys())
        options = list(set(options))
        #print options
        parents = map(lambda x: x.name, parents)
        for var in [self.variable]:
            children = set()
            varcol = self.simulation.output_table.column_by_name.get(var)
            children = children.union(
                set(map(lambda x: x.name, varcol._children)))
        variables = list(set(parents + list(children)))
        #print variables
        del parents, children
        gc.collect()

        def get_var(variable):
            variables = [variable]
            return self.simulation.aggregated_by_entity(entity="men",
                                                        variables=variables,
                                                        all_output_vars=False,
                                                        force_sum=True)[0]

        simu_aggr_tables = get_var(variables[0])
        for var in variables[1:]:
            simu_aggr_tables = simu_aggr_tables.merge(
                get_var(var)[['idmen', var]], on='idmen', how='outer')
        # We load the data from erf table in case we have to pick data there
        erf_data = DataCollection(year=self.simulation.datesim.year)
        os.system('cls')
        todo = set(variables + ["ident", "wprm"]).union(set(options))
        print 'Variables or equivalents to fetch :'
        print todo
        '''
        Méthode générale pour aller chercher les variables de l'erf/eec
        ( qui n'ont pas forcément le même nom
        et parfois sont les variables utilisées pour créér l'of ):
        1 - essayer le get_of2erf, ça doit marcher pour les variables principales ( au moins les aggrégats
        que l'on compare )
        Si les variables ne sont pas directement dans la table,
        elles ont été calculées à partir d'autres variables de données erf/eec
        donc chercher dans :
        2 - build_survey
        3 - model/model.py qui dira éventuellement dans quel module de model/ chercher
        Le 'print todo' vous indique quelles variables chercher
        ( attention à ne pas inclure les enfants directs )
        L'utilisation du Ctrl-H est profitable !
        '''

        fetch_eec = [
            'statut', 'titc', 'chpub', 'encadr', 'prosa', 'age', 'naim',
            'naia', 'noindiv'
        ]
        fetch_erf = [
            'zsali', 'af', 'ident', 'wprm', 'noi', 'noindiv', 'quelfic'
        ]
        erf_df = erf_data.get_of_values(variables=fetch_erf,
                                        table="erf_indivi")
        eec_df = erf_data.get_of_values(variables=fetch_eec,
                                        table="eec_indivi")
        erf_eec_indivi = erf_df.merge(eec_df, on='noindiv', how='inner')
        assert 'quelfic' in erf_eec_indivi.columns, "quelfic not in erf_indivi columns"
        del eec_df, erf_df

        # We then get the aggregate variables for the menage ( mainly to compare with of )
        print 'Loading data from erf_menage table'
        erf_menage = erf_data.get_of_values(variables=list(todo) + ['quelfic'],
                                            table="erf_menage")

        del todo
        gc.collect()
        assert 'ident' in erf_menage.columns, "ident not in erf_menage.columns"

        from openfisca_france.data.erf import get_erf2of
        erf2of = get_erf2of()
        erf_menage.rename(columns=erf2of, inplace=True)

        # We get the options from the simulation non aggregated tables:

        # First from the output_table
        # We recreate the noindiv in output_table
        self.simulation.output_table.table[
            'noindiv'] = 100 * self.simulation.output_table.table.idmen_ind + self.simulation.output_table.table.noi_ind
        self.simulation.output_table.table[
            'noindiv'] = self.simulation.output_table.table['noindiv'].astype(
                np.int64)
        s1 = [
            var for var in set(options).intersection(
                set(self.simulation.output_table.table.columns))
        ] + ['idmen_ind', 'quimen_ind', 'noindiv']
        simu_nonaggr_tables = (self.simulation.output_table.table)[s1]
        simu_nonaggr_tables.rename(columns={
            'idmen_ind': 'idmen',
            'quimen_ind': 'quimen'
        },
                                   inplace=True)
        assert 'noindiv' in simu_nonaggr_tables.columns

        # If not found, we dwelve into the input_table
        if (set(s1) -
                set(['idmen_ind', 'quimen_ind', 'noindiv'])) < set(options):
            assert 'noindiv' in self.simulation.input_table.table.columns, "'noindiv' not in simulation.input_table.table.columns"
            s2 = [
                var for var in (set(options).intersection(
                    set(self.simulation.input_table.table.columns)) - set(s1))
            ] + ['noindiv']
            #print s2
            temp = self.simulation.input_table.table[s2]
            simu_nonaggr_tables = simu_nonaggr_tables.merge(temp,
                                                            on='noindiv',
                                                            how='inner',
                                                            sort=False)

            del s2, temp
        del s1
        gc.collect()

        simu_nonaggr_tables = simu_nonaggr_tables[
            list(set(options)) + ['idmen', 'quimen', 'noindiv']]
        #print options, variables
        assert 'idmen' in simu_nonaggr_tables.columns, 'Idmen not in simu_nonaggr_tables columns'

        # Check the idmens that are not common
        erf_menage.rename(columns={'ident': 'idmen'}, inplace=True)

        print "\n"
        print 'Checking if idmen is here...'
        print '\n ERF : '
        print 'idmen' in erf_menage.columns
        print "\n Simulation output"
        print 'idmen' in simu_aggr_tables.columns
        print "\n"

        #print 'Dropping duplicates of idmen for both tables...'
        assert not erf_menage["idmen"].duplicated().any(
        ), "Duplicated idmen in erf_menage"
        #erf_menage.drop_duplicates('idmen', inplace = True)
        simu_aggr_tables.drop_duplicates('idmen', inplace=True)
        assert not simu_aggr_tables["idmen"].duplicated().any(
        ), "Duplicated idmen in of"

        print 'Checking mismatching idmen... '
        s1 = set(erf_menage['idmen']) - (set(simu_aggr_tables['idmen']))
        if s1:
            print "idmen that aren't in simu_aggr_tables : %s" % str(len(s1))
            pass
        s2 = (set(simu_aggr_tables['idmen'])) - set(erf_menage['idmen'])
        if s2:
            print "idmen that aren't in erf_menage : %s" % str(len(s2))
            pass
        del s1, s2

        # Restrict to common idmens and merge
        s3 = set(erf_menage['idmen']).intersection(
            set(simu_aggr_tables['idmen']))
        print "Restricting to %s common idmen... \n" % str(len(s3))
        erf_menage = erf_menage[erf_menage['idmen'].isin(s3)]
        simu_aggr_tables = simu_aggr_tables[simu_aggr_tables['idmen'].isin(s3)]
        del s3
        gc.collect()

        #print erf_menage.columns
        #print simu_aggr_tables.columns

        # Compare differences across of and erf dataframes
        print "Comparing differences between dataframes... \n"
        colcom = (set(erf_menage.columns).intersection(
            set(simu_aggr_tables.columns))) - set(['idmen', 'wprm'])
        print 'Common variables: '
        print colcom
        erf_menage.reset_index(inplace=True)
        simu_aggr_tables.reset_index(inplace=True)
        for col in colcom:
            temp = set(
                erf_menage['idmen'][erf_menage[col] != simu_aggr_tables[col]])
            print "Numbers of idmen that aren't equal on variable %s : %s \n" % (
                col, str(len(temp)))
            del temp

        self.erf_menage = erf_menage
        self.erf_eec_indivi = erf_eec_indivi
        self.simu_aggr_tables = simu_aggr_tables
        self.simu_nonaggr_tables = simu_nonaggr_tables
Example #20
0
def check_converted():
    #Retrieving the input and output files for analysis :
    store = HDFStore(survey_test)
    input_df = store['survey_2006']

    output = HDFStore(survey3_test)

    df_fam = output['survey_2006/fam']
    df_foy = output['survey_2006/foy']
    df_men = output['survey_2006/men']
    df_ind = output['survey_2006/ind']

    year = 2006
    erf = DataCollection(year=year)
    df = erf.get_of_values(table="erf_indivi")
    df2 = erf.get_of_values(table="eec_indivi")
    print '\n'
    print df.loc[df.ident == 6030189, :].to_string()
    print df2.loc[df2.ident == 6030189, :].to_string()

    print len(np.unique(input_df['idfoy'].values))
    print len(np.unique(input_df.loc[input_df['quifoy'] == 0, 'idfoy'].values))

    liste = [
        601228002, 602671302, 602016402, 603069602, 601365902, 602679402,
        602680905, 603074902, 600848302, 602684902, 601508802, 601427302,
        601774602, 600466102, 603448202, 603091202, 602437502, 603224003,
        603093102, 601261802, 601000002, 601789602, 601660602, 600350102,
        601927802, 601797902, 601667902, 601537502, 600227602, 602854502,
        602071902, 600144702, 602205702, 600769302, 601096602, 602609202,
        601301302, 602220302, 602486102, 601376802, 601570902, 600654802,
        601443202, 603412402, 603412902, 601055502, 602893001, 601189902,
        601850602, 600539902, 602507002, 601460902, 602511602, 601200902,
        601601802, 600946903, 600428502, 600953502, 601084802, 601350102,
        600829602, 600174402
    ]
    liste_men = np.unique(input_df.loc[input_df.idfoy.isin(liste),
                                       'idmen'].values)
    print liste_men
    print df.loc[df.ident.isin(liste_men), [
        'noi', 'noindiv', 'ident', 'declar1', 'declar2', 'persfip', 'persfipd',
        'quelfic'
    ]].head(30).to_string()
    print input_df.loc[input_df.idfoy.isin(liste), :].head(30).to_string()

    #     print input_df.loc[input_df.idfoy==603018901,
    #                        ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
    #
    #     print input_df.loc[input_df.idfam==603018902,
    #                    ['idfoy', 'quifoy', 'idfam', 'quifam', 'idmen', 'quimen', 'noi']].to_string()
    return
    #     df_foy['noindiv'] = df_foy['noi'] ; del df_foy['noi']
    #     df_fam['noindiv'] = df_fam['noi'] ; del df_fam['noi']
    #     df_men['noindiv'] = df_men['noi'] ; del df_men['noi']
    #     print df_fam, df_foy, df_men

    #     check_structure(store['survey_2006'])
    #     control(input_df, verbose=True, verbose_columns=['noindiv'])
    #     control(df_foy, verbose=True, verbose_columns=['noindiv'])

    #     print input_df.duplicated('noindiv').sum(), len(input_df)
    #     print df_foy.duplicated('noindiv').sum(), len(df_foy)
    #     print df_fam.duplicated('noindiv').sum(), len(df_fam)
    #     print df_men.duplicated('noindiv').sum(), len(df_men)
    #     print df_ind.head(10).to_string()
    print '    FAM'
    print sorted(df_fam.columns)
    print '    FOY'
    print sorted(df_foy.columns)
    print '    MEN'
    print sorted(df_men.columns)
    print '    IND'
    print sorted(df_ind.columns)

    #     print df_fam.columns
    print '    INPUT'
    print sorted(input_df.columns)