def create_fip(temporary_store = None, year = None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'.

    fip = "fichier d'imposition des personnes" (personal tax-return file).
    Some individuals are declared as 'personne à charge' (pac, dependent) on tax
    forms but are not present in the erf or eec tables.  We add them to ensure
    consistency between concepts.

    Reads the 'foyer' table of the ERFS survey and the 'indivim_{year}' table
    from temporary_store; writes 'pacIndiv_{year}' and 'fipDat_{year}' back to
    temporary_store.
    """
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth
    # (example: 'F1990H1992').  When a child is invalid, he appears twice in anaisenf
    # (example: F1990G1990 is a single invalid child born in 1990).
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac is encoded on 5 characters (1 letter + 4-digit birth year), so the
    # longest anaisenf string determines the maximum number of pac per foyer.
    # NOTE(review): '/' is integer division under Python 2 when the length is an
    # int multiple of 5; the integrality assert below guards the float case.
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.
    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # Dummy random values are immediately overwritten column by column below.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    # One row per (declaration, pac slot); foyers with fewer pac yield empty slots
    # that are dropped just below.
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    # NOTE(review): legacy pandas API (.sort(columns=...), duplicated(take_last=...))
    # used throughout — this module targets an old pandas version.
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filter: work only on F & G
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep those with distinct (declar, naia) pairs plus the twins,
    # then remove the others (both F and G of a duplicated pair).
    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    # Rows that were neither F/G nor H/I are kept unconditionally.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # NOTE(review): Python 2 semantics — zip() returns a list of tuples which pandas
    # stores as object cells; each key is a (birth year, truncated declar) pair.
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    # fip = dependents declared on a tax form but not matched in the eec tables.
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))
    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])

    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u" 2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))
    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv
    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    # Attach the declaring "vous" individual of each matching tax form to get
    # household ident and income variables for the fip rows.
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u" 2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    # Default eec-like variables for the synthetic fip individuals.
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG; no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()  # overwritten below from ident/noi
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    # lpr: 3 = child of the reference person if aged <= 20, else 4 = other.
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    # actrec: 9 = child under 16, 5 = student otherwise.
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */
    # TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis: clairement non

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]
    # Decrement noi of duplicated (noi, ident) pairs until all pairs are unique.
    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
""" Created on Mon Sep 28 11:25:40 2015 @author: malkaguillot """ import pandas from openfisca_france_data import default_config_files_directory as config_files_directory from openfisca_france_data.input_data_builders.build_openfisca_survey_data.base \ import year_specific_by_generic_data_frame_name from openfisca_france_data.temporary import get_store from openfisca_survey_manager.survey_collections import SurveyCollection # En entrée : tables individus, foyer et sif de ERFS (testé sur 2009) year = 2009 year_specific_by_generic = year_specific_by_generic_data_frame_name(year) erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory) survey = erfs_survey_collection.get_survey('erfs_{}'.format(year)) foyer_all = survey.get_values(table = year_specific_by_generic["foyer"]) erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"]) temporary_store = get_store(file_name = 'erfs') sif = temporary_store['sif'] ind = erfind[['ident', 'noindiv', 'declar1', 'declar2', 'zsali', 'persfip', 'persfipd']] small_sif = sif[['noindiv', 'declar', 'causeXYZ']] foyer = foyer_all[['ident', 'noindiv', 'declar', 'sif', '_1aj', '_1bj', '_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy' ]] foyer = foyer.drop(['_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy'], axis=1) foyer_sif = pandas.merge(foyer, small_sif, on = ['declar', 'noindiv'])
def create_enfants_a_naitre(temporary_store=None, year=None):
    '''
    Builds the table of children to be born ("enfants à naître", quelfic ENF_NN)
    from the eec complementary tables and stores it as
    'enfants_a_naitre_{year}' in temporary_store.

    Kept individuals are those born in the last quarter of the survey year or
    the first five months of the following year.
    '''
    assert temporary_store is not None
    assert year is not None

    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    # Children to be born (NN for "nouveaux nés", newborns)
    individual_vars = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noimer',
        'noindiv',
        'noiper',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
        ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # The three eec complementary waves are stacked with outer merges.
    eeccmp1 = survey.get_values(table=year_specific_by_generic["eec_cmp_1"], variables=individual_vars)
    eeccmp2 = survey.get_values(table=year_specific_by_generic["eec_cmp_2"], variables=individual_vars)
    eeccmp3 = survey.get_values(table=year_specific_by_generic["eec_cmp_3"], variables=individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how="outer")
    enfants_a_naitre = tmp.merge(eeccmp3, how="outer")

    # Type optimisation?  Existence check in passing; why not ints when possible?
    # TODO: minimal dtype
    # TODO: shoudln't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars, tmp
    gc.collect()

    # Variable creation: defaults for fields expected downstream.
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    enfants_a_naitre.year = enfants_a_naitre.year.astype("float32")  # TODO: should be an integer but NaN are present
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    # Born in July or later: not yet one "fiscal" year old.
    enfants_a_naitre.loc[enfants_a_naitre.naim >= 7, 'agepf'] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""

    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")

    # Selection: keep births from October of the survey year through May of the
    # following year.
    enfants_a_naitre = enfants_a_naitre[(
        (enfants_a_naitre.naia == enfants_a_naitre.year) & (enfants_a_naitre.naim >= 10)
        ) | (
        (enfants_a_naitre.naia == enfants_a_naitre.year + 1) & (enfants_a_naitre.naim <= 5)
        )].copy()
    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
def create_indivim_menagem(temporary_store=None, year=None):
    """
    Création des tables ménages et individus concaténée (merged).

    Merges the erf and eec menage tables into 'menagem_{year}' and the erf and
    eec individual tables into 'indivim_{year}', both written to
    temporary_store.  Also recodes 'actrec' (activity, as recommended by INSEE,
    p. 84 of the user guide) and derives the 'locataire' flag.
    """
    # Prepare the some useful merged tables
    assert temporary_store is not None
    assert year is not None
    # load data
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table=year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table=year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table=year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table=year_specific_by_generic["eec_indivi"])

    # Consistency check between the sources: eec rows with no erf counterpart.
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()
    # BUG FIX: the individual-level mask must filter the *individual* table.
    # The original filtered eecmen with a mask of length len(eecind), which is
    # either an error or selects wrong rows.
    noappar_i = eecind[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset='ident', take_last=True)
    # TODO: vérifier qu'il n'y a théoriquement pas de doublon
    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(
        len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()

    # Merge the employment survey (eec) with the fiscal source (erf).
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on=['noindiv', 'ident', 'noi'], how="inner")

    # Type optimisation?  Existence check in passing.
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = ([
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'titc',
        'txtppb',
        ])
    for var in var_list:
        assert numpy.issubdtype(
            indivim[var].dtype, numpy.integer
            ), "Variable {} dtype is {} and should be an integer".format(
                var, indivim[var].dtype)

    ########################
    # création de variables#
    ########################
    # actrec: activity recoded as recommended by INSEE, p. 84 of the user guide.
    indivim["actrec"] = numpy.nan
    # Note: no category 6 — the INSEE recoded variable (see p. 84 of the
    # methodological guide) has none; the same nomenclature is adopted here.
    # Canonical .loc[mask, col] writes instead of chained indexing
    # (indivim.actrec.loc[...] = x), which relies on the view semantics of old
    # pandas and raises SettingWithCopy warnings on newer versions.
    # 3: fixed-term contract ("contrat à durée déterminée")
    indivim.loc[indivim.acteu == 1, "actrec"] = 3
    # 8: homemaker, other inactive
    indivim.loc[indivim.acteu == 3, "actrec"] = 8
    # 1: employed, non-salaried (own account or for a family member)
    filter1 = (indivim.acteu == 1) & (indivim.stc.isin([1, 3]))
    indivim.loc[filter1, "actrec"] = 1
    # 2: salaried with no time limit
    filter2 = (indivim.acteu == 1) & (((indivim.stc == 2) & (indivim.contra == 1)) | (indivim.titc == 2))
    indivim.loc[filter2, "actrec"] = 2
    # 4: unemployed
    filter4 = (indivim.acteu == 2) | ((indivim.acteu == 3) & (indivim.mrec == 1))
    indivim.loc[filter4, "actrec"] = 4
    # 5: pupil, student, unpaid trainee
    filter5 = (indivim.acteu == 3) & ((indivim.forter == 2) | (indivim.rstg == 1))
    indivim.loc[filter5, "actrec"] = 5
    # 7: retired, early-retired, withdrawn from business — unchecked
    filter7 = (indivim.acteu == 3) & ((indivim.retrai == 1) | (indivim.retrai == 2))
    indivim.loc[filter7, "actrec"] = 7
    # 9: probably children under 16   TODO: check that fact in database and questionnaire
    indivim.loc[indivim.acteu == 0, "actrec"] = 9

    indivim.actrec = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(1, 10)).all(), 'actrec values are outside the interval [1, 9]'

    # TODO : compare the result with results provided by Insee
    # tu99
    if year == 2009:
        erfind['tu99'] = None  # TODO: why ?

    # Locataire (tenant): housing status codes 3, 4, 5.
    menagem["locataire"] = menagem.so.isin([3, 4, 5])
    assert_dtype(menagem.locataire, "bool")

    # Propagate the reference person's diploma to the menage level.
    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menagem = menagem.merge(transfert)

    # Correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the
        controls throughout the procedure.
        It is here to remove all these individual errors that compromise the
        process.
        '''
        if year == 2006:
            indivim.loc[indivim.noindiv == 603018905, "lien"] = 2
            indivim.loc[indivim.noindiv == 603018905, "noimer"] = 1
            log.info("{}".format(indivim[indivim.noindiv == 603018905].to_string()))

    _manually_remove_errors()

    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def foyer_all(temporary_store=None, year=None):
    # Builds the individualized foyer-variable table 'foy_ind_{year}' from the
    # raw foyer (tax-return) table and stores it, together with
    # 'ind_vars_to_remove_{year}', in temporary_store.
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Load the tax-return boxes ("cases de la déclaration").
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table=year_specific_by_generic["foyer"])
    # Keep only the return-box columns ('_xzz', i.e. matching ^_[0-9][a-z]{2})
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # Rename each variable to fxzz (^f[0-9][a-z]{2})
    renamed_variables = ["f{}".format(x[1:]) for x in variables]
    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns=dict(zip(variables, renamed_variables)), inplace=True)
    # Aggregate the returns when an individual filed several of them.
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)

    # Individualizable variables: each key maps to the foyer boxes holding that
    # variable for, in order, vous / conj / pac1 / pac2 / pac3.
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'hsup': ['f1au', 'f1bu', 'f1cu', 'f1du', 'f1eu'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (moving house) 2006 only
        }

    # Add the section 6/7/8 boxes resolved for this year.
    cases_f6_f7_f8 = build_cerfa_fields_by_column_name(
        year=year, sections_cerfa=[6, 7, 8])
    var_dict.update(cases_f6_f7_f8)

    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))
    log.info(u"From {} variables, we keep {} eligibles variables".format(
        len(set().union(*vars_sets)),
        len(eligible_vars),
        ))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']

    # err = 0
    # err_vars = {}

    foy_ind = DataFrame()
    # NOTE(review): Python 2 API (dict.iteritems) — consistent with the rest of
    # the module.
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list to the boxes actually present in foyer.
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence) if present is True
                    ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()

        # Reshape the dataframe: one row per (quifoy, noindiv) pair; box columns
        # become the 'quifoy' level via stack().
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()

        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)

    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)
    print_id(foy_ind)
    # Recode quifoy labels to the numeric convention 0..4.
    foy_ind.quifoy.loc[foy_ind.quifoy == 'vous'] = 0
    foy_ind.quifoy.loc[foy_ind.quifoy == 'conj'] = 1
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac1'] = 2
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac2'] = 3
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac3'] = 4
    assert foy_ind.quifoy.isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind
    return
def sif(temporary_store=None, year=None):
    # Parses the fixed-width 'sif' string of each tax return into individual
    # columns (marital status, birth years, check boxes, X/Y/Z event dates,
    # dependent counts) and stores the result as 'sif_{year}' in
    # temporary_store.  Field offsets shift by 'd' depending on the year's
    # declaration layout.
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    erfs_survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info("05_foyer: extraction des données foyer")
    # TODO: how to choose the rfr n-2 for the TH exemption?
    # mnrvka: TH income n-2
    # mnrvkh: TH income (revenu fiscal de référence)
    #
    # Retrieve the variables of the sif code.
    sif = erfs_survey.get_values(variables=[
        "noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg", "declar"
        ], table=year_specific_by_generic["foyer"])
    sif['statmarit'] = 0

    if year == 2009:
        # Manual repair of two malformed 2009 sif strings (one character off).
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif.sif.loc[sif['noindiv'] == 901803201] = new_sif.values
        old_sif = sif.sif.loc[sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif.sif.loc[sif['noindiv'] == 900872201] = new_sif.values
        del old_sif, new_sif

    # Apply the sign flag tsrvbg ('+'/'-') to the gross income rbg.
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) - (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]

    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif.statmarit.loc[sif.stamar == key] = val

    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"
    # d: layout offset relative to the 2006/2007 declaration.
    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"
    if year in [2008]:
        d = -1  # end of box L
        # NOTE(review): no caseL is extracted for 2008 — confirm whether this is
        # intentional (box L removed that year) or an omission.
        sif["caseP"] = sif.sif.str[20 + d:21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d:22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d:23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d:24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d:28 + d]
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"
    if year in [2009]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        # caseH dropped compared to 2008 (but box L added), hence the shift
        # relative to 2006.
        d = -4
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"

    sif["caseX"] = sif.sif.str[33 + d:34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d:42 + d]
    sif["caseY"] = sif.sif.str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d:51 + d]
    sif["caseZ"] = sif.sif.str[51 + d:52 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d:60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d:61 + d]
    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()

    # nbptr is stored multiplied by 100 in the raw data.
    sif["nbptr"] = sif.nbptr.values / 100
    sif["rfr_n_2"] = sif.mnrvka.values

    # Dependent counts by pac type (two characters each).
    sif["nbF"] = sif.sif.str[64 + d:66 + d]
    sif["nbG"] = sif.sif.str[67 + d:69 + d]
    sif["nbR"] = sif.sif.str[70 + d:72 + d]
    sif["nbJ"] = sif.sif.str[73 + d:75 + d]
    sif["nbN"] = sif.sif.str[76 + d:78 + d]
    sif["nbH"] = sif.sif.str[79 + d:81 + d]
    sif["nbI"] = sif.sif.str[82 + d:84 + d]
    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d:87 + d]

    del sif["stamar"]

    # Flag individuals appearing on several declarations; 'change' holds the
    # 28th character of the declar code for those rows.
    duplicated_noindiv = sif.noindiv[sif.noindiv.duplicated()].copy()
    sif['duplicated_noindiv'] = sif.noindiv.isin(duplicated_noindiv)
    # NOTE(review): 'x' below is computed but never used — kept for behavior
    # parity; candidate for removal.
    x = sif.loc[sif.duplicated_noindiv, ['noindiv', 'declar']]
    sif['change'] = "NONE"
    sif.loc[sif.duplicated_noindiv, 'change'] = sif.loc[sif.duplicated_noindiv, 'declar'].str[27:28]

    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of duplicated individuals: {}".format(
        len(duplicated_noindiv)))
    log.info("Number of distinct individuals: {}".format(
        len(sif.noindiv.value_counts())))
    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def foyer_all(temporary_store = None, year = None):
    """Build the individualized 'foy_ind' table from the ERFS 'foyer' table.

    Keeps only the tax-return boxes ('cases', columns matching ^_[0-9][a-z]{2}),
    renames them to f-prefixed names, sums multiple declarations of a same
    individual, then reshapes the individualizable boxes (one column per 'qui'
    position: vous/conj/pac1..pac3) into one row per (noindiv, quifoy) pair.
    Saves the result under 'foy_ind_{year}' in temporary_store.
    """
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Load the tax-return ('déclaration') boxes.
    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table = year_specific_by_generic["foyer"])
    # Keep only the declaration boxes ('_xzz', i.e. ^_[0-9][a-z]{2}).
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # Rename the boxes to fxzz, i.e. ^f[0-9][a-z]{2}.
    renamed_variables = ["f{}".format(x[1:]) for x in variables]
    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns = dict(zip(variables, renamed_variables)), inplace = True)
    # Aggregate declarations when an individual filed several of them.
    foyer = foyer.groupby("noindiv", as_index = False).aggregate(numpy.sum)
    print_id(foyer)
    # Individualizable variables: each entry maps an individual-level variable
    # to the list of foyer boxes for positions vous/conj/pac1/pac2/pac3.
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'hsup': ['f1au', 'f1bu', 'f1cu', 'f1du', 'f1eu'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (moving) 2006 only
        }
    # Add the section 6/7/8 boxes computed from the cerfa description.
    cases_f6_f7_f8 = build_cerfa_fields_by_column_name(year = year, sections_cerfa = [6, 7, 8])
    var_dict.update(cases_f6_f7_f8)
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    # Boxes actually present in the foyer table.
    eligible_vars = (set().union(*vars_sets)).intersection(set(list(foyer.columns)))
    log.info(
        u"From {} variables, we keep {} eligibles variables".format(
            len(set().union(*vars_sets)),
            len(eligible_vars),
            )
        )
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    # err = 0
    # err_vars = {}
    foy_ind = DataFrame()
    # Python 2 pandas-era idiom: dict.iteritems().
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list to the boxes that do exist.
                foyer_vars_cleaned = [var for var, present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()
        # Reshape the dataframe: one row per (quifoy, noindiv), dropping zeros.
        selection.rename(columns = dict(zip(foyer_vars, qui)), inplace = True)
        selection.set_index("noindiv", inplace = True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()
        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis = 1, join = 'outer')
    foy_ind.reset_index(inplace = True)
    # Remember which foyer boxes were individualized, for later removal.
    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns = {"noindiv": "idfoy"}, inplace = True)
    print_id(foy_ind)
    # Recode the position in the foyer to integers 0-4.
    # NOTE(review): chained `.quifoy.loc[...] = x` assignment relies on
    # legacy pandas returning a view here — confirm before upgrading pandas.
    foy_ind.quifoy.loc[foy_ind.quifoy == 'vous'] = 0
    foy_ind.quifoy.loc[foy_ind.quifoy == 'conj'] = 1
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac1'] = 2
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac2'] = 3
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac3'] = 4
    assert foy_ind.quifoy.isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind
    return
def sif(temporary_store = None, year = None):
    """Extract and decode the 'sif' string of the ERFS foyer table.

    The 'sif' column is a fixed-width string encoding the tax-return header:
    marital status, birth years, checkbox cases (E, F, G, K, L, N, P, S, T,
    W, ...), the X/Y/Z life-event dates, and the counts of each category of
    dependent (nbF, nbG, ...).  Field positions shift between survey years,
    which the offset `d` accounts for.  The decoded table is saved under
    'sif_{year}' in temporary_store.

    Fix over the previous revision: the unused local `x` (a throwaway
    `.loc` selection of duplicated rows) has been removed; it had no effect.
    """
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    erfs_survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info("05_foyer: extraction des données foyer")
    # TODO: how should the n-2 rfr be chosen for the TH exemption?
    # mnrvka: n-2 income for the taxe d'habitation
    # mnrvkh: TH income (revenu fiscal de référence)
    #
    # Fetch the variables of the sif code.
    sif = erfs_survey.get_values(
        variables = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg", "declar"],
        table = year_specific_by_generic["foyer"]
        )
    sif['statmarit'] = 0
    if year == 2009:
        # Manual repair of two malformed sif strings in the 2009 vintage:
        # one has a spurious character removed, the other a missing one inserted.
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif.sif.loc[sif['noindiv'] == 901803201] = new_sif.values
        old_sif = sif.sif.loc[sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif.sif.loc[sif['noindiv'] == 900872201] = new_sif.values
        del old_sif, new_sif
    # Sign rbg according to the tsrvbg flag ('+' keeps it, '-' negates it).
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) - (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]
    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif.statmarit.loc[sif.stamar == key] = val
    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"
    # `d` is the year-dependent offset applied to all later field positions.
    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"
    if year in [2008]:
        d = - 1  # end of case L
        sif["caseP"] = sif.sif.str[20 + d: 21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d: 22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d: 23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d: 24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d: 28 + d]
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"
    if year in [2009]:
        sif["caseL"] = sif.sif.str[19: 20] == "L"
        sif["caseP"] = sif.sif.str[20: 21] == "P"
        sif["caseS"] = sif.sif.str[21: 22] == "S"
        sif["caseW"] = sif.sif.str[22: 23] == "W"
        sif["caseN"] = sif.sif.str[23: 24] == "N"
        # caseH removed compared to 2008 (but case L added),
        # hence the shift relative to 2006.
        d = -4
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"
    # Common fields, positions shifted by the year-dependent offset `d`.
    sif["caseX"] = sif.sif.str[33 + d: 34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d: 42 + d]
    sif["caseY"] = sif.sif.str[42 + d: 43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d: 51 + d]
    sif["caseZ"] = sif.sif.str[51 + d: 52 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d: 60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d: 61 + d]
    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()
    # nbptr is stored as hundredths of a part — TODO confirm.
    sif["nbptr"] = sif.nbptr.values / 100
    sif["rfr_n_2"] = sif.mnrvka.values
    # Counts of dependents per category.
    sif["nbF"] = sif.sif.str[64 + d: 66 + d]
    sif["nbG"] = sif.sif.str[67 + d: 69 + d]
    sif["nbR"] = sif.sif.str[70 + d: 72 + d]
    sif["nbJ"] = sif.sif.str[73 + d: 75 + d]
    sif["nbN"] = sif.sif.str[76 + d: 78 + d]
    sif["nbH"] = sif.sif.str[79 + d: 81 + d]
    sif["nbI"] = sif.sif.str[82 + d: 84 + d]
    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d: 87 + d]
    del sif["stamar"]
    # Flag individuals appearing on several declarations; for those, record
    # the 28th character of the declaration as the 'change' marker.
    duplicated_noindiv = sif.noindiv[sif.noindiv.duplicated()].copy()
    sif['duplicated_noindiv'] = sif.noindiv.isin(duplicated_noindiv)
    sif['change'] = "NONE"
    sif.loc[sif.duplicated_noindiv, 'change'] = sif.loc[sif.duplicated_noindiv, 'declar'].str[27:28]
    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of duplicated individuals: {}".format(len(duplicated_noindiv)))
    log.info("Number of distinct individuals: {}".format(len(sif.noindiv.value_counts())))
    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def build_erf_data_frames(self):
    """Fetch the requested variables from the ERF/EEC tables and store the
    resulting ménage and individual data frames on
    `self.erf_data_frame_by_entity_key_plural`.

    For each variable, the year-specific table(s) containing it are located,
    values are fetched, columns are renamed to openfisca names (erf2of) and
    'ident' is renamed to 'idmen'.
    """
    # TODO: remove this
    self.columns_to_fetch = ['af']
    variables = self.columns_to_fetch
    erf_survey_collection = SurveyCollection.load(
        collection="erfs", config_files_directory=config_files_directory)
    # NOTE(review): `year` is not defined in this method — presumably a module
    # global or meant to be `self.year`; confirm against the enclosing class.
    erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year))
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Inverse mapping: year-specific table name -> generic table name.
    generic_by_year_specific = dict(
        zip(year_specific_by_generic.values(), year_specific_by_generic.keys()))
    # Always fetch the identifiers/weights on top of the requested variables.
    erf_variables = list(set(variables + ["ident", "wprm", "quelfic", "noi"]))
    # Translate openfisca variable names to ERF names where a mapping exists.
    of2erf = get_of2erf()
    for index, variable in enumerate(erf_variables):
        if variable in of2erf:
            erf_variables[index] = of2erf[variable]
    data_frame_by_table = dict(eec_indivi=None, erf_indivi=None, erf_menage=None)
    erf_variables_by_generic_table = dict(eec_indivi=[], erf_indivi=[], erf_menage=[])
    # For each variable, the year-specific tables (among the three generic
    # ones above) in which the survey says it can be found.
    year_specific_tables_by_erf_variable = dict([
        (erf_variable, set(erf_survey.find_tables(variable=erf_variable)).intersection(
            set([
                year_specific_by_generic[key]
                for key in erf_variables_by_generic_table.keys()
                ])))
        for erf_variable in erf_variables
        ])
    for variable, year_specific_tables in year_specific_tables_by_erf_variable.iteritems():
        if len(year_specific_tables) < 1:
            log.info("No tables are present for variable {}".format(variable))
            continue
        else:
            # NOTE(review): this message also fires when the variable is found
            # in exactly one table, not only "multiple" ones.
            log.info("Variable {} is present in multiple tables : {}".format(
                variable, year_specific_tables))
            for table in year_specific_tables:
                log.info("Variable {} is retrieved from table {}".format(variable, table))
                erf_variables_by_generic_table[generic_by_year_specific[table]].append(variable)
    # Fetch values table by table and normalize the column names.
    erf2of = get_erf2of()
    for table, erf_variables in erf_variables_by_generic_table.iteritems():
        if erf_variables:
            data_frame_by_table[table] = erf_survey.get_values(
                variables=erf_variables, table=year_specific_by_generic[table])
            data_frame_by_table[table].rename(columns=erf2of, inplace=True)
            data_frame_by_table[table].rename(columns={'ident': 'idmen'}, inplace=True)
    assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
    self.erf_data_frame_by_entity_key_plural = dict(
        menages=data_frame_by_table["erf_menage"],
        individus=data_frame_by_table["erf_indivi"].merge(data_frame_by_table["eec_indivi"]))
def create_fip(temporary_store=None, year=None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # fip: fichier d'imposition des personnes
    assert temporary_store is not None
    assert year is not None
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info(u"Démarrage de 03_fip")
    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and
    # year of birth (example: 'F1990H1992').  When a child is invalid, he
    # appears twice in anaisenf (example: F1990G1990 is a single invalid child
    # born in 1990).
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table=year_specific_by_generic["foyer"], variables=erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace=True)
    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac takes 5 characters (1 letter + 4-digit year), hence the /5.
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))
    # Separating the string coding the pac of each "déclaration".
    # Creating the multi_index for the columns: (pac_number, variable).
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list
    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    # Random values are immediately overwritten below; the DataFrame only
    # provides the shape.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns=columns)
    for i in range(1, nb_pac_max + 1):
        # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * i].values
    # One row per (foyer, pac_number).
    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    fip.drop(['level_0'], axis=1, inplace=True)
    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns=['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace=True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis=1, inplace=True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"
    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])
    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    # Keep F/G pairs with distinct declar/naia, plus twins; drop the rest
    # (an F duplicated by a G is the same invalid child coded twice).
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Only work on F & G
    type_FG['same_pair'] = type_FG.duplicated(subset=['declaration', 'naia'], take_last=True)
    type_FG['is_twin'] = type_FG.duplicated(subset=['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    # Same de-duplication logic for the H/I pair.
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset=['declaration', 'naia'], take_last=True)
    type_HI['is_twin'] = type_HI.duplicated(subset=['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values
    fip.update(type_HI)
    # Types J, N, R were never flagged: keep them.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip)))
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    # # control(indivifip, debug=True)
    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"
    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Match keys: (birth year, first 29 chars of the declaration string).
    # Python 2 zip builds lists of tuples here.
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)
    # fip = pac declared on tax forms but matching no eec individual.
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()
    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))
    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))
    del pac_ind1['key1'], pac_ind2['key2']
    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna=False)))
    log.info(u" 2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))
    del pacInd["key"]
    # One row per individual.
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))
    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv
    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()
    # We keep the fip in the menage of their parents because it is used in to
    # build the famille.  We should build an individual ident (ménage) for the
    # fip that are older than 18 since they are not in their parents' menage
    # according to the eec.
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy
    # Declarants ("vous") whose first declaration matches a fip declaration.
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns={'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    log.info(u" 2.3 : fip1 created")
    # Same with the second declaration.
    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns={'declar2': 'declaration'}, inplace=True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")
    # NOTE(review): the two statements below compute values that are never
    # used (no assignment, no log) — dead code kept for fidelity.
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()
    fip = concat([fip1, fip2])
    # Fill the eec-like columns with defaults for the synthetic individuals.
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG; pas de colonne année dans la DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    # lpr: 3 (child) up to age 20, 4 (other) above.
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    # actrec: 9 up to age 15, 5 above.
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5
    # TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas
    # s'ils sont étudiants ou salariés */
    # TODO problème avec les mois des enfants FIP : voir si on ne peut pas
    # remonter à ces valeurs: Alexis: clairement non
    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")
    # NOTE(review): this pre-loop assignment is dead — fip_tmp is rebound at
    # the top of each loop iteration before use.
    fip_tmp = fip[['noi', 'ident']]
    # Decrement noi of duplicated (noi, ident) pairs until all are unique.
    while any(fip.duplicated(subset=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0
    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def create_indivim_menagem(temporary_store = None, year = None):
    """
    Create the merged ménage (menagem) and individual (indivim) tables.

    Merges the ERF and EEC ménage tables into 'menagem_{year}' and the ERF and
    EEC individual tables into 'indivim_{year}', builds the recoded activity
    variable `actrec` (INSEE recoding, see p84 of the user guide) and the
    `locataire` flag, applies a couple of manual data fixes, and saves both
    tables in `temporary_store`.

    Fix over the previous revision: `noappar_i` (EEC individuals absent from
    the ERF) was built by indexing `eecmen` with a boolean mask computed from
    `eecind` — a mask of the wrong length applied to the wrong table.  It is
    now built from `eecind`, mirroring how `noappar_m` is built from `eecmen`.
    """
    assert temporary_store is not None
    assert year is not None
    # Load data
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table = year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table = year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table = year_specific_by_generic["eec_indivi"])
    # Consistency between the two sources: EEC households (resp. individuals)
    # with no counterpart in the ERF tables.
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()
    noappar_i = eecind[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset = 'ident', take_last = True)
    # TODO: check that there is theoretically no duplicate
    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()
    # Merge the employment survey (EEC) with the fiscal source (ERF).
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how = "inner")
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = ([
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'titc',
        'txtppb',
        ])
    for var in var_list:
        assert numpy.issubdtype(indivim[var].dtype, numpy.integer), \
            "Variable {} dtype is {} and should be an integer".format(var, indivim[var].dtype)
    ########################
    # création de variables#
    ########################
    # actrec: activity recoded as recommended by INSEE, p84 of the user guide.
    indivim["actrec"] = numpy.nan
    # Note: no code 6 — the INSEE recoding itself has none; the same
    # nomenclature is adopted here.
    # 3: fixed-term contract
    indivim.actrec.loc[indivim.acteu == 1] = 3
    # 8: homemaker, other inactive
    indivim.actrec.loc[indivim.acteu == 3] = 8
    # 1: employed, non-salaried (own account or for a family member)
    filter1 = (indivim.acteu == 1) & (indivim.stc.isin([1, 3]))
    indivim.actrec.loc[filter1] = 1
    # 2: salaried, open-ended
    filter2 = (indivim.acteu == 1) & (((indivim.stc == 2) & (indivim.contra == 1)) | (indivim.titc == 2))
    indivim.actrec.loc[filter2] = 2
    # 4: unemployed
    filter4 = (indivim.acteu == 2) | ((indivim.acteu == 3) & (indivim.mrec == 1))
    indivim.actrec.loc[filter4] = 4
    # 5: pupil, student, unpaid trainee
    filter5 = (indivim.acteu == 3) & ((indivim.forter == 2) | (indivim.rstg == 1))
    indivim.actrec.loc[filter5] = 5
    # 7: retired, early-retired, withdrawn from business — unchecked
    filter7 = (indivim.acteu == 3) & ((indivim.retrai == 1) | (indivim.retrai == 2))
    indivim.actrec.loc[filter7] = 7
    # 9: probably children under 16 — TODO: check that fact in database and questionnaire
    indivim.actrec.loc[indivim.acteu == 0] = 9
    indivim.actrec = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(1, 10)).all(), 'actrec values are outside the interval [1, 9]'
    # TODO: compare the result with results provided by Insee
    # tu99
    if year == 2009:
        erfind['tu99'] = None  # TODO: why ?
    # Tenant flag from the occupancy status.
    menagem["locataire"] = menagem.so.isin([3, 4, 5])
    assert_dtype(menagem.locataire, "bool")
    # Bring the reference person's diploma onto the ménage table.
    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menagem = menagem.merge(transfert)

    # Correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the
        controls throughout the procedure.
        It is here to remove all these individual errors that compromise the
        process.
        '''
        if year == 2006:
            indivim.lien[indivim.noindiv == 603018905] = 2
            indivim.noimer[indivim.noindiv == 603018905] = 1
            log.info("{}".format(indivim[indivim.noindiv == 603018905].to_string()))

    _manually_remove_errors()
    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def create_enfants_a_naitre(temporary_store = None, year = None):
    '''
    Collect the "enfants à naître" (NN, newborns) from the three complementary
    EEC tables and store them under "enfants_a_naitre_{year}".
    '''
    assert temporary_store is not None
    assert year is not None
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Individual-level columns fetched from each complementary table.
    wanted_columns = [
        'acteu', 'agepr', 'cohab', 'contra', 'forter', 'ident', 'lien', 'lpr',
        'mrec', 'naia', 'naim', 'noi', 'noicon', 'noimer', 'noindiv', 'noiper',
        'retrai', 'rga', 'rstg', 'sexe', 'stc', 'titc',
        ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    frames = [
        survey.get_values(table = year_specific_by_generic[generic_name], variables = wanted_columns)
        for generic_name in ('eec_cmp_1', 'eec_cmp_2', 'eec_cmp_3')
        ]
    # Outer-merge the three sources so that no record is dropped.
    newborns = frames[0].merge(frames[1], how = "outer").merge(frames[2], how = "outer")
    # TODO: minimal dtype; why not ints when possible? shouldn't be here
    for column in wanted_columns:
        assert_dtype(newborns[column], 'float')
    del frames, wanted_columns
    gc.collect()
    # New variables with default values for these synthetic individuals.
    newborns['declar1'] = ''
    newborns['noidec'] = 0
    newborns['ztsai'] = 0
    newborns['year'] = year
    newborns.year = newborns.year.astype("float32")  # TODO: should be an integer but NaN are present
    newborns['agepf'] = newborns.year - newborns.naia
    born_second_half = newborns.naim >= 7
    newborns.loc[born_second_half, 'agepf'] = newborns.loc[born_second_half, 'agepf'] - 1
    newborns['actrec'] = 9
    newborns['quelfic'] = 'ENF_NN'
    newborns['persfip'] = ""
    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(newborns[series_name], "int")
    # Keep only children born from October of `year` through May of `year + 1`.
    born_end_of_year = (newborns.naia == newborns.year) & (newborns.naim >= 10)
    born_start_of_next = (newborns.naia == newborns.year + 1) & (newborns.naim <= 5)
    newborns = newborns[born_end_of_year | born_start_of_next].copy()
    temporary_store["enfants_a_naitre_{}".format(year)] = newborns
""" Created on Mon Sep 28 11:25:40 2015 @author: malkaguillot """ import pandas from openfisca_france_data import default_config_files_directory as config_files_directory from openfisca_france_data.input_data_builders.build_openfisca_survey_data.base \ import year_specific_by_generic_data_frame_name from openfisca_france_data.temporary import get_store from openfisca_survey_manager.survey_collections import SurveyCollection # En entrée : tables individus, foyer et sif de ERFS (testé sur 2009) year = 2009 year_specific_by_generic = year_specific_by_generic_data_frame_name(year) erfs_survey_collection = SurveyCollection.load( collection='erfs', config_files_directory=config_files_directory) survey = erfs_survey_collection.get_survey('erfs_{}'.format(year)) foyer_all = survey.get_values(table=year_specific_by_generic["foyer"]) erfind = survey.get_values(table=year_specific_by_generic["erf_indivi"]) temporary_store = get_store(file_name='erfs') sif = temporary_store['sif'] ind = erfind[[ 'ident', 'noindiv', 'declar1', 'declar2', 'zsali', 'persfip', 'persfipd' ]] small_sif = sif[['noindiv', 'declar', 'causeXYZ']] foyer = foyer_all[[