def build_erf_data_frames(self):
    """Fetch the requested ERF/EEC variables and build per-entity data frames.

    Locates each requested variable in the year-specific survey tables,
    renames columns to OpenFisca names, and stores the result in
    ``self.erf_data_frame_by_entity_key_plural`` (keys: ``menages``,
    ``individus``).

    NOTE(review): ``year`` and ``config_files_directory`` are free
    variables here (not parameters, not read from ``self``) — presumably
    module-level globals; confirm before reuse.
    """
    # TODO: remove this
    self.columns_to_fetch = ['af']
    variables = self.columns_to_fetch
    erf_survey_collection = SurveyCollection.load(
        collection = "erfs", config_files_directory = config_files_directory)
    erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year))
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Invert the mapping: year-specific table name -> generic table name.
    generic_by_year_specific = dict(zip(year_specific_by_generic.values(), year_specific_by_generic.keys()))
    # Always fetch the identifier/weight/origin columns in addition to the
    # requested variables.
    erf_variables = list(set(variables + ["ident", "wprm", "quelfic", "noi"]))
    of2erf = get_of2erf()
    # Translate OpenFisca variable names to their ERF counterparts in place.
    for index, variable in enumerate(erf_variables):
        if variable in of2erf:
            erf_variables[index] = of2erf[variable]
    data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None)
    erf_variables_by_generic_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = [])
    # For each variable, the set of year-specific tables (restricted to the
    # three generic tables above) in which the survey provides it.
    year_specific_tables_by_erf_variable = dict(
        [
            (
                erf_variable,
                set(
                    erf_survey.find_tables(variable = erf_variable)
                    ).intersection(
                    set([year_specific_by_generic[key] for key in erf_variables_by_generic_table.keys()])
                    )
                ) for erf_variable in erf_variables
            ]
        )
    for variable, year_specific_tables in year_specific_tables_by_erf_variable.iteritems():
        if len(year_specific_tables) < 1:
            log.info("No tables are present for variable {}".format(variable))
            continue
        else:
            # NOTE(review): this message is also emitted when the variable is
            # found in exactly one table, so "multiple" can be misleading.
            log.info("Variable {} is present in multiple tables : {}".format(variable, year_specific_tables))
            for table in year_specific_tables:
                log.info("Variable {} is retrieved from table {}".format(variable, table))
                erf_variables_by_generic_table[generic_by_year_specific[table]].append(variable)
    erf2of = get_erf2of()
    # Load each table that holds at least one variable, then rename columns
    # back to OpenFisca names and 'ident' to 'idmen'.
    for table, erf_variables in erf_variables_by_generic_table.iteritems():
        if erf_variables:
            data_frame_by_table[table] = erf_survey.get_values(
                variables = erf_variables, table = year_specific_by_generic[table]
                )
            data_frame_by_table[table].rename(columns = erf2of, inplace = True)
            data_frame_by_table[table].rename(columns = {'ident': 'idmen'}, inplace = True)
    assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
    self.erf_data_frame_by_entity_key_plural = dict(
        menages = data_frame_by_table["erf_menage"],
        individus = data_frame_by_table["erf_indivi"].merge(data_frame_by_table["eec_indivi"])
        )
def merge_tables(temporary_store = None, year = None):
    """Build the merged household (menagem) and individual (indivim) tables.

    Merges the ERF fiscal source with the EEC employment survey, checks
    consistency between the two, derives a few variables, and saves the
    results under ``menagem_{year}`` and ``indivim_{year}`` in the
    temporary store.
    """
    # Prepare the some useful merged tables
    assert temporary_store is not None
    assert year is not None
    # load data
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table = year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table = year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table = year_specific_by_generic["eec_indivi"])
    # Consistency check between the two sources: households/individuals of the
    # EEC with no match in the ERF.
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()
    # NOTE(review): the mask below is built from eecind/erfind but indexes
    # eecmen — likely a bug (should presumably be eecind[...]); confirm before
    # relying on noappar_i.
    noappar_i = eecmen[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset = 'ident', take_last = True)
    # TODO: check that there is theoretically no duplicate
    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()
    # Merge employment survey and fiscal source.
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how = "inner")
    # Variables expected to carry integer values after the merge.
    var_list = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'otitc',
        'txtppb',
        ]
    check_integer_dtype(indivim, var_list)
    create_actrec_variable(indivim)
    create_variable_locataire(menagem)
    # Attach the reference person's diploma (lpr == 1) to the household table.
    menagem = menagem.merge(
        indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
        )
    manually_remove_noindiv_errors(indivim)
    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def create_fip(temporary_store=None, year=None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    assert temporary_store is not None
    assert year is not None
    # FIP: "fichier d'imposition des personnes" (personal income tax file).
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info(u"Démarrage de 03_fip")
    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table=year_specific_by_generic["foyer"], variables=erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace=True)
    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac is encoded on 5 characters (1 letter + 4-digit birth year).
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))
    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.
    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')]
        multi_index_columns += pac_tuples_list
    columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable'])
    # The random values are placeholders; every cell is overwritten below.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns=columns)
    for i in range(1, nb_pac_max + 1):
        # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * i].values
    # One row per (declaration, pac slot).
    fip = fip.stack("pac_number")
    fip.reset_index(inplace=True)
    fip.drop(['level_0'], axis=1, inplace=True)
    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns=['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace=True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis=1, inplace=True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"
    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])
    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filter to work only on F & G
    type_FG['same_pair'] = type_FG.duplicated(subset=['declaration', 'naia'], take_last=True)
    type_FG['is_twin'] = type_FG.duplicated(subset=['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep those with distinct declar/naia pairs and the twins,
    # then drop the others (both F and G).
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset=['declaration', 'naia'], take_last=True)
    type_HI['is_twin'] = type_HI.duplicated(subset=['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values
    fip.update(type_HI)
    # Rows that are neither F/G nor H/I are kept by default.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip)))
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    # # control(indivifip, debug=True)
    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"
    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Python 2: zip returns a list of tuples, stored as an object column that
    # keys each pac by (birth year, truncated declaration id).
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)
    # fip = pac declared on tax forms but absent from both declarations' pac.
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()
    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))
    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))
    del pac_ind1['key1'], pac_ind2['key2']
    # NOTE(review): if both merges are empty, pacInd is never bound and the
    # assert below raises NameError — TODO confirm intended behaviour.
    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna=False)))
    log.info(u" 2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))
    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))
    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv
    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()
    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy
    # Declarants ("vous") whose first declaration matches a fip declaration.
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns={'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on='declaration')
    log.info(u" 2.3 : fip1 created")
    # Same with the second declaration.
    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns={'declar2': 'declaration'}, inplace=True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()
    fip = concat([fip1, fip2])
    # Fill the EEC-style columns that fip individuals are missing.
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    # lpr: 3 for children up to 20, 4 above.
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5
    # TODO: actrec problem for fip children between 16 and 20: we do not know
    # whether they are students or employed.
    # TODO: problem with the birth month of FIP children: see whether these
    # values can be recovered. Alexis: clearly not.
    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")
    fip_tmp = fip[['noi', 'ident']]
    # Decrement noi (starting at 99) until (noi, ident) pairs are unique.
    while any(fip.duplicated(subset=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0
    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def sif(temporary_store=None, year=None):
    """Decode the positional 'sif' string of each foyer fiscal.

    Extracts marital status, birth years, checkbox cases (E..Z), event
    dates and dependant counts from fixed character positions of the sif
    string, with a year-dependent offset ``d``, then saves the result as
    ``sif_{year}`` in the temporary store.
    """
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    erfs_survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info("05_foyer: extraction des données foyer")
    # TODO: how to choose the rfr n-2 used for the TH exemption?
    # mnrvka: TH income n-2
    # mnrvkh: TH income (revenu fiscal de référence)
    #
    # Fetch the sif-code variables.
    sif = erfs_survey.get_values(
        variables=["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg", "declar"],
        table=year_specific_by_generic["foyer"])
    sif['statmarit'] = 0
    if year == 2009:
        # Manual fixes of two malformed 2009 sif strings (hard-coded noindiv).
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif.sif.loc[sif['noindiv'] == 901803201] = new_sif.values
        old_sif = sif.sif.loc[sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif.sif.loc[sif['noindiv'] == 900872201] = new_sif.values
        del old_sif, new_sif
    # Apply the sign carried by tsrvbg ('+' or '-') to rbg.
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) - (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]
    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif.statmarit.loc[sif.stamar == key] = val
    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"
    # d: year-dependent offset of the remaining positions (0 for 2006-2007).
    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"
    if year in [2008]:
        d = -1  # end of case L
        sif["caseP"] = sif.sif.str[20 + d:21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d:22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d:23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d:24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d:28 + d]
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"
    if year in [2009]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        # caseH removed compared to 2008 (but case L added),
        # hence the shift relative to 2006.
        d = -4
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"
    sif["caseX"] = sif.sif.str[33 + d:34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d:42 + d]
    sif["caseY"] = sif.sif.str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d:51 + d]
    sif["caseZ"] = sif.sif.str[51 + d:52 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d:60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d:61 + d]
    # TODO: convert dateXYZ to appropriate date in pandas
    # print sif["dateY"].unique()
    sif["nbptr"] = sif.nbptr.values / 100
    sif["rfr_n_2"] = sif.mnrvka.values
    # Dependant counts by pac type.
    sif["nbF"] = sif.sif.str[64 + d:66 + d]
    sif["nbG"] = sif.sif.str[67 + d:69 + d]
    sif["nbR"] = sif.sif.str[70 + d:72 + d]
    sif["nbJ"] = sif.sif.str[73 + d:75 + d]
    sif["nbN"] = sif.sif.str[76 + d:78 + d]
    sif["nbH"] = sif.sif.str[79 + d:81 + d]
    sif["nbI"] = sif.sif.str[82 + d:84 + d]
    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d:87 + d]
    del sif["stamar"]
    duplicated_noindiv = sif.noindiv[sif.noindiv.duplicated()].copy()
    sif['duplicated_noindiv'] = sif.noindiv.isin(duplicated_noindiv)
    # NOTE(review): x is never used afterwards — probably a debugging leftover.
    x = sif.loc[sif.duplicated_noindiv, ['noindiv', 'declar']]
    sif['change'] = "NONE"
    sif.loc[sif.duplicated_noindiv, 'change'] = sif.loc[sif.duplicated_noindiv, 'declar'].str[27:28]
    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of duplicated individuals: {}".format(len(duplicated_noindiv)))
    log.info("Number of distinct individuals: {}".format(len(sif.noindiv.value_counts())))
    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def create_enfants_a_naitre(temporary_store = None, year = None):
    '''
    Build and store the table of children about to be born ("enfants à naître").

    Merges the three EEC complement tables, derives a few variables and keeps
    only births between October of the survey year and May of the next year.
    The result is saved under "enfants_a_naitre_{year}".
    '''
    assert temporary_store is not None
    assert year is not None
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Children about to be born (NN stands for newborns).
    individual_vars = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noimer',
        'noindiv',
        'noiper',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
        ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    eeccmp1 = survey.get_values(table = year_specific_by_generic["eec_cmp_1"], variables = individual_vars)
    eeccmp2 = survey.get_values(table = year_specific_by_generic["eec_cmp_2"], variables = individual_vars)
    eeccmp3 = survey.get_values(table = year_specific_by_generic["eec_cmp_3"], variables = individual_vars)
    tmp = eeccmp1.merge(eeccmp2, how = "outer")
    enfants_a_naitre = tmp.merge(eeccmp3, how = "outer")
    # Dtype optimisation? Existence check in passing — why not ints where possible.
    # TODO: minimal dtype TODO: shoudln't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars, tmp
    gc.collect()
    # Variable creation.
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    enfants_a_naitre.year = enfants_a_naitre.year.astype("float32")  # TODO: should be an integer but NaN are present
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    # naim >= 7: subtract one year — presumably birthday not yet reached; TODO confirm.
    enfants_a_naitre.loc[enfants_a_naitre.naim >= 7, 'agepf'] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""
    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")
    # Selection: keep births from October (survey year) through May (next year).
    enfants_a_naitre = enfants_a_naitre[
        (
            (enfants_a_naitre.naia == enfants_a_naitre.year) &
            (enfants_a_naitre.naim >= 10)
            ) | (
            (enfants_a_naitre.naia == enfants_a_naitre.year + 1) &
            (enfants_a_naitre.naim <= 5)
            )
        ].copy()
    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
def foyer_all(temporary_store=None, year=None):
    """Extract the cerfa-box variables of the foyer table.

    Keeps the declaration boxes (columns matching ``^_[0-9][a-z]{2}``),
    aggregates multiple declarations per individual, then unstacks the
    individualisable boxes into one row per (quifoy, noindiv).  Saves
    ``foy_ind_{year}`` and ``ind_vars_to_remove_{year}`` in the store.
    """
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Add the declaration boxes.
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table=year_specific_by_generic["foyer"])
    # Keep only the declaration boxes ('_xzz', i.e. ^_[0-9][a-z]{2}).
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # Rename '_xzz' to 'fxzz' (^f[0-9][a-z]{2}).
    renamed_variables = ["f{}".format(x[1:]) for x in variables]
    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns=dict(zip(variables, renamed_variables)), inplace=True)
    # Aggregate the declarations when an individual filed several of them.
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)
    # Individualisable variables: OpenFisca name -> cerfa boxes, in
    # (vous, conj, pac1, pac2, pac3) order.
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'hsup': ['f1au', 'f1bu', 'f1cu', 'f1du', 'f1eu'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (moving expenses) 2006 only
        }
    cases_f6_f7_f8 = build_cerfa_fields_by_column_name(year=year, sections_cerfa=[6, 7, 8])
    var_dict.update(cases_f6_f7_f8)
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(set(list(foyer.columns)))
    log.info(u"From {} variables, we keep {} eligibles variables".format(
        len(set().union(*vars_sets)),
        len(eligible_vars),
        ))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    # err = 0
    # err_vars = {}
    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [var for var, present in zip(foyer_vars, presence) if present is True]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()
        # Reshape the dataframe: one column per quifoy role, then stack.
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        # Drop zero amounts before joining.
        selection = selection[selection[individual_var] != 0].copy()
        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')
    foy_ind.reset_index(inplace=True)
    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)
    print_id(foy_ind)
    # Encode quifoy roles as integers 0..4.
    foy_ind.quifoy.loc[foy_ind.quifoy == 'vous'] = 0
    foy_ind.quifoy.loc[foy_ind.quifoy == 'conj'] = 1
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac1'] = 2
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac2'] = 3
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac3'] = 4
    assert foy_ind.quifoy.isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind
    return
def merge_tables(temporary_store=None, year=None):
    """Build the merged household (menagem) and individual (indivim) tables.

    Merges the ERF fiscal source with the EEC employment survey, checks
    consistency between the two, derives a few variables, and saves the
    results under ``menagem_{year}`` and ``indivim_{year}``.

    NOTE(review): this redefines ``merge_tables`` — an earlier definition
    with the same name exists in this file and is shadowed by this one.
    """
    # Prepare the some useful merged tables
    assert temporary_store is not None
    assert year is not None
    # load data
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table=year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table=year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table=year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table=year_specific_by_generic["eec_indivi"])
    # Consistency work between the two sources.
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()
    # NOTE(review): the mask below is built from eecind/erfind but indexes
    # eecmen — likely a bug (should presumably be eecind[...]); confirm.
    noappar_i = eecmen[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset='ident', take_last=True)
    # TODO: check that there is theoretically no duplicate
    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()
    # Merge employment survey and fiscal source.
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on=['noindiv', 'ident', 'noi'], how="inner")
    # Variables expected to carry integer values after the merge.
    var_list = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'otitc',
        'txtppb',
        ]
    check_integer_dtype(indivim, var_list)
    create_actrec_variable(indivim)
    create_variable_locataire(menagem)
    # Attach the reference person's diploma (lpr == 1) to the household table.
    menagem = menagem.merge(indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy())
    manually_remove_noindiv_errors(indivim)
    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def create_enfants_a_naitre(temporary_store=None, year=None):
    '''
    Build and store the table of children about to be born ("enfants à naître").

    Concatenates the three EEC complement tables (NN = newborns), derives a
    few variables (agepf, actrec, quelfic, ...) and keeps only births from
    October of the survey year through May of the following year.  The result
    is saved under "enfants_a_naitre_{year}" in the temporary store.
    '''
    assert temporary_store is not None
    assert year is not None
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Variables fetched from each complement table.
    individual_vars = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'forter',
        'ident',
        'lien',
        'lpr',
        'mrec',
        'naia',
        'naim',
        'noi',
        'noicon',
        'noimer',
        'noindiv',
        'noiper',
        'retrai',
        'rga',
        'rstg',
        'sexe',
        'stc',
        'titc',
        ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Fetch the three EEC complement tables, then outer-merge them together.
    complements = [
        survey.get_values(
            table=year_specific_by_generic[generic_name],
            variables=individual_vars,
            )
        for generic_name in ("eec_cmp_1", "eec_cmp_2", "eec_cmp_3")
        ]
    enfants_a_naitre = complements[0].merge(complements[1], how="outer").merge(
        complements[2], how="outer")
    # TODO: minimal dtype TODO: shoudln't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del complements, individual_vars
    gc.collect()
    # Derived variables.
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    # TODO: should be an integer but NaN are present
    enfants_a_naitre.year = enfants_a_naitre.year.astype("float32")
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    # naim >= 7: subtract one year — presumably birthday not yet reached; TODO confirm.
    enfants_a_naitre.loc[enfants_a_naitre.naim >= 7, 'agepf'] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""
    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")
    # Selection: births in Oct-Dec of the survey year, or Jan-May of the next.
    born_end_of_year = (
        (enfants_a_naitre.naia == enfants_a_naitre.year)
        & (enfants_a_naitre.naim >= 10)
        )
    born_start_of_next_year = (
        (enfants_a_naitre.naia == enfants_a_naitre.year + 1)
        & (enfants_a_naitre.naim <= 5)
        )
    enfants_a_naitre = enfants_a_naitre[born_end_of_year | born_start_of_next_year].copy()
    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
def sif(temporary_store = None, year = None):
    '''
    Extract and decode the SIF code of the foyer (tax unit) table.

    The ``sif`` field is a fixed-width string packing marital status, birth
    years, checkbox cases and event dates.  Its layout shifts between survey
    vintages, which is handled with the year-dependent offset ``d``.  The
    decoded DataFrame is stored in `temporary_store` under ``'sif_{year}'``.

    Fixes over the previous version: chained-indexing assignments
    (``sif.sif.loc[...] = ...`` / ``sif.statmarit.loc[...] = ...``) are
    replaced with ``sif.loc[mask, column] = ...`` — the chained form may
    silently write to a copy in pandas — and the unused local ``x`` is gone.
    '''
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    erfs_survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info("05_foyer: extraction des données foyer")
    # TODO: how to pick the year n - 2 rfr for the TH exemption?
    # mnrvka: TH income of year n - 2
    # mnrvkh: TH income (reference tax income)
    #
    # Fetch the variables carrying the sif code
    sif = erfs_survey.get_values(
        variables = ["noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg", "declar"],
        table = year_specific_by_generic["foyer"]
        )
    sif['statmarit'] = 0

    if year == 2009:
        # Repair two malformed sif strings specific to the 2009 vintage
        mask = sif['noindiv'] == 901803201
        old_sif = sif.loc[mask, 'sif']
        sif.loc[mask, 'sif'] = (old_sif.str[0:59] + old_sif.str[60:] + "0").values
        mask = sif['noindiv'] == 900872201
        old_sif = sif.loc[mask, 'sif']
        sif.loc[mask, 'sif'] = (old_sif.str[0:58] + " " + old_sif.str[58:]).values
        del old_sif, mask

    # Apply the sign carried by tsrvbg to the gross taxable income
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) - (sif["tsrvbg"] == '-').astype(int))

    sif["stamar"] = sif.sif.str[4:5]

    # Converting marital status
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif.loc[sif.stamar == key, 'statmarit'] = val

    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"

    # d is the year-dependent shift applied to all subsequent positions
    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"
    elif year == 2008:
        d = - 1  # caseL disappears this year
        sif["caseP"] = sif.sif.str[20 + d: 21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d: 22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d: 23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d: 24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d: 28 + d]
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"
    elif year == 2009:
        sif["caseL"] = sif.sif.str[19: 20] == "L"
        sif["caseP"] = sif.sif.str[20: 21] == "P"
        sif["caseS"] = sif.sif.str[21: 22] == "S"
        sif["caseW"] = sif.sif.str[22: 23] == "W"
        sif["caseN"] = sif.sif.str[23: 24] == "N"
        # caseH is gone relative to 2008 (but caseL is back),
        # hence a -4 shift relative to 2006
        d = -4
        sif["caseT"] = sif.sif.str[28 + d: 29 + d] == "T"

    sif["caseX"] = sif.sif.str[33 + d: 34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d: 42 + d]
    sif["caseY"] = sif.sif.str[42 + d: 43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d: 51 + d]
    sif["caseZ"] = sif.sif.str[51 + d: 52 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d: 60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d: 61 + d]
    # TODO: convert dateXYZ to appropriate date in pandas

    sif["nbptr"] = sif.nbptr.values / 100  # nbptr is stored in hundredths
    sif["rfr_n_2"] = sif.mnrvka.values

    # Counts of dependents per declaration case
    sif["nbF"] = sif.sif.str[64 + d: 66 + d]
    sif["nbG"] = sif.sif.str[67 + d: 69 + d]
    sif["nbR"] = sif.sif.str[70 + d: 72 + d]
    sif["nbJ"] = sif.sif.str[73 + d: 75 + d]
    sif["nbN"] = sif.sif.str[76 + d: 78 + d]
    sif["nbH"] = sif.sif.str[79 + d: 81 + d]
    sif["nbI"] = sif.sif.str[82 + d: 84 + d]
    if year != 2009:
        sif["nbP"] = sif.sif.str[85 + d: 87 + d]

    del sif["stamar"]

    # Flag individuals appearing in several declarations
    duplicated_noindiv = sif.noindiv[sif.noindiv.duplicated()].copy()
    sif['duplicated_noindiv'] = sif.noindiv.isin(duplicated_noindiv)
    sif['change'] = "NONE"
    sif.loc[sif.duplicated_noindiv, 'change'] = sif.loc[sif.duplicated_noindiv, 'declar'].str[27:28]
    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of duplicated individuals: {}".format(len(duplicated_noindiv)))
    log.info("Number of distinct individuals: {}".format(len(sif.noindiv.value_counts())))
    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def foyer_all(temporary_store = None, year = None):
    '''
    Individualize the foyer (tax-declaration) table.

    Reads the foyer table, keeps only declaration-case columns (``_xzz``
    pattern, renamed to ``fxzz``), aggregates multiple declarations per
    individual, then reshapes the per-role case columns (vous/conj/pac1-3)
    into one row per (individual, role).  The result is stored under
    ``'foy_ind_{year}'`` and the individualized variable names under
    ``'ind_vars_to_remove_{year}'``.

    Fixes over the previous version: missing `temporary_store`/`year` guards
    (the sibling steps assert them); ``present is True`` anti-idiom; chained
    ``foy_ind.quifoy.loc[...] = n`` assignments replaced by
    ``foy_ind.loc[mask, 'quifoy'] = n`` (the chained form may write to a
    copy in pandas).
    '''
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Add the declaration cases
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table = year_specific_by_generic["foyer"])
    # Keep only the declaration cases ('_xzz' i.e. "^_[0-9][a-z]{2}")
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # Rename variables to fxzz i.e. "^f[0-9][a-z]{2}"
    renamed_variables = ["f{}".format(x[1:]) for x in variables]
    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns = dict(zip(variables, renamed_variables)), inplace = True)

    # Aggregate declarations when an individual filed several of them
    foyer = foyer.groupby("noindiv", as_index = False).aggregate(numpy.sum)
    print_id(foyer)

    # Variables that can be individualized: mapping from the individual-level
    # variable name to the per-role declaration cases (vous, conj, pac1-3)
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'hsup': ['f1au', 'f1bu', 'f1cu', 'f1du', 'f1eu'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (moving) 2006 only
        }
    cases_f6_f7_f8 = build_cerfa_fields_by_column_name(year = year, sections_cerfa = [6, 7, 8])
    var_dict.update(cases_f6_f7_f8)

    vars_sets = [set(var_list) for var_list in var_dict.values()]
    eligible_vars = (set().union(*vars_sets)).intersection(set(list(foyer.columns)))
    log.info(
        u"From {} variables, we keep {} eligibles variables".format(
            len(set().union(*vars_sets)),
            len(eligible_vars),
            )
        )

    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    foy_ind = DataFrame()
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Test if at least one variable of foyer_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            # Shrink the list to the columns actually present
            foyer_vars_cleaned = [var for var, present in zip(foyer_vars, presence) if present]
            selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()

        # Reshape the dataframe to one row per (quifoy role, individual)
        selection.rename(columns = dict(zip(foyer_vars, qui)), inplace = True)
        selection.set_index("noindiv", inplace = True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()
        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis = 1, join = 'outer')

    foy_ind.reset_index(inplace = True)
    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns = {"noindiv": "idfoy"}, inplace = True)
    print_id(foy_ind)

    # Encode quifoy roles as integers (0 = vous ... 4 = pac3)
    foy_ind.loc[foy_ind.quifoy == 'vous', 'quifoy'] = 0
    foy_ind.loc[foy_ind.quifoy == 'conj', 'quifoy'] = 1
    foy_ind.loc[foy_ind.quifoy == 'pac1', 'quifoy'] = 2
    foy_ind.loc[foy_ind.quifoy == 'pac2', 'quifoy'] = 3
    foy_ind.loc[foy_ind.quifoy == 'pac3', 'quifoy'] = 4
    assert foy_ind.quifoy.isin(range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind
    return