def get_input_data_frame(year):
    """Return the "input" table of the eipp survey for *year*, with a flat index."""
    collection = SurveyCollection.load(collection="eipp")
    survey = collection.surveys["eipp_data_{}".format(year)]
    data_frame = survey.get_values(table="input")
    data_frame.reset_index(inplace=True)
    return data_frame
def build_merged_dataframes(temporary_store = None, year = None):
    """Load the ERFS-FPR and EEC tables for *year*, merge them and persist the result.

    Stores the merged ``menages`` and ``individus`` frames in *temporary_store*
    under ``menages_{year}`` and ``individus_{year}_post_01``.
    """
    assert temporary_store is not None
    assert year is not None
    log.debug("Chargement des tables des enquêtes")
    collection = SurveyCollection.load(collection = "erfs_fpr")
    short_year = str(year)[-2:]  # e.g. "12" for 2012
    survey = collection.get_survey(f"erfs_fpr_{year}")
    eec_menage = survey.get_values(table = f"fpr_mrf{short_year}e{short_year}t4")
    eec_individu = survey.get_values(table = f"fpr_irf{short_year}e{short_year}t4")
    # Some vintages ship "retropole" variants of the FPR tables.
    suffix = "_retropole" if year in [2012] else ""
    fpr_individu = survey.get_values(table = f"fpr_indiv_{year}{suffix}")
    fpr_menage = survey.get_values(table = f"fpr_menage_{year}{suffix}")
    individus, menages = merge_tables(fpr_menage, eec_menage, eec_individu, fpr_individu, year)
    temporary_store[f"menages_{year}"] = menages
    # Free the large intermediate frames before the next store.
    del eec_menage, fpr_menage, menages
    gc.collect()
    temporary_store[f"individus_{year}_post_01"] = individus
    del eec_individu, fpr_individu
def store_input_data_frame(data_frame = None, collection = None, survey = None, table = None):
    """Insert *data_frame* as *table* of survey *survey* in *collection* and dump its metadata.

    Creates the collection and/or the survey when they do not exist yet.
    *table* defaults to "input".
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(collection = collection)
    except Exception:
        # The collection is not registered yet: start a fresh one.
        openfisca_survey_collection = SurveyCollection(name = collection)
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    if table is None:
        table = "input"
    # Bug fix: this assignment was commented out, leaving survey_name undefined
    # (NameError) on every call.
    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    available_survey_names = [survey_.name for survey_ in openfisca_survey_collection.surveys]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
        # Only register the survey when it was not already part of the collection,
        # to avoid duplicate entries in the surveys list.
        openfisca_survey_collection.surveys.append(survey)
    survey.insert_table(name = table, data_frame = data_frame)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, '{}.json'.format(collection))
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def get_input_data_frame(year):
    """Return the "input" table of the indirect-taxation survey for *year* with a flat index."""
    survey_collection = SurveyCollection.load(
        collection = "openfisca_indirect_taxation",
        config_files_directory = config_files_directory)
    survey = survey_collection.get_survey("openfisca_indirect_taxation_data_{}".format(year))
    data_frame = survey.get_values(table = "input")
    data_frame.reset_index(inplace = True)
    return data_frame
def get_input_data_frame(year):
    """Return the "input" table of the indirect-taxation survey for *year* with a flat index."""
    survey_collection = SurveyCollection.load(collection="openfisca_indirect_taxation")
    survey = survey_collection.get_survey(
        "openfisca_indirect_taxation_data_{}".format(year))
    data_frame = survey.get_values(table="input")
    data_frame.reset_index(inplace=True)
    return data_frame
def build_erf_data_frames(self):
    """Fetch the requested variables from the ERF/EEC tables and build per-entity frames.

    Fills ``self.erf_data_frame_by_entity_key_plural`` with a "menages" frame and
    an "individus" frame (merge of the ERF and EEC individual tables).
    NOTE(review): Python 2 code (`iteritems`); `year` is read from the enclosing
    scope, not from self — confirm where it is defined.
    """
    # TODO: remove this
    self.columns_to_fetch = ['af']
    variables = self.columns_to_fetch
    erf_survey_collection = SurveyCollection.load(
        collection = "erfs", config_files_directory = config_files_directory)
    erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year))
    # Map generic table names to their year-specific names, and build the inverse map.
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    generic_by_year_specific = dict(zip(year_specific_by_generic.values(), year_specific_by_generic.keys()))
    # Always fetch the identifier/weight/status columns on top of the requested ones.
    erf_variables = list(set(variables + ["ident", "wprm", "quelfic", "noi"]))
    of2erf = get_of2erf()
    # Translate OpenFisca variable names into their ERF counterparts when a mapping exists.
    for index, variable in enumerate(erf_variables):
        if variable in of2erf:
            erf_variables[index] = of2erf[variable]
    data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None)
    erf_variables_by_generic_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = [])
    # For each variable, the year-specific tables (among the three above) that contain it.
    year_specific_tables_by_erf_variable = dict(
        [
            (
                erf_variable,
                set(
                    erf_survey.find_tables(variable = erf_variable)
                    ).intersection(
                    set([year_specific_by_generic[key] for key in erf_variables_by_generic_table.keys()])
                    )
                ) for erf_variable in erf_variables
            ]
        )
    for variable, year_specific_tables in year_specific_tables_by_erf_variable.iteritems():
        if len(year_specific_tables) < 1:
            log.info("No tables are present for variable {}".format(variable))
            continue
        else:
            log.info("Variable {} is present in multiple tables : {}".format(variable, year_specific_tables))
            for table in year_specific_tables:
                log.info("Variable {} is retrieved from table {}".format(variable, table))
                erf_variables_by_generic_table[generic_by_year_specific[table]].append(variable)
    erf2of = get_erf2of()
    # Fetch each table's variables, then rename columns back to OpenFisca conventions.
    for table, erf_variables in erf_variables_by_generic_table.iteritems():
        if erf_variables:
            data_frame_by_table[table] = erf_survey.get_values(
                variables = erf_variables, table = year_specific_by_generic[table]
                )
            data_frame_by_table[table].rename(columns = erf2of, inplace = True)
            data_frame_by_table[table].rename(columns = {'ident': 'idmen'}, inplace = True)
    assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
    self.erf_data_frame_by_entity_key_plural = dict(
        menages = data_frame_by_table["erf_menage"],
        individus = data_frame_by_table["erf_indivi"].merge(data_frame_by_table["eec_indivi"])
        )
def build_homogeneisation_vehicules(temporary_store = None, year = None):
    assert temporary_store is not None
    """Compute vehicule numbers by type"""
    # NOTE(review): the string above is a statement, not a docstring, because it
    # follows the first assert; it documents the function's purpose nonetheless.
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))
    if year == 1995:
        # The 1995 BdF survey has no information on the fuel type used by vehicles.
        vehicule = None
    if year == 2000:
        vehicule = survey.get_values(table = "depmen")
        kept_variables = ['ident', 'carbu01', 'carbu02']
        vehicule = vehicule[kept_variables]
        vehicule.rename(columns = {'ident': 'ident_men'}, inplace = True)
        vehicule.rename(columns = {'carbu01': 'carbu1'}, inplace = True)
        vehicule.rename(columns = {'carbu02': 'carbu2'}, inplace = True)
        vehicule["veh_tot"] = 1
        # Count up to two vehicles per household: fuel code 1 = petrol, 2 = diesel.
        vehicule["veh_essence"] = 1 * (vehicule['carbu1'] == 1) + 1 * (vehicule['carbu2'] == 1)
        vehicule["veh_diesel"] = 1 * (vehicule['carbu1'] == 2) + 1 * (vehicule['carbu2'] == 2)
        # Align the index dtype with the other years' household identifier.
        vehicule.index = vehicule.index.astype(ident_men_dtype)
    if year == 2005:
        vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_men', 'carbu']
        vehicule = vehicule[kept_variables]
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)
    if year == 2011:
        # Table name casing differs across sources; fall back to lowercase.
        # NOTE(review): bare except — would also hide unrelated errors.
        try:
            vehicule = survey.get_values(table = "AUTOMOBILE")
        except:
            vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_me', 'carbu']
        vehicule = vehicule[kept_variables]
        vehicule.rename(columns = {'ident_me': 'ident_men'}, inplace = True)
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)
    # Compute the number of cars by category and save
    if year != 1995:
        # Aggregate the per-vehicle rows into per-household counts.
        vehicule = vehicule.groupby(by = 'ident_men')["veh_tot", "veh_essence", "veh_diesel"].sum()
        vehicule["pourcentage_vehicule_essence"] = 0
        # Share of petrol vehicles, only where the household owns at least one vehicle.
        # NOTE(review): chained .loc assignment on a column attribute — may not write
        # through on modern pandas; confirm against the pandas version in use.
        vehicule.pourcentage_vehicule_essence.loc[vehicule.veh_tot != 0] = vehicule.veh_essence / vehicule.veh_tot
    # Save in temporary store
    temporary_store['automobile_{}'.format(year)] = vehicule
def get_input_data_frame(year):
    """Return the "input" table of the openfisca survey for *year*.

    Columns sali/choi/rsti are renamed to sal/cho/rst and the index is reset.
    """
    survey_collection = SurveyCollection.load(
        collection = "openfisca", config_files_directory = config_files_directory)
    survey = survey_collection.get_survey("openfisca_data_{}".format(year))
    data_frame = survey.get_values(table = "input")
    column_renames = dict(sali = 'sal', choi = 'cho', rsti = 'rst')
    data_frame.rename(columns = column_renames, inplace = True)
    data_frame.reset_index(inplace = True)
    return data_frame
def run_all(year_calage = 2007, year_data_list = [1995, 2000, 2005, 2011]):
    """Run the whole indirect-taxation data pipeline and register the result as a survey.

    NOTE(review): mutable default argument for year_data_list; harmless here since
    it is never mutated, but fragile.
    """
    # Which source database should be used for the calibration?
    year_data = find_nearest_inferior(year_data_list, year_calage)

    # Four parallel homogenisation steps on the source data:
    # Consumption expenditures:
    build_depenses_homogenisees(year = year_data)
    build_imputation_loyers_proprietaires(year = year_data)
    build_depenses_calees(year_calage, year_data)
    build_menage_consumption_by_categorie_fiscale(year_calage, year_data)
    categorie_fiscale_data_frame = temporary_store["menage_consumption_by_categorie_fiscale_{}".format(year_calage)]
    depenses_calees_by_grosposte = temporary_store["depenses_calees_by_grosposte_{}".format(year_calage)]

    # Vehicles:
    build_homogeneisation_vehicules(year = year_data)
    vehicule = temporary_store['automobile_{}'.format(year_data)]

    # Socio-demographic variables:
    build_homogeneisation_caracteristiques_sociales(year = year_data)
    menage = temporary_store['donnes_socio_demog_{}'.format(year_data)]

    # Income variables:
    build_homogeneisation_revenus_menages(year = year_data)
    revenus = temporary_store["revenus_{}".format(year_data)]

    # DataFrame resulting from these four steps, concatenated column-wise.
    data_frame = pandas.concat(
        [revenus, vehicule, categorie_fiscale_data_frame, menage, depenses_calees_by_grosposte],
        axis = 1)
    data_frame.index.name = "ident_men"
    data_frame.reset_index(inplace = True)

    # Remove duplicated colums causing bug with HDFStore
    # according to https://github.com/pydata/pandas/issues/6240
    # using solution form stackoverflow
    # http://stackoverflow.com/questions/16938441/how-to-remove-duplicate-columns-from-a-dataframe-using-python-pandas
    data_frame = data_frame.T.groupby(level = 0).first().T

    # Saving the data_frame as the "input" table of a new survey in the collection.
    openfisca_survey_collection = SurveyCollection.load(
        collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage)
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    openfisca_survey_collection.dump()
def build_survey_collection(name=None, erase_collection_json=False, overwrite_surveys=False,
        data_directory_path_by_year=None, source_format='sas'):
    # Build (or update) a survey collection from yearly data directories, dump its
    # JSON metadata and fill the HDF5 stores. Python 2 code (print statement,
    # iteritems, ConfigParser).
    assert name is not None
    assert data_directory_path_by_year is not None
    years = data_directory_path_by_year.keys()
    if years is None:
        # NOTE(review): dict.keys() is never None, so this guard can never fire;
        # it presumably meant to test for an empty mapping.
        log.error("A list of years to process is needed")
    if erase_collection_json:
        # Start from a blank collection, discarding any existing metadata.
        survey_collection = SurveyCollection(
            name=name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name=name, config_files_directory=config_files_directory)
    for year, data_directory_path in data_directory_path_by_year.iteritems():
        if not os.path.isdir(data_directory_path):
            # Relative path: resolve it against the configured input directory.
            input_data_directory = survey_collection.config.get(
                'data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
            assert os.path.isdir(input_data_directory)
        data_file_by_format = create_data_file_by_format(data_directory_path)
        print data_file_by_format
        survey_name = '{}_{}'.format(name, year)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format[source_format],
            )
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        survey_collection.dump(json_file_path=collection_json_path)
        # Only fill the HDF5 store for the surveys of the current year.
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(year))
            ]
        survey_collection.fill_hdf(source_format=source_format, surveys=surveys, overwrite=overwrite_surveys)
    return survey_collection
def create(cls, year = None, rebuild_input_data = False):
    """Instantiate the class from the ERFS-FPR input table for *year*.

    When *rebuild_input_data* is True, the input data is regenerated first.
    """
    assert year is not None
    if rebuild_input_data:
        cls.build_input_data(year = year)
    collection = SurveyCollection.load(
        collection = "openfisca", config_files_directory = config_files_directory)
    survey = collection.get_survey("openfisca_erfs_fpr_data_{}".format(year))
    data_frame = survey.get_values(table = "input").reset_index(drop = True)
    return cls().init_from_data_frame(input_data_frame = data_frame, year = year)
def get_input_data_frame(year):
    """Return the "input" table of the openfisca survey for *year*.

    Columns sali/choi/rsti are renamed to sal/cho/rst and the index is reset.
    """
    survey_collection = SurveyCollection.load(
        collection="openfisca", config_files_directory=config_files_directory)
    survey = survey_collection.get_survey("openfisca_data_{}".format(year))
    data_frame = survey.get_values(table="input")
    data_frame.rename(columns=dict(sali='sal', choi='cho', rsti='rst'), inplace=True)
    data_frame.reset_index(inplace=True)
    return data_frame
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name,
        survey_label = None, table_label = None, table_name = None):
    """Store *input_dataframe* as a table of *survey_name* in *collection* and dump the metadata.

    Table name defaults to "<entity>_<period>"; creates the collection, the survey
    and the HDF5 output directory when they do not exist yet. Any previous survey
    with the same name is replaced in the collection.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        # Collection not registered in the config: create a fresh one.
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:
        # For tests: no config sections at all, fall back to the packaged test data files.
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )
    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # get_survey asserts existence; build the survey when it is missing.
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )
    if survey.hdf5_file_path is None:
        # Derive the HDF5 path from the configured output directory, creating it if needed.
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            log.warn("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')
    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any pre-existing survey of the same name, then register the new one.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def build_survey_collection(
        config_files_directory: str,
        collection_name = None,
        replace_metadata = False,
        replace_data = False,
        data_directory_path_by_survey_suffix = None,
        source_format = 'sas',
        ):
    """Build (or update) a survey collection from per-survey data directories.

    For each (survey_suffix, directory) pair: register the data files, dump the
    collection's JSON metadata and fill the HDF5 store for that survey.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    surveys_name = list(data_directory_path_by_survey_suffix.keys())
    # NOTE(review): a list is never None — this assert cannot fire; it presumably
    # meant `assert surveys_name` to reject an empty mapping.
    assert surveys_name is not None, "A list of surveys to process is needed"
    if replace_metadata:
        # Start from a blank collection, discarding any existing metadata.
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except configparser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), '{} is not a valid directory path'.format(data_directory_path)
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format.get('sas'),
            stata_files = data_file_by_format.get('stata'),
            )
        # Detect which formats actually have files and prefer the first one.
        # NOTE(review): this overwrites the source_format argument on every iteration.
        valid_source_format = [
            _format for _format in list(data_file_by_format.keys()) if data_file_by_format.get((_format))
            ]
        log.info("Valid source formats are: {}".format(valid_source_format))
        source_format = valid_source_format[0]
        log.info("Using the following format: {}".format(source_format))
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
        collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))
        survey_collection.dump(json_file_path = collection_json_path)
        # Only fill the HDF5 store for the surveys matching the current suffix.
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def build_bdf_survey_collection(years=None, erase=False, overwrite=False):
    # Build (or update) the "budget_des_familles" survey collection for the given
    # years, dump its JSON metadata and fill the HDF5 stores from Stata files.
    # Python 2 code (ConfigParser module name).
    if years is None:
        log.error("A list of years to process is needed")
    if erase:
        # Start from a blank collection, discarding any existing metadata.
        bdf_survey_collection = SurveyCollection(
            name="budget_des_familles", config_files_directory=config_files_directory)
    else:
        try:
            bdf_survey_collection = SurveyCollection.load(
                collection='budget_des_familles', config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            bdf_survey_collection = SurveyCollection(
                name="budget_des_familles", config_files_directory=config_files_directory)
    input_data_directory = bdf_survey_collection.config.get(
        'data', 'input_directory')
    # Developer-specific layout of the raw data tree.
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(
            os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)
    for year in years:
        data_directory_path = os.path.join(
            input_data_directory, 'budget_des_familles/{}'.format(year))
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'budget_des_familles_{}'.format(year)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=bdf_survey_collection,
            stata_files=data_file_by_format['stata'],
            )
        collections_directory = bdf_survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "budget_des_familles" + ".json")
        bdf_survey_collection.dump(json_file_path=collection_json_path)
        # Only fill the HDF5 store for the surveys of the current year.
        surveys = [
            survey for survey in bdf_survey_collection.surveys
            if survey.name.endswith(str(year))
            ]
        bdf_survey_collection.fill_hdf(source_format='stata', surveys=surveys, overwrite=overwrite)
    return bdf_survey_collection
def get_input_data_frame(year):
    """Return the "input" table of the openfisca survey for *year*.

    Short ERF column names are renamed to the long OpenFisca variable names and
    the index is dropped.
    """
    collection = SurveyCollection.load(collection = "openfisca")
    survey = collection.get_survey("openfisca_data_{}".format(year))
    data_frame = survey.get_values(table = "input").reset_index(drop = True)
    column_renames = {
        'alr': 'pensions_alimentaires_percues',
        'choi': 'chomage_imposable',
        'cho_ld': 'chomeur_longue_duree',
        'fra': 'frais_reels',
        'rsti': 'retraite_imposable',
        'sali': 'salaire_imposable',
        }
    data_frame.rename(columns = column_renames, inplace = True)
    return data_frame
def build_merged_dataframes(temporary_store = None, year = None):
    """Load the ERFS-FPR and EEC tables for *year*, merge them and persist the result.

    Stores the merged ``menages`` and ``individus`` frames in *temporary_store*
    under ``menages_{year}`` and ``individus_{year}_post_01``.
    """
    assert temporary_store is not None
    assert year is not None
    log.debug("Chargement des tables des enquêtes")
    collection = SurveyCollection.load(collection = 'erfs_fpr')
    short_year = str(year)[-2:]  # e.g. "12" for 2012
    survey = collection.get_survey('erfs_fpr_{}'.format(year))
    fpr_menage = survey.get_values(table = 'fpr_menage_{}_retropole'.format(year))
    eec_menage = survey.get_values(table = 'fpr_mrf{0}e{0}t4'.format(short_year))
    eec_individu = survey.get_values(table = 'fpr_irf{0}e{0}t4'.format(short_year))
    fpr_individu = survey.get_values(table = 'fpr_indiv_{}_retropole'.format(year))
    individus, menages = merge_tables(fpr_menage, eec_menage, eec_individu, fpr_individu, year)
    temporary_store['menages_{}'.format(year)] = menages
    # Free the large intermediate frames before the next store.
    del eec_menage, fpr_menage, menages
    gc.collect()
    temporary_store['individus_{}_post_01'.format(year)] = individus
    del eec_individu, fpr_individu
def get_input_data_frame(year):
    """Return the "input" table of the openfisca survey for *year*.

    Short ERF column names are renamed to the long OpenFisca variable names and
    the index is dropped.
    """
    collection = SurveyCollection.load(collection="openfisca")
    survey = collection.get_survey("openfisca_data_{}".format(year))
    data_frame = survey.get_values(table="input").reset_index(drop=True)
    column_renames = {
        'alr': 'pensions_alimentaires_percues',
        'choi': 'chomage_imposable',
        'cho_ld': 'chomeur_longue_duree',
        'fra': 'frais_reels',
        'rsti': 'retraite_imposable',
        'sali': 'salaire_imposable',
        }
    data_frame.rename(columns=column_renames, inplace=True)
    return data_frame
def build_other_menage_variables(year = None):
    """Build menage consumption by categorie fiscale dataframe."""
    assert year is not None
    # Load data
    collection = SurveyCollection.load(
        collection = 'budget_des_familles',
        config_files_directory = config_files_directory
        )
    survey = collection.get_survey('budget_des_familles_{}'.format(year))
    c05d = survey.get_values(table = "c05d")[[u'ident_men', u'pondmen']]
    menage_columns = [u'ident_men', u'pondmen', u'revtot', u'revtotuc', u'decuc']
    menage = survey.get_values(table = "menage")[menage_columns]
    # Join household income variables with the c05d weights on the shared keys.
    return menage.merge(c05d, copy = True)
def build_erfs_survey_collection(years = None, erase = False, overwrite = False):
    # Build (or update) the "erfs" survey collection for the given years, dump its
    # JSON metadata and fill the HDF5 stores from SAS files.
    # Python 2 code (ConfigParser module name).
    if years is None:
        log.error("A list of years to process is needed")
    if erase:
        # Start from a blank collection, discarding any existing metadata.
        erfs_survey_collection = SurveyCollection(
            name = "erfs", config_files_directory = config_files_directory)
    else:
        try:
            erfs_survey_collection = SurveyCollection.load(
                collection = 'erfs', config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            erfs_survey_collection = SurveyCollection(
                name = "erfs", config_files_directory = config_files_directory)
    input_data_directory = erfs_survey_collection.config.get('data', 'input_directory')
    # Developer-specific layout of the raw data tree.
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)
    for year in years:
        data_directory_path = os.path.join(
            input_data_directory, 'ERF/ERFS_{}'.format(year)
            )
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'erfs_{}'.format(year)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = erfs_survey_collection,
            sas_files = data_file_by_format['sas'],
            )
        collections_directory = erfs_survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "erfs" + ".json")
        erfs_survey_collection.dump(json_file_path = collection_json_path)
        # Only fill the HDF5 store for the surveys of the current year.
        surveys = [survey for survey in erfs_survey_collection.surveys if survey.name.endswith(str(year))]
        erfs_survey_collection.fill_hdf(source_format = 'sas', surveys = surveys, overwrite = overwrite)
    return erfs_survey_collection
def build_homogeneisation_vehicules(year = None):
    """Compute vehicule numbers by type"""
    # Aggregates per-household counts of vehicles (total / petrol / diesel) and
    # stores them under 'automobile_{year}' in the module-level temporary_store.
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))
    if year == 2000:
        vehicule = survey.get_values(table = "depmen")
        kept_variables = ['ident', 'carbu01', 'carbu02']
        vehicule = vehicule[kept_variables]
        vehicule.rename(columns = {'ident': 'ident_men'}, inplace = True)
        vehicule.rename(columns = {'carbu01': 'carbu1'}, inplace = True)
        vehicule.rename(columns = {'carbu02': 'carbu2'}, inplace = True)
        # NOTE(review): for 2000 the totals and fuel codes are string-typed
        # ('1'/'2'), unlike the integer codes used for 2005/2011 — presumably
        # matching this table's dtype; confirm against the source data.
        vehicule["veh_tot"] = '1'
        vehicule["veh_essence"] = 1*(vehicule['carbu1'] == '1') + 1*(vehicule['carbu2'] == '1')
        vehicule["veh_diesel"] = 1*(vehicule['carbu1'] == '2') + 1*(vehicule['carbu2'] == '2')
    if year == 2005:
        vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_men', 'carbu']
        vehicule = vehicule[kept_variables]
        vehicule["veh_tot"] = 1
        # Fuel code 1 = petrol, 2 = diesel.
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)
    if year == 2011:
        vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_me', 'carbu']
        vehicule = vehicule[kept_variables]
        vehicule.rename(columns = {'ident_me': 'ident_men'}, inplace = True)
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)
    # Compute the number of cars by category
    # NOTE(review): tuple-style column selection after groupby is an old-pandas
    # idiom, removed in recent versions.
    vehicule = vehicule.groupby(by = 'ident_men')["veh_tot", "veh_essence", "veh_diesel"].sum()
    # Save in temporary store
    temporary_store['automobile_{}'.format(year)] = vehicule
def build_clean_aliss_data_frame():
    # Load the 2011 ALISS table and decode the 16 combined household types into
    # separate 'age' (0-3) and 'revenus' (0-3) codes. Python 2 code (print statement).
    year = 2011
    aliss_survey_collection = SurveyCollection.load(
        collection = 'aliss', config_files_directory = config_files_directory
        )
    survey = aliss_survey_collection.get_survey('aliss_{}'.format(year))
    aliss = survey.get_values(table = 'Base_ALISS_2011')
    # Sentinel value 99 marks rows not yet assigned a code.
    aliss['age'] = 99
    aliss['revenus'] = 99
    # (type-label prefix, age code, income code)
    triplets = [
        ('1 : Jeune/Ais', 0, 3),
        ('2 : Jeune/MoyenSup', 0, 2),
        ('3 : Jeune/MoyenInf', 0, 1),
        ('4 : Jeune/Modeste', 0, 0),
        ('5 : Age Moyen/Ais', 1, 3),
        ('6 : Age Moyen/MoyenSup', 1, 2),
        ('7 : Age Moyen/MoyenInf', 1, 1),
        ('8 : Age Moyen/Modeste', 1, 0),
        ('9 : Age Sup/Ais', 2, 3),
        ('10 : Age Sup/MoyenSup', 2, 2),
        ('11 : Age Sup/MoyenInf', 2, 1),
        ('12 : Age Sup/Modeste', 2, 0),
        ('13 : Vieux/Ais', 3, 3),
        ('14 : Vieux/MoyenSup', 3, 2),
        ('15 : Vieux/MoyenInf', 3, 1),
        ('16 : Vieux/Modeste', 3, 0),
        ]
    for household_type, age, revenus in triplets:
        print household_type, age, revenus
        selection = aliss.type.str.startswith(household_type)
        aliss.loc[selection, 'age'] = age
        aliss.loc[selection, 'revenus'] = revenus
    # Every row must have been matched by exactly one type prefix.
    assert aliss.age.isin(range(4)).all()
    assert aliss.revenus.isin(range(4)).all()
    del aliss['type']
    return aliss
def build_survey_collection(name = None, erase_collection_json = False, overwrite_surveys = False,
        data_directory_path_by_year = None, source_format = 'sas'):
    # Build (or update) a survey collection from yearly data directories, dump its
    # JSON metadata and fill the HDF5 stores. Python 2 code (print statement,
    # iteritems, ConfigParser).
    assert name is not None
    assert data_directory_path_by_year is not None
    years = data_directory_path_by_year.keys()
    if years is None:
        # NOTE(review): dict.keys() is never None, so this guard can never fire;
        # it presumably meant to test for an empty mapping.
        log.error("A list of years to process is needed")
    if erase_collection_json:
        # Start from a blank collection, discarding any existing metadata.
        survey_collection = SurveyCollection(
            name = name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name = name, config_files_directory = config_files_directory)
    for year, data_directory_path in data_directory_path_by_year.iteritems():
        if not os.path.isdir(data_directory_path):
            # Relative path: resolve it against the configured input directory.
            input_data_directory = survey_collection.config.get('data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
            assert os.path.isdir(input_data_directory)
        data_file_by_format = create_data_file_by_format(data_directory_path)
        print data_file_by_format
        survey_name = '{}_{}'.format(name, year)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format[source_format],
            )
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        survey_collection.dump(json_file_path = collection_json_path)
        # Only fill the HDF5 store for the surveys of the current year.
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(year))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = overwrite_surveys)
    return survey_collection
def build_merged_dataframes(temporary_store=None, year=None):
    """Load the ERFS-FPR and EEC tables for *year*, merge them and persist the result.

    Stores the merged ``menages`` and ``individus`` frames in *temporary_store*
    under ``menages_{year}`` and ``individus_{year}_post_01``.
    """
    assert temporary_store is not None
    assert year is not None
    log.debug("Chargement des tables des enquêtes")
    collection = SurveyCollection.load(collection='erfs_fpr')
    short_year = str(year)[-2:]  # e.g. "12" for 2012
    survey = collection.get_survey('erfs_fpr_{}'.format(year))
    fpr_menage = survey.get_values(table='fpr_menage_{}_retropole'.format(year))
    eec_menage = survey.get_values(table='fpr_mrf{0}e{0}t4'.format(short_year))
    eec_individu = survey.get_values(table='fpr_irf{0}e{0}t4'.format(short_year))
    fpr_individu = survey.get_values(table='fpr_indiv_{}_retropole'.format(year))
    individus, menages = merge_tables(fpr_menage, eec_menage, eec_individu, fpr_individu, year)
    temporary_store['menages_{}'.format(year)] = menages
    # Free the large intermediate frames before the next store.
    del eec_menage, fpr_menage, menages
    gc.collect()
    temporary_store['individus_{}_post_01'.format(year)] = individus
    del eec_individu, fpr_individu
def build_clean_aliss_data_frame():
    # Load the 2011 ALISS table and decode the 16 combined household types into
    # separate 'age' (0-3) and 'revenus' (0-3) codes. Python 2 code (print statement).
    year = 2011
    aliss_survey_collection = SurveyCollection.load(
        collection='aliss', config_files_directory=config_files_directory)
    survey = aliss_survey_collection.get_survey('aliss_{}'.format(year))
    aliss = survey.get_values(table='Base_ALISS_2011')
    # Sentinel value 99 marks rows not yet assigned a code.
    aliss['age'] = 99
    aliss['revenus'] = 99
    # (type-label prefix, age code, income code)
    triplets = [
        ('1 : Jeune/Ais', 0, 3),
        ('2 : Jeune/MoyenSup', 0, 2),
        ('3 : Jeune/MoyenInf', 0, 1),
        ('4 : Jeune/Modeste', 0, 0),
        ('5 : Age Moyen/Ais', 1, 3),
        ('6 : Age Moyen/MoyenSup', 1, 2),
        ('7 : Age Moyen/MoyenInf', 1, 1),
        ('8 : Age Moyen/Modeste', 1, 0),
        ('9 : Age Sup/Ais', 2, 3),
        ('10 : Age Sup/MoyenSup', 2, 2),
        ('11 : Age Sup/MoyenInf', 2, 1),
        ('12 : Age Sup/Modeste', 2, 0),
        ('13 : Vieux/Ais', 3, 3),
        ('14 : Vieux/MoyenSup', 3, 2),
        ('15 : Vieux/MoyenInf', 3, 1),
        ('16 : Vieux/Modeste', 3, 0),
        ]
    for household_type, age, revenus in triplets:
        print household_type, age, revenus
        selection = aliss.type.str.startswith(household_type)
        aliss.loc[selection, 'age'] = age
        aliss.loc[selection, 'revenus'] = revenus
    # Every row must have been matched by exactly one type prefix.
    assert aliss.age.isin(range(4)).all()
    assert aliss.revenus.isin(range(4)).all()
    del aliss['type']
    return aliss
def store_input_data_frame(data_frame=None, collection=None, survey=None, table=None):
    """Insert *data_frame* as *table* of survey *survey* in *collection* and dump its metadata.

    Creates the collection and/or the survey when they do not exist yet.
    *table* defaults to "input".
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection=collection)
    except Exception:
        # The collection is not registered yet: start a fresh one.
        openfisca_survey_collection = SurveyCollection(name=collection)
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    if table is None:
        table = "input"
    # Bug fix: this assignment was commented out, leaving survey_name undefined
    # (NameError) on every call.
    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                  "{}.h5".format(survey_name))
    available_survey_names = [
        survey_.name for survey_ in openfisca_survey_collection.surveys
        ]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        # Only register the survey when it was not already part of the collection,
        # to avoid duplicate entries in the surveys list.
        openfisca_survey_collection.surveys.append(survey)
    survey.insert_table(name=table, data_frame=data_frame)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory,
                                  '{}.json'.format(collection))
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def show(ctx, collection_name, survey_name = None, tables_names = None):
    """CLI helper: echo a survey collection, optionally one survey and some of its tables.

    Args:
        ctx: click context; ``ctx.obj['CONFIG_FILE']`` points to the config file.
        collection_name: key of the collection in the config's [collections] section.
        survey_name: optional survey to display.
        tables_names: optional iterable of table names to dump as YAML.
    """
    parser = SafeConfigParser()
    parser.read(ctx.obj['CONFIG_FILE'])
    json_file_path = os.path.abspath(parser.get("collections", collection_name))
    survey_collection = SurveyCollection.load(json_file_path = json_file_path)
    click.echo(survey_collection)
    survey = None
    if survey_name is not None:
        # next(..., None) instead of [...][0]: the original indexing raised
        # IndexError when the survey was absent, so the "not an element"
        # message below was unreachable.
        survey = next(
            (kept_survey for kept_survey in survey_collection.surveys
                if kept_survey.name == survey_name),
            None,
            )
        if survey is not None:
            click.echo(survey)
        else:
            # surveys is a list, so build the name list explicitly
            # (the original called .keys() on it, which would fail).
            click.echo("{} is not an element of collection {} surveys ({})".format(
                survey_name,
                collection_name,
                ', '.join(kept_survey.name for kept_survey in survey_collection.surveys),
                ))
    if tables_names and survey is not None:
        # Guarded on `survey`: dumping tables requires a resolved survey
        # (the original hit an unbound/None `survey` here otherwise).
        for table_name in tables_names:
            click.echo(yaml.safe_dump(
                {"table {}".format(table_name): survey.tables[table_name]},
                default_flow_style = False,
                ))
def create(cls, year = None, input_data_frame = None):
    """Build an instance for `year` from `input_data_frame`, or from the
    "input" table of the openfisca_data_{year} survey when no frame is given.

    ERF-style column names are renamed to openfisca variable names.

    Args:
        year: data year (required).
        input_data_frame: optional pandas.DataFrame to use instead of the survey table.
    """
    assert year is not None
    openfisca_survey_collection = SurveyCollection.load(
        collection = "openfisca", config_files_directory = config_files_directory)
    openfisca_survey = openfisca_survey_collection.get_survey("openfisca_data_{}".format(year))
    variable_name_by_erf_name = dict(
        alr = 'pensions_alimentaires_percues',
        choi = 'chomage_imposable',
        cho_ld = 'chomeur_longue_duree',
        fra = 'frais_reels',
        rsti = 'retraite_imposable',
        sali = 'salaire_imposable',
        )
    # `if input_data_frame:` calls DataFrame.__bool__, which raises
    # "truth value of a DataFrame is ambiguous"; compare with None instead.
    if input_data_frame is None:
        input_data_frame = openfisca_survey.get_values(table = "input")
    # NB: rename(..., inplace = True) returns None, so it must not terminate a
    # method chain (the original bound None to input_data_frame here).
    input_data_frame = (input_data_frame
        .reset_index(drop = True)
        .rename(columns = variable_name_by_erf_name)
        )
    return cls().init_from_data_frame(
        input_data_frame = input_data_frame,
        year = year,
        )
def build_survey_collection(collection_name = None, replace_metadata = False, replace_data = False,
        data_directory_path_by_survey_suffix = None, source_format = 'sas'):
    # Build (or update) a survey collection from raw data directories.
    # For each (survey_suffix, directory) pair, the matching data files are
    # registered in the collection, the collection JSON is dumped, and the
    # survey's HDF5 store is (re)filled.
    # NOTE(review): Python 2 code (`iteritems`, print statement) — kept as-is.
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    surveys_name = data_directory_path_by_survey_suffix.keys()
    assert surveys_name is not None, "A list of surveys to process is needed"
    # replace_metadata: start from a blank collection instead of loading the existing one.
    if replace_metadata:
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # Collection not declared in the config yet: create it.
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.iteritems():
        assert os.path.isdir(data_directory_path)
        data_file_by_format = create_data_file_by_format(data_directory_path)
        print data_file_by_format
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format[source_format],
            )
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist. Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
        collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))
        # Dump the JSON before filling the HDF5 so metadata is persisted even if fill fails.
        survey_collection.dump(json_file_path = collection_json_path)
        # Only (re)fill the surveys matching the current suffix.
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def coicop_from_aliss(year=2011):
    """Derive the COICOP / ALISS code correspondence from the 2011 ALISS base.

    Each unique `souscode` value encodes a 4-digit COICOP suffix, a 6-char
    ALISS code and a trailing label; they are split into separate columns.

    Returns:
        pandas.DataFrame with columns code_coicop, label, code_aliss.
    """
    assert year == 2011
    aliss_survey_collection = SurveyCollection.load(
        collection='aliss', config_files_directory=config_files_directory)
    survey = aliss_survey_collection.get_survey('aliss_{}'.format(year))
    aliss = survey.get_values(table='Base_ALISS_2011')
    entries = [
        dict(
            code_coicop='0' + '.'.join(raw_code[:4]),
            label=raw_code[6:],
            code_aliss=raw_code[:6],
            )
        for raw_code in aliss.souscode.unique()
        ]
    result = pandas.DataFrame(entries)
    assert not result.code_coicop.duplicated().any()
    return result
def coicop_from_aliss(year = 2011):
    """Build the COICOP/ALISS correspondence table from the 2011 ALISS survey.

    Splits every unique `souscode` string into a COICOP code (first 4 chars,
    dot-separated, prefixed with '0'), an ALISS code (first 6 chars) and a
    label (remainder), and returns them as a DataFrame.
    """
    assert year == 2011
    survey_collection = SurveyCollection.load(
        collection = 'aliss',
        config_files_directory = config_files_directory
        )
    aliss_table = survey_collection.get_survey('aliss_{}'.format(year)).get_values(
        table = 'Base_ALISS_2011')
    records = list()
    for souscode in aliss_table.souscode.unique():
        records.append({
            'code_coicop': '0' + '.'.join(souscode[:4]),
            'label': souscode[6:],
            'code_aliss': souscode[:6],
            })
    correspondence = pandas.DataFrame(records)
    # COICOP codes must be unique for the mapping to be usable as a key.
    assert not correspondence.code_coicop.duplicated().any()
    return correspondence
def create(cls, year=None, input_data_frame=None):
    """Create an instance for `year`, using `input_data_frame` when provided,
    otherwise the "input" table of the openfisca_data_{year} survey.

    ERF column names are renamed to openfisca variable names before the
    frame is handed to init_from_data_frame.
    """
    assert year is not None
    openfisca_survey_collection = SurveyCollection.load(
        collection="openfisca", config_files_directory=config_files_directory)
    openfisca_survey = openfisca_survey_collection.get_survey(
        "openfisca_data_{}".format(year))
    renaming = dict(
        alr='pensions_alimentaires_percues',
        choi='chomage_imposable',
        cho_ld='chomeur_longue_duree',
        fra='frais_reels',
        rsti='retraite_imposable',
        sali='salaire_imposable',
        )
    # Explicit None test: DataFrame truthiness raises ValueError on
    # multi-element frames, so `if input_data_frame:` is a latent crash.
    if input_data_frame is None:
        input_data_frame = openfisca_survey.get_values(table="input")
    # Do NOT pass inplace=True inside a chain: rename(..., inplace=True)
    # returns None, which the original assigned back to input_data_frame.
    input_data_frame = input_data_frame.reset_index(drop=True).rename(columns=renaming)
    return cls().init_from_data_frame(
        input_data_frame=input_data_frame,
        year=year,
        )
def create_fip(year = None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    assert year is not None
    # fip: "fichier d'imposition des personnes" (persons attached to income-tax returns)
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    temporary_store = TemporaryStore.create(file_name = "erfs")
    replace = create_replace(year)
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info(u"Démarrage de 03_fip")
    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)
    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac is encoded on 5 characters (1 letter + 4-digit birth year),
    # so the longest anaisenf string bounds the number of pac per foyer.
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))
    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.
    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list
    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # Placeholder frame (random values are overwritten column by column below).
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    log.info("{}".format(fip.info()))
    for i in range(1, nb_pac_max + 1):
        # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values
    # One row per (foyer, pac) pair.
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)
    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    # TODO: add box I: "Including children holding a disability card" ("carte d'invalidité")
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), "Certains type de PAC sont inconnus"
    # TODO: find a more explicit message
    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])
    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # filter: work only on F & G
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep rows with distinct declar/naia pairs plus twins,
    # then drop the others (both F and G).
    log.info(u"longueur fip {}".format(len(fip)))
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values
    fip.update(type_HI)
    # Rows untouched by the F/G and H/I passes are kept by default.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(len(fip[fip['to_keep']]), len(fip)))
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    # # control(indivifip, debug=True)
    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"
    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Match keys: (birth year, first 29 chars of the declaration id).
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)
    # fip = pac declared on tax forms but absent from the eec pac rows.
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()
    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")
    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))
    del pac_ind1['key1'], pac_ind2['key2']
    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))
    log.info(u" 2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))
    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))
    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv
    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()
    # # We keep the fip in the menage of their parents because it is used in to
    # # build the famille. We should build an individual ident (ménage) for the fip that are
    # # older than 18 since they are not in their parents' menage according to the eec
    # Legacy R code kept for reference:
    # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
    # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
    # individec1 <- upData(individec1,rename=c(declar1="declar"))
    # fip1 <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u" 2.3 : fip1 created")
    # # TODO: declar2 is not handled for now
    # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
    # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
    # # individec2 <- upData(individec2,rename=c(declar2="declar"))
    # # fip2 <-merge(fip,individec2)
    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()
    fip = concat([fip1, fip2])
    # Fill the eec-style columns so fip rows can be appended to the eec individuals.
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5
    ## TODO: actrec problem for fip children between 16 and 20: we cannot tell students from employees */
    ## TODO: problem with the birth months of FIP children: see whether these values can be recovered: Alexis: clearly not
    # Reassigning noi for fip children if they are more than one per foyer fiscal
    # Legacy R code kept for reference:
    # while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
    #   dup <- duplicated( fip[, c("noi","ident")])
    #   tmp <- fip[dup,"noi"]
    #   fip[dup, "noi"] <- (tmp-1)
    # }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")
    fip_tmp = fip[['noi', 'ident']]
    # Decrement noi of duplicates until (noi, ident) pairs are unique.
    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0
    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def init_from_survey_tables(self, calibration_kwargs=None, data_year=None, inflation_kwargs=None,
        rebuild_input_data=False, rebuild_kwargs=None, input_survey_kwargs=None, trace=False,
        memory_config=None, tax_and_benefit_system=None):
    """Initialise the simulation(s) from survey tables, then optionally calibrate and inflate.

    Args:
        calibration_kwargs: dict restricted to target_margins_by_variable,
            parameters, total_population; passed to self.calibrate.
        data_year: year of the input data; defaults to self.year.
        inflation_kwargs: dict restricted to inflator_by_variable,
            target_by_variable; passed to self.inflate.
        rebuild_input_data: when True, rebuild the input data first.
        rebuild_kwargs: extra keyword arguments for self.build_input_data.
        input_survey_kwargs: may carry 'input_survey' and 'baseline_input_survey'.
        trace, memory_config: forwarded to self.new_simulation.
        tax_and_benefit_system: required tax-benefit system.
    """
    assert tax_and_benefit_system is not None
    if data_year is None:
        data_year = self.year
    # Validate kwargs early so typos fail before any expensive work.
    if calibration_kwargs is not None:
        assert set(calibration_kwargs.keys()).issubset(
            set(['target_margins_by_variable', 'parameters', 'total_population']))
    if inflation_kwargs is not None:
        assert set(inflation_kwargs.keys()).issubset(
            set(['inflator_by_variable', 'target_by_variable']))
    if rebuild_input_data:
        if rebuild_kwargs is not None:
            self.build_input_data(
                year=data_year, tax_and_benefit_system=tax_and_benefit_system, **rebuild_kwargs)
        else:
            self.build_input_data(
                year=data_year, tax_and_benefit_system=tax_and_benefit_system)
    # No per-period tables configured: load the whole input table from the survey.
    if self.input_data_table_by_period is None:
        openfisca_survey_collection = SurveyCollection.load(collection=self.collection)
        openfisca_survey = openfisca_survey_collection.get_survey(
            "{}_{}".format(self.input_data_survey_prefix, data_year))
        input_data_frame = openfisca_survey.get_values(table="input").reset_index(drop=True)
        self.init_from_data_frame(input_data_frame=input_data_frame, )
    else:
        pass
    #
    input_survey_kwargs = input_survey_kwargs if input_survey_kwargs else dict()
    self.new_simulation(
        survey=input_survey_kwargs.get('input_survey'), trace=trace, memory_config=memory_config)
    # Build the baseline (counterfactual) simulation when a baseline system exists.
    if self.baseline_tax_benefit_system is not None:
        self.new_simulation(
            use_baseline=True,
            survey=input_survey_kwargs.get('baseline_input_survey'),
            trace=trace,
            memory_config=memory_config)
    #
    if calibration_kwargs:
        self.calibrate(**calibration_kwargs)
    if inflation_kwargs:
        self.inflate(**inflation_kwargs)
def build_homogeneisation_caracteristiques_sociales(temporary_store=None, year=None): u"""Homogénéisation des caractéristiques sociales des ménages """ assert temporary_store is not None assert year is not None # Load data bdf_survey_collection = SurveyCollection.load( collection='budget_des_familles', config_files_directory=config_files_directory) survey = bdf_survey_collection.get_survey( 'budget_des_familles_{}'.format(year)) # ****************************************************************************************************************** # * Etape n° 0-3 : HOMOGENEISATION DES CARACTERISTIQUES SOCIALES DES MENAGES # ****************************************************************************************************************** # ****************************************************************************************************************** if year == 1995: kept_variables = [ 'exdep', 'exrev', 'mena', 'v', 'ponderrd', 'nbpers', 'nbenf', 'typmen1', 'cohabpr', 'sexepr', 'agepr', 'agecj', 'matripr', 'occuppr', 'occupcj', 'nbact', 'sitlog', 'stalog', 'mena', 'nm14a', 'typmen1' ] menage = survey.get_values( table="socioscm", variables=kept_variables, ) # cette étape permet de ne garder que les données dont on est sûr de la qualité et de la véracité # exdep = 1 si les données sont bien remplies pour les dépenses du ménage # exrev = 1 si les données sont bien remplies pour les revenus du ménage menage = menage[(menage.exdep == 1) & (menage.exrev == 1)] menage.rename( columns={ 'v': 'vag', 'mena': 'ident_men', 'ponderrd': 'pondmen', 'nbpers': 'npers', 'nm14a': 'nenfants', 'nbenf': 'nenfhors', 'nbact': 'nactifs', 'cohabpr': 'couplepr', 'matripr': 'etamatri', 'typmen1': 'typmen' }, inplace=True, ) # la variable vag est utilisée dans les modèles QAIDS et AIDS comme variable temporelle afin d'attibuer # le bon prix mensuel menage.agecj = menage.agecj.fillna(0) menage.nenfhors = menage.nenfhors.fillna(0) menage.vag = menage.vag.astype('int') menage['nadultes'] = 
menage['npers'] - menage['nenfants'] menage['ocde10'] = 1 + 0.5 * numpy.maximum( 0, menage['nadultes'] - 1) + 0.3 * menage['nenfants'] # harmonisation des types de ménage sur la nomenclature 2010 menage['typmen_'] = menage['typmen'] menage.typmen[menage.typmen_ == 1] = 1 menage.typmen[menage.typmen_ == 2] = 3 menage.typmen[menage.typmen_ == 3] = 4 menage.typmen[menage.typmen_ == 4] = 4 menage.typmen[menage.typmen_ == 5] = 4 menage.typmen[menage.typmen_ == 6] = 2 menage.typmen[menage.typmen_ == 7] = 5 del menage['typmen_'] var_to_ints = ['couplepr', 'etamatri'] for var_to_int in var_to_ints: menage[var_to_int] = menage[var_to_int].astype(int) # Methode : # 1. on nettoite les variables (i.e. changement de nom de format) # 2. Reformatage des variables (réattribution des catégories pour quelles soient identiques # pour les différentes années) menage["situacj"] = 0 menage.situacj[menage.occupcj == 1] = 1 menage.situacj[menage.occupcj == 3] = 3 menage.situacj[menage.occupcj == 2] = 4 menage.situacj[menage.occupcj == 5] = 5 menage.situacj[menage.occupcj == 6] = 5 menage.situacj[menage.occupcj == 7] = 6 menage.situacj[menage.occupcj == 8] = 7 menage.situacj[menage.occupcj == 4] = 8 menage["situapr"] = 0 menage.situapr[menage.occuppr == 1] = 1 menage.situapr[menage.occuppr == 3] = 3 menage.situapr[menage.occuppr == 2] = 4 menage.situapr[menage.occuppr == 5] = 5 menage.situapr[menage.occuppr == 6] = 5 menage.situapr[menage.occuppr == 7] = 6 menage.situapr[menage.occuppr == 8] = 7 menage.situapr[menage.occuppr == 4] = 8 menage["typlog"] = 0 menage.typlog[menage.sitlog == 1] = 1 menage.typlog[menage.sitlog != 1] = 2 menage['stalog'] = menage['stalog'].astype(int) individus = survey.get_values(table="individu", ) variables = ['mena', 'v'] individus.rename( columns={'mena': 'identmen'}, inplace=True, ) menage.set_index('ident_men', inplace=True) if year == 2000: menage = survey.get_values( table="menage", variables=[ 'ident', 'pondmen', 'nbact', 'nbenf1', 'nbpers', 'ocde10', 
'sitlog', 'stalog', 'strate', 'typmen1', 'zeat', 'stalog', 'vag', 'sexepr', 'sexecj', 'agecj', 'napr', 'nacj', 'cs2pr', 'cs2cj', 'diegpr', 'dieppr', 'diespr', 'diegcj', 'diepcj', 'diescj', 'hod_nb', 'cohabpr', 'occupapr', 'occupacj', 'occupbpr', 'occupbcj', 'occupcpr', 'occupccj', 'typmen1' ]) menage.rename( columns={ 'cohabpr': 'couplepr', 'hod_nb': 'nenfhors', 'ident': 'ident_men', 'nbact': 'nactifs', 'nbenf1': 'nenfants', 'nbpers': 'npers', 'rev81': 'poste_coicop_421', 'typmen1': 'typmen' }, inplace=True, ) menage.ocde10 = menage.ocde10 / 10 # on met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles # pour le modèle de demande menage.agecj = menage.agecj.fillna(0) assert menage.notnull().all().all( ), 'The following variables contains NaN values: {}'.format( list(menage.isnull().any()[menage.isnull().any()].index)) menage['vag_'] = menage['vag'] menage.vag.loc[menage.vag_ == 1] = 9 menage.vag.loc[menage.vag_ == 2] = 10 menage.vag.loc[menage.vag_ == 3] = 11 menage.vag.loc[menage.vag_ == 4] = 12 menage.vag.loc[menage.vag_ == 5] = 13 menage.vag.loc[menage.vag_ == 6] = 14 menage.vag.loc[menage.vag_ == 7] = 15 menage.vag.loc[menage.vag_ == 8] = 16 del menage['vag_'] # harmonisation des types de ménage sur la nomenclature 2010 menage['typmen_'] = menage['typmen'] menage.typmen.loc[menage.typmen_ == 1] = 1 menage.typmen.loc[menage.typmen_ == 2] = 3 menage.typmen.loc[menage.typmen_ == 3] = 4 menage.typmen.loc[menage.typmen_ == 4] = 4 menage.typmen.loc[menage.typmen_ == 5] = 4 menage.typmen.loc[menage.typmen_ == 6] = 2 menage.typmen.loc[menage.typmen_ == 7] = 5 del menage['typmen_'] menage.couplepr = menage.couplepr.astype('int') menage["nadultes"] = menage['npers'] - menage['nenfants'] menage.typmen = menage.typmen.astype('int') # occupa : 1 si la personne travaille, 2 sinon. occupb : 1 si elle travaille effectivement, 2 si congé de # longue durée (négligé ici). 
occupc : de 2 à 8 selon le statut si ne travaille pas (étudiant, retraité, etc.) menage["situacj"] = 0 menage.situacj.loc[menage.occupacj == 1] = 1 menage.situacj.loc[menage.occupccj == 3] = 3 menage.situacj.loc[menage.occupccj == 2] = 4 menage.situacj.loc[menage.occupccj == 5] = 5 menage.situacj.loc[menage.occupccj == 6] = 5 menage.situacj.loc[menage.occupccj == 7] = 6 menage.situacj.loc[menage.occupccj == 8] = 7 menage.situacj.loc[menage.occupccj == 4] = 8 menage["situapr"] = 0 menage.situapr.loc[menage.occupapr == 1] = 1 menage.situapr.loc[menage.occupcpr == 3] = 3 menage.situapr.loc[menage.occupcpr == 2] = 4 menage.situapr.loc[menage.occupcpr == 5] = 5 menage.situapr.loc[menage.occupcpr == 6] = 5 menage.situapr.loc[menage.occupcpr == 7] = 6 menage.situapr.loc[menage.occupcpr == 8] = 7 menage.situapr.loc[menage.occupcpr == 4] = 8 menage["natiocj"] = 0 menage["natiopr"] = 0 menage.natiocj.loc[menage.nacj == 1] = 1 menage.natiocj.loc[menage.nacj == 2] = 1 menage.natiocj.loc[menage.nacj == 3] = 2 menage.natiopr.loc[menage.napr == 1] = 1 menage.natiopr.loc[menage.napr == 2] = 1 menage.natiopr.loc[menage.napr == 3] = 2 menage["typlog"] = 0 menage.typlog.loc[menage.sitlog == 1] = 1 menage.typlog.loc[menage.sitlog != 1] = 2 # Homogénéisation des diplômes, choix d'équivalence entre les diplômes menage["dip14pr"] = 999999 menage.dip14pr.loc[menage.diegpr == 0] = 71 menage.dip14pr.loc[menage.diegpr == 2] = 70 menage.dip14pr.loc[menage.diegpr == 15] = 60 menage.dip14pr.loc[menage.diegpr == 18] = 60 menage.dip14pr.loc[menage.diegpr == 16] = 41 menage.dip14pr.loc[menage.diegpr == 17] = 41 menage.dip14pr.loc[menage.diegpr == 19] = 41 menage.dip14pr.loc[menage.dieppr == 23] = 50 menage.dip14pr.loc[menage.dieppr == 25] = 50 menage.dip14pr.loc[menage.dieppr == 27] = 50 menage.dip14pr.loc[menage.dieppr == 29] = 50 menage.dip14pr.loc[menage.dieppr == 34] = 43 menage.dip14pr.loc[menage.dieppr == 32] = 42 menage.dip14pr.loc[menage.dieppr == 36] = 42 menage.dip14pr.loc[menage.diespr 
== 41] = 30 menage.dip14pr.loc[menage.diespr == 42] = 31 menage.dip14pr.loc[menage.diespr == 43] = 31 menage.dip14pr.loc[menage.diespr == 44] = 33 menage.dip14pr.loc[menage.diespr == 46] = 20 menage.dip14pr.loc[menage.diespr == 48] = 12 menage.dip14pr.loc[menage.diespr == 47] = 10 menage.set_index('ident_men', inplace=True) # Recodage des catégories zeat menage.zeat.loc[menage.zeat == 7] = 6 menage.zeat.loc[menage.zeat == 8] = 7 menage.zeat.loc[menage.zeat == 9] = 8 assert menage.zeat.isin(range(1, 9)).all() individus = survey.get_values( table="individus", variables=['ident', 'matri', 'lien', 'anais']) individus = individus.loc[individus.lien == 1].copy() individus.rename( columns={ 'ident': 'ident_men', 'matri': 'etamatri' }, inplace=True, ) variables_to_destring = ['anais'] for variable_to_destring in variables_to_destring: individus[variable_to_destring] = individus[ variable_to_destring].astype('int').copy() individus['agepr'] = year - individus.anais individus.set_index('ident_men', inplace=True) assert menage.notnull().all().all( ), 'The following variables contains NaN values: {}'.format( list(menage.isnull().any()[menage.isnull().any()].index)) menage = menage.merge(individus, left_index=True, right_index=True) if year == 2005: menage = survey.get_values(table="menage") # données socio-démographiques socio_demo_variables = [ 'agpr', 'agcj', 'couplepr', 'decuc', 'ident_men', 'nactifs', 'nenfants', 'nenfhors', 'npers', 'ocde10', 'pondmen', 'sexecj', 'sexepr', 'typmen5', 'vag', 'zeat', 'cs24pr' ] socio_demo_variables += [ column for column in menage.columns if column.startswith('dip14') ] socio_demo_variables += [ column for column in menage.columns if column.startswith('natio7') ] # activité professionnelle activite_prof_variables = ['situacj', 'situapr'] activite_prof_variables += [ column for column in menage.columns if column.startswith('cs42') ] # logement logement_variables = ['htl', 'strate'] menage = menage[socio_demo_variables + 
activite_prof_variables + logement_variables] menage.rename( columns={ # "agpr": "agepr", "agcj": "agecj", "typmen5": "typmen", "cs24pr": "cs_pr" }, inplace=True, ) del menage['agpr'] menage['nadultes'] = menage.npers - menage.nenfants for person in ['pr', 'cj']: menage['natio' + person] = (menage['natio7' + person] > 2 ) # TODO: changer de convention ? del menage['natio7' + person] menage.agecj = menage.agecj.fillna(0) menage.nenfhors = menage.nenfhors.fillna(0) var_to_ints = [ 'ocde10', 'decuc', 'nactifs', 'nenfants', 'npers', 'pondmen', 'nadultes' ] assert menage.notnull().all().all( ), 'The following variables contains NaN values: {}'.format( list(menage.isnull().any()[menage.isnull().any()].index)) menage.couplepr = menage.couplepr > 2 # TODO: changer de convention ? menage.ocde10 = menage.ocde10 / 10 menage.set_index('ident_men', inplace=True) # on met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles # pour le modèle de demande menage['vag_'] = menage['vag'] menage.vag.loc[menage.vag_ == 1] = 17 menage.vag.loc[menage.vag_ == 2] = 18 menage.vag.loc[menage.vag_ == 3] = 19 menage.vag.loc[menage.vag_ == 4] = 20 menage.vag.loc[menage.vag_ == 5] = 21 menage.vag.loc[menage.vag_ == 6] = 22 del menage['vag_'] # Recodage des catégories zeat menage.zeat.loc[menage.zeat == 7] = 6 menage.zeat.loc[menage.zeat == 8] = 7 menage.zeat.loc[menage.zeat == 9] = 8 assert menage.zeat.isin(range(1, 9)).all() stalog = survey.get_values(table="depmen", variables=['ident_men', 'stalog']) stalog['stalog'] = stalog.stalog.astype('int').copy() stalog['new_stalog'] = 0 stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1 stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2 stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3 stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4 stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5 stalog.stalog = stalog.new_stalog.copy() del stalog['new_stalog'] assert stalog.stalog.isin(range(1, 6)).all() 
stalog.set_index('ident_men', inplace=True) menage = menage.merge(stalog, left_index=True, right_index=True) menage['typlog'] = 2 menage.loc[menage.htl.isin(['1', '5']), 'typlog'] = 1 assert menage.typlog.isin([1, 2]).all() del menage['htl'] individus = survey.get_values(table='individu') # Il y a un problème sur l'année de naissance, # donc on le recalcule avec l'année de naissance et la vague d'enquête individus['agepr'] = year - individus.anais individus.loc[individus.vag == 6, ['agepr']] = year + 1 - individus.anais individus = individus[individus.lienpref == 00].copy() kept_variables = ['ident_men', 'etamatri', 'agepr'] individus = individus[kept_variables].copy() individus.etamatri.loc[individus.etamatri == 0] = 1 individus['etamatri'] = individus['etamatri'].astype( 'int') # MBJ TODO: define as a catagory ? individus.set_index('ident_men', inplace=True) menage = menage.merge(individus, left_index=True, right_index=True) individus = survey.get_values( table='individu', variables=[ 'ident_men', 'ident_ind', 'age', 'anais', 'vag', 'lienpref' ], ) # Il y a un problème sur l'année de naissance, # donc on le recalcule avec l'année de naissance et la vague d'enquête individus['age'] = year - individus.anais individus.loc[individus.vag == 6, ['age']] = year + 1 - individus.anais # Garder toutes les personnes du ménage qui ne sont pas la personne de référence et le conjoint individus = individus[(individus.lienpref != 00) & (individus.lienpref != 01)].copy() individus.sort_values(by=['ident_men', 'ident_ind'], inplace=True) # Inspired by http://stackoverflow.com/questions/17228215/enumerate-each-row-for-each-group-in-a-dataframe def add_col_numero(data_frame): data_frame['numero'] = numpy.arange(len(data_frame)) + 3 return data_frame individus = individus.groupby(by='ident_men').apply(add_col_numero) pivoted = individus.pivot(index='ident_men', columns="numero", values='age') pivoted.columns = [ "age{}".format(column) for column in pivoted.columns ] menage = 
menage.merge(pivoted, left_index=True, right_index=True, how='outer') individus = survey.get_values( table='individu', variables=['ident_men', 'ident_ind', 'agfinetu', 'lienpref'], ) individus.set_index('ident_men', inplace=True) pr = individus.loc[individus.lienpref == 00, 'agfinetu'].copy() conjoint = individus.loc[individus.lienpref == 01, 'agfinetu'].copy() conjoint.name = 'agfinetu_cj' agfinetu_merged = pandas.concat([pr, conjoint], axis=1) menage = menage.merge(agfinetu_merged, left_index=True, right_index=True) temporary_store['donnes_socio_demog_{}'.format(year)] = menage # label var agepr "Age de la personne de référence au 31/12/${yearrawdata}" # label var agecj "Age du conjoint de la PR au 31/12/${yearrawdata}" # label var sexepr "Sexe de la personne de référence" # label var sexecj "Sexe du conjoint de la PR" # label var cs42pr "Catégorie socio-professionnelle de la PR" # label var cs42cj "Catégorie socio-professionnelle du conjoint de la PR" # label var ocde10 "Nombre d'unités de consommation (échelle OCDE)" # label var ident_men "Identifiant du ménage" # label var pondmen "Ponderation du ménage" # label var npers "Nombre total de personnes dans le ménage" # label var nadultes "Nombre d'adultes dans le ménage" # label var nenfants "Nombre d'enfants dans le ménage" # label var nenfhors "Nombre d'enfants vivant hors domicile" # label var nactifs "Nombre d'actifs dans le ménage" # label var couplepr "Vie en couple de la personne de référence" # label define typmen5 1 "Personne seule" 2 "Famille monoparentale" 3 "Couple sans enfant" # 4 "Couple avec enfants" 5 "Autre type de ménage (complexe)" # label values typmen5 typmen5 # label var typmen5 "Type de ménage (5 modalités)" # label var etamatri "Situation matrimoniale de la personne de référence" # label define matripr 1 "Célibataire" 2 "Marié(e)" 3 "Veuf(ve)" 4 "Divorcé(e)" # label values etamatri matripr # label define occupation 1 "Occupe un emploi" /// # 2 "Apprenti" /// # 3 "Etudiant, élève, en 
formation" /// # 4 "Chômeur (inscrit ou non à l'ANPE)" /// # 5 "Retraité, préretraité ou retiré des affaires" /// # 6 "Au foyer" /// # 7 "Autre situation (handicapé)" /// # 8 "Militaire du contingent" # label values situapr occupation # label values situacj occupation # label var situapr "Situation d'activité de la personne de référence" # label var situacj "Situation d'activité du conjoint de la PR" # label define diplome 10 "Diplôme de 3ème cycle universitaire, doctorat" /// # 12 "Diplôme d'ingénieur, grande école" /// # 20 "Diplôme de 2nd cycle universitaire" /// # 30 "Diplôme de 1er cycle universitaire" /// # 31 "BTS, DUT ou équivalent" /// # 33 "Diplôme des professions sociales et de la santé niveau Bac +2" /// # 41 "Baccalauréat général, brevet supérieur, capacité en droit" /// # 42 "Baccalauréat technologique" /// # 43 "Baccalauréat professionnel" /// # 44 "Brevet professionnel ou de technicien" /// # 50 "CAP, BEP ou diplôme de même niveau" /// # 60 "Brevet des collèges, BEPC" /// # 70 "Certificat d'études primaires" /// # 71 "Aucun diplôme" # label values dip14pr diplome # label values dip14cj diplome # label var dip14pr "Diplôme le plus élevé de la PR" # label var dip14cj "Diplôme le plus élevé du conjoint de la PR" # label define nationalite 1 "Français, par naissance ou naturalisation" 2 "Etranger" # label values natiopr nationalite # label values natiocj nationalite # label var natiopr "Nationalité de la personne de référence" # label var natiocj "Nationalité du conjoint de la PR" # label define logement 1 "Maison" 2 "Appartement" # label values typlog logement # label var typlog "Type de logement" # label define statutlogement 1 "Propriétaire ou copropriétaire" /// # 2 "Accédant à la propriété (rembourse un prêt)" /// # 3 "Locataire" /// # 4 "Sous-locataire" /// # 5 "Logé gratuitement" # label values stalog statutlogement # label var stalog "Statut d'occupation du logement" # label define viecouple 1 "Vit en couple" 2 "Ne vit pas en couple" # label 
values couplepr viecouple # # /* Recodage des CSP en 12 et 8 postes à partir de classification de l'INSEE (2003, PCS niveaux 1 et 2) */ # gen cs24pr=00 # replace cs24pr=10 if cs42pr=="11" # replace cs24pr=10 if cs42pr=="12" # replace cs24pr=10 if cs42pr=="13" # replace cs24pr=21 if cs42pr=="21" # replace cs24pr=22 if cs42pr=="22" # replace cs24pr=23 if cs42pr=="23" # replace cs24pr=31 if cs42pr=="31" # replace cs24pr=32 if cs42pr=="33" # replace cs24pr=32 if cs42pr=="34" # replace cs24pr=32 if cs42pr=="35" # replace cs24pr=36 if cs42pr=="37" # replace cs24pr=36 if cs42pr=="38" # replace cs24pr=41 if cs42pr=="42" # replace cs24pr=41 if cs42pr=="43" # replace cs24pr=41 if cs42pr=="44" # replace cs24pr=41 if cs42pr=="45" # replace cs24pr=46 if cs42pr=="46" # replace cs24pr=47 if cs42pr=="47" # replace cs24pr=48 if cs42pr=="48" # replace cs24pr=51 if cs42pr=="52" # replace cs24pr=51 if cs42pr=="53" # replace cs24pr=54 if cs42pr=="54" # replace cs24pr=55 if cs42pr=="55" # replace cs24pr=56 if cs42pr=="56" # replace cs24pr=61 if cs42pr=="62" # replace cs24pr=61 if cs42pr=="63" # replace cs24pr=61 if cs42pr=="64" # replace cs24pr=61 if cs42pr=="65" # replace cs24pr=66 if cs42pr=="67" # replace cs24pr=66 if cs42pr=="68" # replace cs24pr=69 if cs42pr=="69" # replace cs24pr=71 if cs42pr=="71" # replace cs24pr=72 if cs42pr=="72" # replace cs24pr=73 if cs42pr=="74" # replace cs24pr=73 if cs42pr=="75" # replace cs24pr=76 if cs42pr=="77" # replace cs24pr=76 if cs42pr=="78" # replace cs24pr=81 if cs42pr=="81" # replace cs24pr=82 if cs42pr=="83" # replace cs24pr=82 if cs42pr=="84" # replace cs24pr=82 if cs42pr=="85" # replace cs24pr=82 if cs42pr=="86" # replace cs24pr=82 if cs42pr=="**" # replace cs24pr=82 if cs42pr=="00" # menage['cs24pr'] = 0 csp42s_by_csp24 = { 10: ["11", "12", "13"], 21: ["21"], 22: ["22"], 23: ["23"], 31: ["31"], 32: ["32", "33", "34", "35"], 36: ["37", "38"], 41: ["42", "43", "44", "45"], 46: ["46"], 47: ["47"], 48: ["48"], 51: ["52", "53"], 54: ["54"], 55: 
["55"], 56: ["56"], 61: ["62", "63", "64", "65"], 66: ["67", "68"], 69: ["69"], 71: ["71"], 72: ["72"], 73: ["74", "75"], 76: ["77", "78"], 81: ["81"], 82: ["83", "84", "85", "86", "**", "00"], } for csp24, csp42s in csp42s_by_csp24.items(): menage.loc[menage.cs42pr.isin(csp42s), 'cs24pr'] = csp24 assert menage.cs24pr.isin(csp42s_by_csp24.keys()).all() menage['cs8pr'] = numpy.floor(menage.cs24pr / 10) assert menage.cs8pr.isin(range(1, 9)).all() variables = [ 'pondmen', 'npers', 'nenfants', 'nenfhors', 'nadultes', 'nactifs', 'ocde10', 'typmen', 'sexepr', 'agepr', 'etamatri', 'couplepr', 'situapr', 'dip14pr', 'cs42pr', 'cs24pr', 'cs8pr', 'natiopr', 'sexecj', 'agecj', 'situacj', 'dip14cj', 'cs42cj', 'natiocj', 'typlog', 'stalog' ] + ["age{}".format(age) for age in range(3, 14)] for variable in variables: assert variable in menage.columns, "{} is not a column of menage data frame".format( variable) if year == 2011: variables = [ 'agecj', 'agepr', 'coeffuc', 'decuc1', 'ident_me', 'pondmen', 'npers', 'nenfants', 'nactifs', 'sexepr', 'sexecj', 'dip14cj', 'dip14pr', 'typmen5', 'cataeu', 'situapr', 'situacj', 'zeat', ] try: menage = survey.get_values(table="MENAGE", variables=variables) except: menage = survey.get_values(table="menage", variables=variables) menage.rename( columns={ 'ident_me': 'ident_men', 'coeffuc': 'ocde10', 'typmen5': 'typmen', 'decuc1': 'decuc', 'cataeu': 'strate' }, inplace=True, ) del variables menage.agecj = menage.agecj.fillna(0) # Ajout de la variable vag try: depmen = survey.get_values(table="DEPMEN") except: depmen = survey.get_values(table="depmen") depmen.rename(columns={'ident_me': 'ident_men'}, inplace=True) vague = depmen[['vag', 'ident_men']].copy() stalog = depmen[['stalog', 'ident_men']].copy() del depmen menage.set_index('ident_men', inplace=True) vague.set_index('ident_men', inplace=True) menage = menage.merge(vague, left_index=True, right_index=True) # On met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions 
temporelles pour # le modèle de demande menage['vag_'] = menage['vag'].copy() menage.vag.loc[menage.vag_ == 1] = 23 menage.vag.loc[menage.vag_ == 2] = 24 menage.vag.loc[menage.vag_ == 3] = 25 menage.vag.loc[menage.vag_ == 4] = 26 menage.vag.loc[menage.vag_ == 5] = 27 menage.vag.loc[menage.vag_ == 6] = 28 del menage['vag_'] # Homogénéisation de la variable statut du logement qui prend des valeurs différentes pour 2011 stalog['stalog'] = stalog.stalog.astype('int').copy() stalog['new_stalog'] = 0 stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1 stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2 stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3 stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4 stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5 stalog.stalog = stalog.new_stalog.copy() del stalog['new_stalog'] assert stalog.stalog.isin(range(1, 6)).all() stalog.set_index('ident_men', inplace=True) menage = menage.merge(stalog, left_index=True, right_index=True) # Recodage des catégories zeat menage.loc[menage.zeat == 7, 'zeat'] = 6 menage.zeat.loc[menage.zeat == 8] = 7 menage.zeat.loc[menage.zeat == 9] = 8 assert menage.zeat.isin(range(0, 9)).all() menage.index.name = 'ident_men' # assert menage.index.name == 'ident_men' menage['role_menage'] = 0 temporary_store['donnes_socio_demog_{}'.format(year)] = menage
def create_comparable_logement_data_frame(temporary_store = None, year = None):
    """Build a household dataframe from the "logement" (housing) survey.

    Selects and recodes housing-survey variables (total income and weighted
    income deciles, occupancy status, dwelling and household characteristics)
    so they can be matched against Budget des Familles data, and returns the
    resulting dataframe.

    :param temporary_store: pipeline store; required by the assert below but
        not otherwise used in this function (NOTE(review): confirm it is needed).
    :param year: Budget des Familles year; mapped to a housing-survey vintage
        (2003 -> 2003, 2006-2009 -> 2006).
    :returns: dataframe with one row per surveyed household, columns ``kept_variables``.
    """
    assert temporary_store is not None
    assert year is not None

    logement_adresse_variables = ["gzc2"]
    logement_menage_variables = [
        "maa1at",
        "magtr",
        "mcs8",
        "mdiplo",
        "mrcho",
        "mrret",
        "mrsal",
        "mrtns",
        "mtybd",
        "muc1",
        "qex",
        "sec1",
        ]
    if year == 2003:
        logement_menage_variables.extend(["hnph2", "ident", "lmlm", "mnatior", "typse"])
        logement_adresse_variables.extend(["iaat", "ident", "tu99"])
    if year < 2010 and year > 2005:
        logement_menage_variables.extend(["idlog", "mnatio"])
        logement_adresse_variables.extend(["idlog"])  # no typse in 2006
        logement_logement_variables = ["hnph2", "iaat", "idlog", "lmlm", "tu99"]  # no typse in 2006

    # Work on the housing survey
    # Household table
    if year == 2003:
        year_lgt = 2003
    if year > 2005 and year < 2010:
        year_lgt = 2006

    logement_survey_collection = SurveyCollection.load(collection = 'logement',
        config_files_directory = config_files_directory)
    logement_survey = logement_survey_collection.get_survey('logement_{}'.format(year_lgt))

    log.info("Preparing logement menage table")
    # Table name differs between survey vintages; fall back to the alternate name.
    try:
        logement_menage = logement_survey.get_values(
            table = "lgt_menage", variables = logement_menage_variables)
    except:
        logement_menage = logement_survey.get_values(
            table = "menage1", variables = logement_menage_variables)

    logement_menage.rename(columns = {'idlog': 'ident'}, inplace = True)

    # Missing income components are treated as zero before summing into revtot.
    logement_menage['mrcho'].fillna(0, inplace = True)
    logement_menage['mrret'].fillna(0, inplace = True)
    logement_menage['mrsal'].fillna(0, inplace = True)
    logement_menage['mrtns'].fillna(0, inplace = True)
    logement_menage['revtot'] = logement_menage['mrcho'] + logement_menage['mrret'] + logement_menage['mrsal'] + logement_menage['mrtns']
    # TODO: drop negative incomes? mrtns: 118 negative incomes out of 42845 in 2006
    assert logement_menage.revtot.notnull().all()
    # nvpr: income scaled by muc1 (presumably consumption units -- TODO confirm),
    # used below to compute weighted deciles.
    logement_menage['nvpr'] = 10.0 * logement_menage['revtot'] / logement_menage['muc1']

    assert logement_menage.qex.notnull().all()
    assert (logement_menage.qex > 0).all()
    # Weighted decile boundaries of nvpr, using the survey weights qex.
    dec, values = mark_weighted_percentiles(
        logement_menage['nvpr'].values,
        numpy.arange(1, 11),
        logement_menage['qex'].values,
        2,
        return_quantiles = True,
        )
    values.sort()
    # deci: 1 + number of decile boundaries below nvpr, i.e. the decile (1-10).
    logement_menage['deci'] = (
        1 +
        (logement_menage.nvpr > values[1]) +
        (logement_menage.nvpr > values[2]) +
        (logement_menage.nvpr > values[3]) +
        (logement_menage.nvpr > values[4]) +
        (logement_menage.nvpr > values[5]) +
        (logement_menage.nvpr > values[6]) +
        (logement_menage.nvpr > values[7]) +
        (logement_menage.nvpr > values[8]) +
        (logement_menage.nvpr > values[9])
        )
    del dec, values
    assert logement_menage['deci'].isin(range(1, 11)).all(), "Logement decile are out of range'"
    gc.collect()

    if year_lgt == 2006:
        # In the 2006 vintage the dwelling-level variables live in a separate table.
        log.info('Preparing logement logement table')
        try:
            lgtlgt = logement_survey.get_values(
                table = "lgt_logt", variables = logement_logement_variables)
        except:
            lgtlgt = logement_survey.get_values(
                table = "logement", variables = logement_logement_variables)
        lgtlgt.rename(columns = {'idlog': 'ident'}, inplace = True)
        logement_menage = logement_menage.merge(lgtlgt, left_on = 'ident', right_on = 'ident', how = 'inner')
        del lgtlgt

    # Keep only households with sec1 in {21, 22, 23, 24, 30}.
    data = logement_menage[logement_menage['sec1'].isin([21, 22, 23, 24, 30])]
    del logement_menage
    gc.collect()

    if year_lgt == 2006:
        data.rename(columns = {'mnatio': 'mnatior'}, inplace = True)

    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    # Recode sec1 into statut_occupation: 21/22/23 -> 3, 24 -> 4, 30 -> 5.
    data['tmp'] = data['sec1'].astype("int")
    data['tmp'][data['sec1'].isin([21, 22, 23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['statut_occupation'] = data['tmp']
    count_NA('statut_occupation', data)
    data = (data[data['statut_occupation'].notnull()])
    logement_menage = data

    # Address table
    log.info(u"Préparation de la table adresse de l'enquête logement")
    logement_adresse = logement_survey.get_values(table = "adresse", variables = logement_adresse_variables)
    logement_adresse.rename(columns = {'idlog': 'ident'}, inplace = True)

    log.info(u"Fusion des tables logement et ménage de l'enquête logement")
    Logement = logement_menage.merge(logement_adresse, on = 'ident', how = 'inner')

    # Clamp hnph2 to the 1-6 range.
    Logement.hnph2[Logement.hnph2 >= 6] = 6
    Logement.hnph2[Logement.hnph2 < 1] = 1
    count_NA('hnph2', Logement)
    assert Logement.hnph2.notnull().any(), "Some hnph2 are null"
    # Logement=(Logement[Logement['hnph2'].notnull()])  # commented out because hnph2 has 0 NA

    # Same "within" step here and below (cf. the R code).
    # TODO: issue here -- 07 codes become 7
    # because Python treats 0n literals as octal numbers ( < 08 ).
    # Hopefully this does not matter.
    # mnatior recoded to two categories: {0, 1} -> 1, {2..11} -> 2.
    Logement.mnatior[Logement['mnatior'].isin([0, 1])] = 1
    Logement.mnatior[Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11])] = 2
    count_NA('mnatior', Logement)
    assert_variable_in_range('mnatior', [1, 3], Logement)

    # iaat recoded into 6 construction-period categories (see year ranges below).
    Logement['iaat_bis'] = 0
    Logement.iaat_bis[Logement.iaat.isin([1, 2, 3, 4, 5])] = 1  # before 1967
    Logement.iaat_bis[Logement.iaat == 6] = 2  # 1968 - 1974
    Logement.iaat_bis[Logement.iaat == 7] = 3  # 1975 - 1981
    Logement.iaat_bis[Logement.iaat == 8] = 4  # 1982 - 1989
    Logement.iaat_bis[Logement.iaat == 9] = 5  # 1990 - 1998
    Logement.iaat_bis[Logement.iaat == 10] = 6  # after 1999
    assert Logement.iaat_bis.isin(range(1, 7)).all()

    # mdiplo recoded to 4 levels; NaN mapped to 0.
    Logement.mdiplo[Logement.mdiplo == 1] = 1
    Logement.mdiplo[Logement.mdiplo.isin([2, 3, 4])] = 2
    Logement.mdiplo[Logement.mdiplo.isin([5, 6, 7, 8])] = 3
    Logement.mdiplo[Logement.mdiplo == 9] = 4
    Logement.mdiplo[Logement.mdiplo.isnull()] = 0
    # TODO: assert Logement.mdiplo.isin(range(1, 5)).all()
    assert Logement.mdiplo.isin(range(0, 5)).all()
    Logement.mdiplo = Logement.mdiplo.astype('int')

    # mtybd recoded to 7 categories.
    Logement.mtybd[Logement['mtybd'] == 110] = 1
    Logement.mtybd[Logement['mtybd'] == 120] = 2
    Logement.mtybd[Logement['mtybd'] == 200] = 3
    Logement.mtybd[Logement['mtybd'].isin([311, 321, 401])] = 4
    Logement.mtybd[Logement['mtybd'].isin([312, 322, 402])] = 5
    Logement.mtybd[Logement['mtybd'].isin([313, 323, 403])] = 6
    Logement.mtybd[Logement['mtybd'] == 400] = 7
    assert Logement.mtybd.isin(range(1, 8)).all()
    Logement.mtybd = Logement.mtybd.astype('int')

    # tu99 recoded to 5 categories in tu99_recoded.
    Logement['tu99_recoded'] = Logement['tu99'].copy()
    count_NA('tu99', Logement)
    Logement.tu99_recoded[Logement['tu99'] == 0] = 1
    Logement.tu99_recoded[Logement['tu99'].isin([1, 2, 3])] = 2
    Logement.tu99_recoded[Logement['tu99'].isin([4, 5, 6])] = 3
    Logement.tu99_recoded[Logement['tu99'] == 7] = 4
    Logement.tu99_recoded[Logement['tu99'] == 8] = 5
    count_NA('tu99_recoded', Logement)
    assert_variable_in_range('tu99_recoded', [1, 6], Logement)

    # gzc2 recoded to 3 categories (recoded in place).
    Logement.gzc2[Logement['gzc2'] == 1] = 1
    Logement.gzc2[Logement['gzc2'].isin([2, 3, 4, 5, 6])] = 2
    Logement.gzc2[Logement['gzc2'] == 7] = 3
    count_NA('gzc2', Logement)
    # TODO: assert_variable_in_range('gzc2', [1, 4], Logement)

    # magtr recoded to 3 categories.
    Logement.magtr[Logement['magtr'].isin([1, 2])] = 1
    Logement.magtr[Logement['magtr'].isin([3, 4])] = 2
    Logement.magtr[Logement['magtr'] == 5] = 3
    assert Logement.magtr.isin(range(1, 4)).all()

    # mcs8 recoded to 5 categories.
    Logement['mcs8'][Logement['mcs8'] == 1] = 1
    Logement['mcs8'][Logement['mcs8'] == 2] = 2
    Logement['mcs8'][Logement['mcs8'] == 3] = 3
    Logement['mcs8'][Logement['mcs8'].isin([4, 8])] = 4
    Logement['mcs8'][Logement['mcs8'].isin([5, 6, 7])] = 5
    assert Logement.mcs8.isin(range(1, 6)).all()

    # Log of lmlm (computed but not kept in kept_variables below).
    Logement['logloy'] = numpy.log(Logement['lmlm'].values)

    kept_variables = [
        'deci',
        'hnph2',
        'iaat_bis',
        # 'ident',
        'lmlm',
        'statut_occupation',
        'magtr',
        'mcs8',
        'mdiplo',
        'mtybd',
        'qex',
        'tu99_recoded',
        ]

    logement = Logement[kept_variables].copy()
    # logement.rename(columns = {'qex': 'wprm'}, inplace = True)
    return logement
def build_imputation_loyers_proprietaires(year = None):
    """Build the imputed-rents table and merge it into the expenses dataframe.

    For 1995 the imputed rent comes from a hot-deck imputation (external Stata
    result or survey table); for 2000, 2005 and 2011 the INSEE-computed imputed
    rents available in the household table are used.  The result is stored as
    ``loyers_imputes_{year}`` and merged into ``depenses_{year}``, which is then
    saved as ``depenses_bdf_{year}`` in the module-level ``temporary_store``.

    :param year: Budget des Familles survey year (1995, 2000, 2005 or 2011); required.
    """
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    if year == 1995:
        imput00 = survey.get_values(table = "socioscm")
        # Keep only the observations whose quality is certified:
        #   exdep = 1 when the household expense data are correctly filled in
        #   exrev = 1 when the household income data are correctly filled in
        # (fix: the original applied this exact filter twice; once is enough).
        imput00 = imput00[(imput00.exdep == 1) & (imput00.exrev == 1)]
        kept_variables = ['mena', 'stalog', 'surfhab', 'confort1', 'confort2', 'confort3',
            'confort4', 'ancons', 'sitlog', 'nbphab', 'rg', 'cc']
        imput00 = imput00[kept_variables]
        imput00.rename(columns = {'mena': 'ident_men'}, inplace = True)
        # TODO: continue variable cleaning
        var_to_filnas = ['surfhab']
        for var_to_filna in var_to_filnas:
            imput00[var_to_filna] = imput00[var_to_filna].fillna(0)
        var_to_ints = ['sitlog', 'confort1', 'stalog', 'surfhab', 'ident_men', 'ancons', 'nbphab']
        for var_to_int in var_to_ints:
            imput00[var_to_int] = imput00[var_to_int].astype(int)

        depenses = temporary_store['depenses_{}'.format(year)]
        depenses.reset_index(inplace = True)
        depenses_small = depenses[['ident_men', '04110', 'pondmen']].copy()
        depenses_small.ident_men = depenses_small.ident_men.astype('int')
        imput00 = depenses_small.merge(imput00, on = 'ident_men').set_index('ident_men')
        imput00.rename(columns = {'04110': 'loyer_reel'}, inplace = True)

        # Indicator: the rent is known and the occupant is a tenant (stalog 3 or 4).
        imput00['observe'] = (imput00.loyer_reel > 0) & (imput00.stalog.isin([3, 4]))
        imput00['maison_appart'] = imput00.sitlog == 1
        # Dwelling-surface category, 1 to 8.
        imput00['catsurf'] = (
            1
            + (imput00.surfhab > 15)
            + (imput00.surfhab > 30)
            + (imput00.surfhab > 40)
            + (imput00.surfhab > 60)
            + (imput00.surfhab > 80)
            + (imput00.surfhab > 100)
            + (imput00.surfhab > 150)
            )
        assert imput00.catsurf.isin(range(1, 9)).all()
        # TODO: check what is done here, in particular the catsurf == 2 value
        # omitted in the original Stata code.
        # NOTE(review): 'maison' is not an existing column, so each statement
        # below binds a DataFrame *attribute* and overwrites the previous one --
        # only the last assignment survives, and 'maison' is never read again
        # in this function.  This looks like a bug (the four conditions were
        # probably meant to be combined); kept as-is to preserve behaviour.
        imput00.maison = 1 - ((imput00.cc == 5) & (imput00.catsurf == 1) & (imput00.maison_appart == 1))
        imput00.maison = 1 - ((imput00.cc == 5) & (imput00.catsurf == 3) & (imput00.maison_appart == 1))
        imput00.maison = 1 - ((imput00.cc == 5) & (imput00.catsurf == 8) & (imput00.maison_appart == 1))
        imput00.maison = 1 - ((imput00.cc == 4) & (imput00.catsurf == 1) & (imput00.maison_appart == 1))

        try:
            # TODO: hard-coded developer path; move to configuration.
            hotdeck = pandas.read_stata('/home/benjello/IPP/openfisca_france_indirect_taxation/hotdeck_result.dta')
        except Exception:
            hotdeck = survey.get_values(table = 'hotdeck_result')
        imput00.reset_index(inplace = True)
        hotdeck.ident_men = hotdeck.ident_men.astype('int')
        imput00 = imput00.merge(hotdeck, on = 'ident_men')
        # The imputed rent only applies where the real rent is not observed.
        imput00.loc[imput00.observe, 'loyer_impute'] = 0
        imput00.reset_index(inplace = True)
        loyers_imputes = imput00[['ident_men', 'loyer_impute', 'stalog', 'observe']].copy()
        assert loyers_imputes.loyer_impute.notnull().all()
        # No imputed rent for stalog in {1, 2, 5}.
        loyers_imputes.loc[loyers_imputes.stalog.isin([1, 2, 5]), 'loyer_impute'] = 0
        del loyers_imputes['stalog']
        del loyers_imputes['observe']
        # NOTE(review): 1995 uses COICOP code '0411' while later years use
        # '0421' -- confirm this difference is intentional.
        loyers_imputes.rename(columns = dict(loyer_impute = '0411'), inplace = True)

    # For BdF 2000 and 2005, use the imputed rents computed by INSEE.
    if year == 2000:
        # Keep the imputed rents (available in the household table).
        loyers_imputes = survey.get_values(table = "menage", variables = ['ident', 'rev81'])
        loyers_imputes.rename(
            columns = {
                'ident': 'ident_men',
                'rev81': '0421',
                },
            inplace = True,
            )

    if year == 2005:
        # Keep the imputed rents (available in the household table).
        loyers_imputes = survey.get_values(table = "menage")
        kept_variables = ['ident_men', 'rev801_d']
        loyers_imputes = loyers_imputes[kept_variables]
        loyers_imputes.rename(columns = {'rev801_d': '0421'}, inplace = True)

    if year == 2011:
        # Table name casing differs between survey dumps.
        try:
            loyers_imputes = survey.get_values(table = "MENAGE")
        except Exception:
            loyers_imputes = survey.get_values(table = "menage")
        kept_variables = ['ident_me', 'rev801']
        loyers_imputes = loyers_imputes[kept_variables]
        loyers_imputes.rename(columns = {'rev801': '0421', 'ident_me': 'ident_men'}, inplace = True)

    # Join with the by-COICOP expenses table.
    loyers_imputes.set_index('ident_men', inplace = True)
    temporary_store['loyers_imputes_{}'.format(year)] = loyers_imputes
    depenses = temporary_store['depenses_{}'.format(year)]
    depenses.index = depenses.index.astype('int64')
    loyers_imputes.index = loyers_imputes.index.astype('int64')
    assert set(depenses.index) == set(loyers_imputes.index)
    assert len(set(depenses.columns).intersection(set(loyers_imputes.columns))) == 0
    depenses = depenses.merge(loyers_imputes, left_index = True, right_index = True)

    # Save the homogenised expense table in the temporary store.
    temporary_store['depenses_bdf_{}'.format(year)] = depenses
def create_from(ctx, directory_path, collection_name = None, survey_name = None):
    """Interactively create or extend a survey collection from a data directory.

    Scans ``directory_path`` for SAS and Stata files, creates (or loads) the
    JSON collection ``collection_name``, registers the survey ``survey_name``
    in it, optionally fills the survey HDF5 file from the discovered data
    files, and finally rewrites the configuration file.

    :param ctx: click context; ``ctx.obj['CONFIG_FILE']`` is the config file path.
    :param directory_path: directory scanned for data files.
    :param collection_name: collection to create or extend (prompted when None).
    :param survey_name: survey to add (prompted when falsy).
    """
    parser = SafeConfigParser()
    parser.read(ctx.obj['CONFIG_FILE'])
    # NOTE(review): relies on ConfigParser internals ('_sections' and its
    # '__name__' entry, a Python 2 artifact) -- confirm the target Python version.
    collection_names = [option for option in parser._sections['collections'].keys()]
    collection_names.remove('__name__')
    collections_directory = parser.get('collections', 'collections_directory')
    collection_names.remove('collections_directory')

    data_file_by_format = create_data_file_by_format(directory_path)
    sas_files = data_file_by_format['sas']
    stata_files = data_file_by_format['stata']

    click.confirm(u"Create a new survey using this information ?", abort = False, default = True)

    if collection_name not in collection_names:
        # Unknown collection: create it, prompting for a name when none was given.
        if collection_name is None:
            click.confirm(u"Create a new collection ?", abort = False, default = True)
            collection_name = click.prompt("Name of the new collection")
        collection_json_path = os.path.join(collections_directory, collection_name + ".json")
        click.confirm(u"Create a collection {} ?".format(collection_name), abort = False, default = True)
        if os.path.isfile(collection_json_path):
            click.confirm(
                u"Erase existing {} collection file ?".format(collection_json_path),
                abort = False,
                default = True)
            os.remove(collection_json_path)
        survey_collection = create_collection(collection_name)
    else:
        # Existing collection: load it and add the survey to it.
        click.echo(u"The new survey is being add to the existing collection {} ".format(collection_name))
        collection_json_path = os.path.join(collections_directory, collection_name + ".json")
        survey_collection = SurveyCollection.load(collection_json_path)
        if survey_name is not None:
            click.echo(u"The survey {} is being add to the existing collection {} ".format(
                survey_name, collection_name))

    if not survey_name:
        survey_name = click.prompt('Enter a name for the survey in collection {}'.format(survey_collection.name))

    add_survey_to_collection(
        survey_name = survey_name,
        survey_collection = survey_collection,
        sas_files = sas_files,
        stata_files = stata_files,
        )
    survey_collection.dump(json_file_path = collection_json_path)

    # .items() instead of the Python-2-only .iteritems(), for consistency with
    # the Python 3 constructs used elsewhere in this module.
    for format_extension, data_files in data_file_by_format.items():
        if data_files != []:
            to_print = yaml.safe_dump(data_files, default_flow_style = False)
            click.echo("Here are the {} files: \n {}".format(format_extension, to_print))
            # Fix: 'default = False' was previously passed to str.format()
            # (where it was silently swallowed) instead of click.confirm().
            if click.confirm(
                    'Do you want to fill the {} HDF5 file using the {} files ?'.format(
                        survey_name, format_extension),
                    default = False):
                survey_collection.fill_hdf(source_format = format_extension, overwrite = True)
        else:
            click.echo("There are no {} files".format(format_extension))

    survey_collection.dump()
    # 'with' guarantees the config file is closed even if parser.write fails.
    with open(ctx.obj['CONFIG_FILE'], 'w') as config_file:
        parser.write(config_file)
data_frame = data_frame.query('zeat != 0') try: data_frame.reset_index(inplace = True) except ValueError, e: log.info('ignoring reset_index because {}'.format(e)) # Remove duplicated colums causing bug with HDFStore # according to https://github.com/pydata/pandas/issues/6240 # using solution form stackoverflow # http://stackoverflow.com/questions/16938441/how-to-remove-duplicate-columns-from-a-dataframe-using-python-pandas data_frame = data_frame.T.groupby(level = 0).first().T log.info('Saving the openfisca indirect taxation input dataframe') try: openfisca_survey_collection = SurveyCollection.load( collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory) except: openfisca_survey_collection = SurveyCollection( name = 'openfisca_indirect_taxation', config_files_directory = config_files_directory) output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory') survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage) table = "input" hdf5_file_path = os.path.join(output_data_directory, "{}.h5".format(survey_name)) survey = Survey( name = survey_name, hdf5_file_path = hdf5_file_path, ) survey.insert_table(name = table, data_frame = data_frame) openfisca_survey_collection.surveys.append(survey) openfisca_survey_collection.dump()
def foyer_all(temporary_store=None, year=None):
    """Extract and individualise the tax-return ("foyer") boxes of the ERFS survey.

    Reads the foyer table for ``year``, keeps the tax-form box columns,
    aggregates multiple declarations per individual, spreads the
    individualisable boxes over the five declaration positions
    (vous, conj, pac1, pac2, pac3) and stores the result as
    ``foy_ind_{year}`` in ``temporary_store``.
    Also stores ``ind_vars_to_remove_{year}`` (the kept box names).
    """
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Load the tax-return boxes.
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_all = data.get_values(table=year_specific_by_generic["foyer"])
    # Keep only the tax-form boxes ('_xzz', i.e. matching ^_[0-9][a-z]{2}).
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_all.columns if regex.match(x)]
    # Rename the boxes to fxzz (^f[0-9][a-z]{2}).
    renamed_variables = ["f{}".format(x[1:]) for x in variables]

    foyer = foyer_all[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_all
    gc.collect()
    foyer.rename(columns=dict(zip(variables, renamed_variables)), inplace=True)
    # Aggregate the declarations when an individual filed several of them.
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)

    # Individualisable variables: each entry maps an individual-level variable
    # to the form boxes of the (up to five) declaration positions.
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'hsup': ['f1au', 'f1bu', 'f1cu', 'f1du', 'f1eu'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (moving) only in 2006
        }

    # Add the section 6, 7 and 8 boxes of the CERFA form.
    cases_f6_f7_f8 = build_cerfa_fields_by_column_name(
        year=year, sections_cerfa=[6, 7, 8])
    var_dict.update(cases_f6_f7_f8)

    vars_sets = [set(var_list) for var_list in var_dict.values()]
    # Boxes actually present in this year's foyer table.
    eligible_vars = (set().union(*vars_sets)).intersection(
        set(list(foyer.columns)))
    log.info(u"From {} variables, we keep {} eligibles variables".format(
        len(set().union(*vars_sets)),
        len(eligible_vars),
        ))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']

    # err = 0
    # err_vars = {}

    foy_ind = DataFrame()
    # Python-2-only iteration; the rest of the pipeline still targets Python 2.
    for individual_var, foyer_vars in var_dict.iteritems():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyers_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            else:
                # Shrink the list
                foyer_vars_cleaned = [
                    var for var, present in zip(foyer_vars, presence) if present is True
                    ]
                selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()
        # Reshape the dataframe: one row per (declaration position, individual).
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # A Series cannot see its index resetted to produce a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        # Drop zero amounts.
        selection = selection[selection[individual_var] != 0].copy()

        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')

    foy_ind.reset_index(inplace=True)
    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)
    print_id(foy_ind)

    # Map the declaration positions to integer codes 0-4.
    # NOTE(review): chained .loc assignment -- relies on old-pandas behaviour.
    foy_ind.quifoy.loc[foy_ind.quifoy == 'vous'] = 0
    foy_ind.quifoy.loc[foy_ind.quifoy == 'conj'] = 1
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac1'] = 2
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac2'] = 3
    foy_ind.quifoy.loc[foy_ind.quifoy == 'pac3'] = 4

    assert foy_ind.quifoy.isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind
    return
def sif(temporary_store=None, year=None):
    """Extract and decode the fixed-width `sif` code of the ERFS foyer table.

    The `sif` string packs the tax-household description: marital status,
    birth years of the declarants, check-boxes (E, F, G, K, L, N, P, S, T, W),
    life-event markers X/Y/Z with their dates, and dependant counters
    (nbF, nbG, ...).  Character positions shift across survey years; the
    offset `d` absorbs that shift.  The decoded dataframe is saved in
    `temporary_store` under 'sif_{year}'.

    Parameters
    ----------
    temporary_store : dict-like store receiving the result (required)
    year : int, ERFS survey year (required)
    """
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    erfs_survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info("05_foyer: extraction des données foyer")
    # TODO: how to choose the n-2 reference fiscal income (rfr) for the
    # "taxe d'habitation" exemption?
    #   mnrvka: revenu TH n-2
    #   mnrvkh: revenu TH (revenu fiscal de référence)
    # Retrieve the columns carrying the sif code
    sif = erfs_survey.get_values(variables=[
        "noindiv", 'sif', "nbptr", "mnrvka", "rbg", "tsrvbg", "declar"
        ], table=year_specific_by_generic["foyer"])
    sif['statmarit'] = 0
    if year == 2009:
        # Repair two malformed 2009 sif strings (one extra / one missing character)
        old_sif = sif['sif'][sif['noindiv'] == 901803201].copy()
        new_sif = old_sif.str[0:59] + old_sif.str[60:] + "0"
        sif.loc[sif['noindiv'] == 901803201, 'sif'] = new_sif.values
        old_sif = sif.sif.loc[sif['noindiv'] == 900872201]
        new_sif = old_sif.str[0:58] + " " + old_sif.str[58:]
        sif.loc[sif['noindiv'] == 900872201, 'sif'] = new_sif.values
        del old_sif, new_sif
    # Apply the sign flag tsrvbg ('+'/'-') to the gross global income rbg
    sif["rbg"] = sif["rbg"] * ((sif["tsrvbg"] == '+').astype(int) - (sif["tsrvbg"] == '-').astype(int))
    sif["stamar"] = sif.sif.str[4:5]
    # Converting marital status letter into its numeric code.
    # FIX: plain `.loc[row, col]` assignment instead of the original chained
    # `sif.statmarit.loc[...] = val` form (SettingWithCopy-prone).
    statmarit_dict = {"M": 1, "C": 2, "D": 3, "V": 4, "O": 5}
    for key, val in statmarit_dict.iteritems():
        sif.loc[sif.stamar == key, 'statmarit'] = val
    sif["birthvous"] = sif.sif.str[5:9]
    sif["birthconj"] = sif.sif.str[10:14]
    sif["caseE"] = sif.sif.str[15:16] == "E"
    sif["caseF"] = sif.sif.str[16:17] == "F"
    sif["caseG"] = sif.sif.str[17:18] == "G"
    sif["caseK"] = sif.sif.str[18:19] == "K"
    # Year-dependent offset applied to all following positions
    d = 0
    if year in [2006, 2007]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        sif["caseH"] = sif.sif.str[24:28]
        sif["caseT"] = sif.sif.str[28:29] == "T"
    if year in [2008]:
        d = -1  # end of the "case L" (box absent this year)
        sif["caseP"] = sif.sif.str[20 + d:21 + d] == "P"
        sif["caseS"] = sif.sif.str[21 + d:22 + d] == "S"
        sif["caseW"] = sif.sif.str[22 + d:23 + d] == "W"
        sif["caseN"] = sif.sif.str[23 + d:24 + d] == "N"
        sif["caseH"] = sif.sif.str[24 + d:28 + d]
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"
    if year in [2009]:
        sif["caseL"] = sif.sif.str[19:20] == "L"
        sif["caseP"] = sif.sif.str[20:21] == "P"
        sif["caseS"] = sif.sif.str[21:22] == "S"
        sif["caseW"] = sif.sif.str[22:23] == "W"
        sif["caseN"] = sif.sif.str[23:24] == "N"
        # caseH dropped compared with 2008 (but the L box is back),
        # hence the shift relative to 2006
        d = -4
        sif["caseT"] = sif.sif.str[28 + d:29 + d] == "T"
    # Life events X/Y/Z (marriage, death, ...) and their dates
    sif["caseX"] = sif.sif.str[33 + d:34 + d] == "X"
    sif["dateX"] = sif.sif.str[34 + d:42 + d]
    sif["caseY"] = sif.sif.str[42 + d:43 + d] == "Y"
    sif["dateY"] = sif.sif.str[43 + d:51 + d]
    sif["caseZ"] = sif.sif.str[51 + d:52 + d] == "Z"
    sif["dateZ"] = sif.sif.str[52 + d:60 + d]
    sif["causeXYZ"] = sif.sif.str[60 + d:61 + d]
    # TODO: convert dateXYZ to appropriate date in pandas
    sif["nbptr"] = sif.nbptr.values / 100  # fiscal shares are stored x100
    sif["rfr_n_2"] = sif.mnrvka.values
    # Dependant counters
    sif["nbF"] = sif.sif.str[64 + d:66 + d]
    sif["nbG"] = sif.sif.str[67 + d:69 + d]
    sif["nbR"] = sif.sif.str[70 + d:72 + d]
    sif["nbJ"] = sif.sif.str[73 + d:75 + d]
    sif["nbN"] = sif.sif.str[76 + d:78 + d]
    sif["nbH"] = sif.sif.str[79 + d:81 + d]
    sif["nbI"] = sif.sif.str[82 + d:84 + d]
    if (year != 2009):
        sif["nbP"] = sif.sif.str[85 + d:87 + d]
    del sif["stamar"]
    # Mark individuals appearing in several declarations; 'change' keeps the
    # 28th character of their declar code.  (An unused temporary `x` holding
    # the duplicated rows was removed.)
    duplicated_noindiv = sif.noindiv[sif.noindiv.duplicated()].copy()
    sif['duplicated_noindiv'] = sif.noindiv.isin(duplicated_noindiv)
    sif['change'] = "NONE"
    sif.loc[sif.duplicated_noindiv, 'change'] = sif.loc[sif.duplicated_noindiv, 'declar'].str[27:28]
    log.info("Number of individuals: {}".format(len(sif.noindiv)))
    log.info("Number of duplicated individuals: {}".format(
        len(duplicated_noindiv)))
    log.info("Number of distinct individuals: {}".format(
        len(sif.noindiv.value_counts())))
    log.info(u"Saving sif")
    temporary_store['sif_{}'.format(year)] = sif
    del sif
    gc.collect()
def build_imputation_loyers_proprietaires(year = None):
    """Impute rents ("loyers imputés") for owner-occupier households.

    For 1995 the imputation follows a Stata hot-deck procedure (spec kept in
    comments below); for 2000, 2005 and 2011 the INSEE pre-computed imputed
    rent variable is used.  The imputed rents are merged into the expenses
    table and the result is saved under 'depenses_bdf_{year}' in the
    module-level `temporary_store`.
    """
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    if year == 1995:
        imput00 = survey.get_values(table = "socioscm")
        # Keep households flagged as having usable expenses and income
        imput00 = imput00[(imput00.exdep == 1) & (imput00.exrev == 1)]
        kept_variables = ['mena', 'stalog', 'surfhab', 'confort1', 'confort2', 'confort3', 'confort4',
            'ancons', 'sitlog', 'nbphab', 'rg', 'cc']
        imput00 = imput00[kept_variables]
        imput00.rename(columns = {'mena': 'ident_men'}, inplace = True)
        # TODO: continue variable cleaning
        var_to_filnas = ['surfhab']
        for var_to_filna in var_to_filnas:
            imput00[var_to_filna] = imput00[var_to_filna].fillna(0)
        var_to_ints = ['sitlog', 'confort1', 'stalog', 'surfhab', 'ident_men', 'ancons', 'nbphab']
        for var_to_int in var_to_ints:
            imput00[var_to_int] = imput00[var_to_int].astype(int)
        # Attach the observed rent (item 04110 of the expenses table)
        depenses = temporary_store['depenses_{}'.format(year)]
        depenses.reset_index(inplace = True)
        depenses_small = depenses[['ident_men', '04110', 'pondmen']]
        imput00 = depenses_small.merge(imput00, on = 'ident_men').set_index('ident_men')
        imput00.rename(columns = {'04110': 'loyer_reel'}, inplace = True)
        # Original Stata specification:
        #   gen observe = (loyer_reel != . & inlist(STALOG,"3","4"))
        #   gen maison_appart = (SITLOG == "1")
        #   gen/replace catsurf = 1..8 according to SURF brackets
        #   replace maison = 0 if CC == "5" & catsurf in (1, 3, 8) & maison == 1
        #   replace maison = 0 if CC == "4" & catsurf == 1 & maison == 1
        # `observe`: rent is known and the occupant is a tenant
        imput00['observe'] = (imput00.loyer_reel > 0) & (imput00.stalog.isin([3, 4]))
        imput00['maison_appart'] = imput00.sitlog == 1
        # BUG FIX: the original rebound `imput00.catsurf` wholesale at each
        # step, so only the last bracket (surfhab > 150) survived.  Each
        # surface bracket is now assigned on its own rows.
        imput00['catsurf'] = 1 * (imput00.surfhab < 16)
        imput00.loc[(imput00.surfhab > 15) & (imput00.surfhab < 31), 'catsurf'] = 1
        imput00.loc[(imput00.surfhab > 30) & (imput00.surfhab < 41), 'catsurf'] = 3
        imput00.loc[(imput00.surfhab > 40) & (imput00.surfhab < 61), 'catsurf'] = 4
        imput00.loc[(imput00.surfhab > 60) & (imput00.surfhab < 81), 'catsurf'] = 5
        imput00.loc[(imput00.surfhab > 80) & (imput00.surfhab < 101), 'catsurf'] = 6
        imput00.loc[(imput00.surfhab > 100) & (imput00.surfhab < 151), 'catsurf'] = 7
        imput00.loc[imput00.surfhab > 150, 'catsurf'] = 8
        # BUG FIX: `imput00.maison = ...` on a non-existent column only set an
        # instance attribute, and every statement clobbered the previous one.
        # Per the Stata spec: start from maison_appart, zero out the listed
        # (cc, catsurf) combinations.
        imput00['maison'] = 1 * imput00.maison_appart
        imput00.loc[(imput00.cc == 5) & (imput00.catsurf == 1) & (imput00.maison == 1), 'maison'] = 0
        imput00.loc[(imput00.cc == 5) & (imput00.catsurf == 3) & (imput00.maison == 1), 'maison'] = 0
        imput00.loc[(imput00.cc == 5) & (imput00.catsurf == 8) & (imput00.maison == 1), 'maison'] = 0
        imput00.loc[(imput00.cc == 4) & (imput00.catsurf == 1) & (imput00.maison == 1), 'maison'] = 0
        # Merge the hot-deck imputation results
        hotdeck = survey.get_values(table = 'hotdeck_result')
        kept_variables = ['ident_men', 'loyer_impute']
        hotdeck = hotdeck[kept_variables]
        imput00.reset_index(inplace = True)
        imput00 = imput00.merge(hotdeck, on = 'ident_men').set_index('ident_men')
        # Stata: replace loyer_imput = 0 if observe == 1
        imput00.loc[imput00.observe == 1, 'loyer_impute'] = 0
        imput00['imputation'] = imput00.observe == 0  # True when the rent was imputed
        imput00.reset_index(inplace = True)
        loyers_imputes = imput00[['ident_men', 'loyer_impute']].copy()
        # NOTE: no set_index here — the common tail below indexes on ident_men;
        # the original indexed twice, which would raise for 1995.
        # NOTE(review): `coicop_data_frame` and `poids` are not defined in this
        # function nor visible here — confirm where they are meant to come from.
        depenses = coicop_data_frame.merge(poids, left_index = True, right_index = True)
        # Stata: replace depense = 0 if posteCOICOP == "0411" & inlist(stalog,"1","2","5") ...
        # BUG FIX: parenthesise the comparisons — `&` binds tighter than
        # `==`/`>` in Python, so the original expressions raised at runtime;
        # also fixed the `depense.posteCOICOP` typo (undefined name).
        depenses.loc[
            (depenses.posteCOICOP == "0411") & depenses.stalog.isin([1, 2, 5])
            & (depenses.depense > 0) & (depenses.depense != '.'),
            'depense'
            ] = 0
        depenses.loc[
            (depenses.posteCOICOP == "0411") & depenses.stalog.isin([1, 2, 5])
            & (depenses.depense == '.'),
            'depense'
            ] = 0
        # Stata: replace depense = loyer_imp if posteCOICOP == "0421" & observe == 0
        #        replace depense = 0 if posteCOICOP == "0421" & observe == 1 & depense == .
        depenses.loc[(depenses.posteCOICOP == "0421") & (depenses.observe == 0), 'depense'] = \
            depenses['loyer_impute']
        depenses.loc[
            (depenses.posteCOICOP == "0421") & (depenses.observe == 1) & (depenses.depense == '.'),
            'depense'
            ] = 0

    # For BdF 2000, 2005 and 2011 the INSEE-computed imputed rents are used
    # (Stata: keep IDENT REV81 / rev801_d / rev801, posteCOICOP "0421")
    if year == 2000:
        loyers_imputes = survey.get_values(table = "menage", variables = ['ident', 'rev81'])
        loyers_imputes.rename(
            columns = {
                'ident': 'ident_men',
                'rev81': '0421',
                },
            inplace = True,
            )
        # NOTE(review): this load is overwritten by the temporary-store read
        # below — confirm whether it can be dropped.
        depenses = survey.get_values(table = 'depmen')

    if year == 2005:
        loyers_imputes = survey.get_values(table = "menage")
        kept_variables = ['ident_men', 'rev801_d']
        loyers_imputes = loyers_imputes[kept_variables]
        loyers_imputes.rename(columns = {'rev801_d': '0421'}, inplace = True)

    if year == 2011:
        loyers_imputes = survey.get_values(table = "menage")
        kept_variables = ['ident_me', 'rev801']
        loyers_imputes = loyers_imputes[kept_variables]
        loyers_imputes.rename(columns = {'rev801': '0421', 'ident_me': 'ident_men'}, inplace = True)

    # Join with the expenses-by-COICOP table and save the updated expenses
    loyers_imputes.set_index('ident_men', inplace = True)
    temporary_store['loyers_imputes_{}'.format(year)] = loyers_imputes
    depenses = temporary_store['depenses_{}'.format(year)]
    depenses = depenses.merge(loyers_imputes, left_index = True, right_index = True)
    # Save in temporary store
    temporary_store['depenses_bdf_{}'.format(year)] = depenses
def merge_tables(temporary_store = None, year = None):
    """Create and store the merged household and individual tables.

    Merges the fiscal source (ERF) with the labour-force survey (EEC) at the
    household and individual level, adds derived variables, and stores the
    results under 'menagem_{year}' and 'indivim_{year}'.

    Parameters
    ----------
    temporary_store : dict-like store receiving the results (required)
    year : int, ERFS survey year (required)
    """
    assert temporary_store is not None
    assert year is not None
    # Load the four source tables
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table = year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table = year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table = year_specific_by_generic["eec_indivi"])
    # Consistency between the two sources: EEC households/individuals with no
    # match in the ERF.
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()
    # BUG FIX: the original filtered `eecmen` with a boolean mask computed on
    # `eecind` (wrong frame, mismatched length); unmatched *individuals* must
    # be selected from `eecind`.
    noappar_i = eecind[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset = 'ident', take_last = True)
    # TODO: check that there should theoretically be no duplicates
    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()
    # Merge the labour-force survey with the fiscal source
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on = ['noindiv', 'ident', 'noi'], how = "inner")
    var_list = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'otitc',
        'txtppb',
        ]
    check_integer_dtype(indivim, var_list)
    create_actrec_variable(indivim)
    create_variable_locataire(menagem)
    # Bring the reference person's (lpr == 1) diploma onto the household table
    menagem = menagem.merge(
        indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
        )
    manually_remove_noindiv_errors(indivim)
    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def create_enfants_a_naitre(temporary_store = None, year = None):
    '''Build and store the table of children yet to be born ("enfants à naître").

    Loads the three complementary EEC tables, stacks them with outer merges,
    fills in the default individual variables expected downstream, and keeps
    only the children born at the very end of `year` (October onwards) or the
    beginning of `year + 1` (up to May).  The result is stored under
    'enfants_a_naitre_{year}'.
    '''
    assert temporary_store is not None
    assert year is not None
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    # Children to be born (NN stands for newborns)
    individual_vars = [
        'acteu', 'agepr', 'cohab', 'contra', 'forter', 'ident', 'lien', 'lpr',
        'mrec', 'naia', 'naim', 'noi', 'noicon', 'noimer', 'noindiv', 'noiper',
        'retrai', 'rga', 'rstg', 'sexe', 'stc', 'titc',
        ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    # Load and stack the three complementary EEC tables
    eeccmp1, eeccmp2, eeccmp3 = [
        survey.get_values(table = year_specific_by_generic[table_key], variables = individual_vars)
        for table_key in ("eec_cmp_1", "eec_cmp_2", "eec_cmp_3")
        ]
    enfants_a_naitre = eeccmp1.merge(eeccmp2, how = "outer").merge(eeccmp3, how = "outer")
    # optimisation of dtypes? existence check in passing — why not ints where
    # possible.  TODO: minimal dtype; TODO: shouldn't be here
    for var in individual_vars:
        assert_dtype(enfants_a_naitre[var], 'float')
    del eeccmp1, eeccmp2, eeccmp3, individual_vars
    gc.collect()
    # Default values for the variables expected downstream
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    # TODO: should be an integer but NaN are present
    enfants_a_naitre.year = enfants_a_naitre.year.astype("float32")
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    # Age correction for children born in July or later
    enfants_a_naitre.loc[enfants_a_naitre.naim >= 7, 'agepf'] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""
    # TODO: deal with agepf
    for series_name in ('actrec', 'noidec', 'ztsai'):
        assert_dtype(enfants_a_naitre[series_name], "int")
    # Selection: birth at the end of `year` or at the start of `year + 1`
    born_end_of_year = (
        (enfants_a_naitre.naia == enfants_a_naitre.year)
        & (enfants_a_naitre.naim >= 10)
        )
    born_early_next_year = (
        (enfants_a_naitre.naia == enfants_a_naitre.year + 1)
        & (enfants_a_naitre.naim <= 5)
        )
    enfants_a_naitre = enfants_a_naitre[born_end_of_year | born_early_next_year].copy()
    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
def create_fip(temporary_store = None, year = None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'.

    fip stands for "fichier d'imposition des personnes" (personal tax file).
    Some individuals are declared as 'personne à charge' (pac, dependant) on
    'tax forms' but are not present in the erf or eec tables.  We add them to
    ensure consistency between concepts.

    NOTE(review): this code relies on Python 2 / legacy pandas behaviour
    (`zip` returning a list assigned to columns, `DataFrame.sort(columns=)`,
    `duplicated(take_last=)`) — confirm the runtime before porting.
    """
    assert temporary_store is not None
    assert year is not None
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info(u"Démarrage de 03_fip")
    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)
    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac is coded on 5 characters (1 letter + 4-digit birth year), so the
    # longest anaisenf string gives the maximum number of pac per foyer.
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))
    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.
    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list
    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # Placeholder frame (one (declaration, type_pac, naia) triple per possible
    # pac slot); every cell is overwritten in the loop below.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    # Slice the anaisenf string into one 5-character chunk per pac slot.
    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values
    # One row per (declaration, pac slot)
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)
    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"
    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])
    # Step 1.3: drop the F individuals for which a G individual exists
    # (same declaration and same birth year), keeping twins.
    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filter to work on F & G only
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep those with distinct declar/naia pairs and the twins,
    # then remove the others (both F and G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    # Step 1.4: same treatment for the H/I pairs.
    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values
    fip.update(type_HI)
    # Rows not touched by either update (types J, N, R) are kept.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    # # control(indivifip, debug=True)
    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"
    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Match keys: (birth year, truncated declaration string).
    # NOTE(review): relies on Python 2 `zip` returning a list of tuples.
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)
    # fip = pac declared on tax forms but not found in the eec individuals
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()
    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))
    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))
    del pac_ind1['key1'], pac_ind2['key2']
    # Combine the matches found through declar1 and declar2
    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))
    log.info(u" 2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))
    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))
    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv
    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()
    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy
    # Recover, from the eec individuals, the declarant ("vous") of each fip
    # declaration to inherit its household identifiers.
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u" 2.3 : fip1 created")
    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()
    fip = concat([fip1, fip2])
    # Default individual variables for the fip rows
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    # lpr: 3 for children up to 20, 4 above
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    # actrec: 9 up to 15 years old, 5 above
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5
    # TODO: probleme actrec of fip children between 16 and 20: we do not know whether they are students or employed */
    # TODO: problem with the birth months of FIP children: check whether these values can be recovered: Alexis: clearly not
    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")
    fip_tmp = fip[['noi', 'ident']]
    # Decrement noi of duplicated (noi, ident) pairs until all are unique
    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0
    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
def foyer_all(year):
    """Extract and individualize the déclaration (tax-form) cells of the ERFS foyer table.

    Keeps the raw cells (columns matching ``^_[0-9][a-z]{2}``), renames them to
    ``fxzz``, aggregates multiple déclarations per individual, then reshapes the
    per-person cells (vous, conj, pac1..pac3) into a long table keyed by
    (idfoy, quifoy).  Results go to the ERFS temporary store:

    - ``foy_ind_{year}``: individualized déclaration variables;
    - ``ind_vars_to_remove_{year}``: the raw cell names that were individualized.

    :param year: ERFS survey year, used to locate tables and key the stores.
    :return: None (output is written to the temporary store).
    """
    # BUG fixed: a debug leftover ``year = 2009`` used to clobber the caller's
    # argument here; the parameter is now honored.
    replace = create_replace(year)
    temporary_store = TemporaryStore.create(file_name="erfs")
    # Load the foyer table holding the déclaration cells
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    foyer_raw = data.get_values(table=replace["foyer"])  # renamed: used to shadow this function's name
    # Keep only the déclaration cells ('_xzz', i.e. ^_[0-9][a-z]{2})
    regex = re.compile("^_[0-9][a-z]{2}")
    variables = [x for x in foyer_raw.columns if regex.match(x)]
    # Rename the cells to fxzz (^f[0-9][a-z]{2})
    renamed_variables = ["f{}".format(x[1:]) for x in variables]
    foyer = foyer_raw[variables + ["noindiv"]].copy()  # Memory expensive ...
    del foyer_raw
    gc.collect()
    foyer.rename(columns=dict(zip(variables, renamed_variables)), inplace=True)
    # Aggregate the déclarations when an individual filed several of them
    foyer = foyer.groupby("noindiv", as_index=False).aggregate(numpy.sum)
    print_id(foyer)
    # Individualizable cells: individual variable -> per-person columns,
    # ordered as (vous, conj, pac1, pac2, pac3)
    var_dict = {
        'sali': ['f1aj', 'f1bj', 'f1cj', 'f1dj', 'f1ej'],
        'choi': ['f1ap', 'f1bp', 'f1cp', 'f1dp', 'f1ep'],
        'fra': ['f1ak', 'f1bk', 'f1ck', 'f1dk', 'f1ek'],
        'cho_ld': ['f1ai', 'f1bi', 'f1ci', 'f1di', 'f1ei'],
        'ppe_tp_sa': ['f1ax', 'f1bx', 'f1cx', 'f1dx', 'f1qx'],
        'ppe_du_sa': ['f1av', 'f1bv', 'f1cv', 'f1dv', 'f1qv'],
        'rsti': ['f1as', 'f1bs', 'f1cs', 'f1ds', 'f1es'],
        'alr': ['f1ao', 'f1bo', 'f1co', 'f1do', 'f1eo'],
        'f1tv': ['f1tv', 'f1uv'],
        'f1tw': ['f1tw', 'f1uw'],
        'f1tx': ['f1tx', 'f1ux'],
        'ppe_tp_ns': ['f5nw', 'f5ow', 'f5pw'],
        'ppe_du_ns': ['f5nv', 'f5ov', 'f5pv'],
        'frag_exon': ['f5hn', 'f5in', 'f5jn'],
        'frag_impo': ['f5ho', 'f5io', 'f5jo'],
        'arag_exon': ['f5hb', 'f5ib', 'f5jb'],
        'arag_impg': ['f5hc', 'f5ic', 'f5jc'],
        'arag_defi': ['f5hf', 'f5if', 'f5jf'],
        'nrag_exon': ['f5hh', 'f5ih', 'f5jh'],
        'nrag_impg': ['f5hi', 'f5ii', 'f5ji'],
        'nrag_defi': ['f5hl', 'f5il', 'f5jl'],
        'nrag_ajag': ['f5hm', 'f5im', 'f5jm'],
        'mbic_exon': ['f5kn', 'f5ln', 'f5mn'],
        'abic_exon': ['f5kb', 'f5lb', 'f5mb'],
        'nbic_exon': ['f5kh', 'f5lh', 'f5mh'],
        'mbic_impv': ['f5ko', 'f5lo', 'f5mo'],
        'mbic_imps': ['f5kp', 'f5lp', 'f5mp'],
        'abic_impn': ['f5kc', 'f5lc', 'f5mc'],
        'abic_imps': ['f5kd', 'f5ld', 'f5md'],
        'nbic_impn': ['f5ki', 'f5li', 'f5mi'],
        'nbic_imps': ['f5kj', 'f5lj', 'f5mj'],
        'abic_defn': ['f5kf', 'f5lf', 'f5mf'],
        'abic_defs': ['f5kg', 'f5lg', 'f5mg'],
        'nbic_defn': ['f5kl', 'f5ll', 'f5ml'],
        'nbic_defs': ['f5km', 'f5lm', 'f5mm'],
        'nbic_apch': ['f5ks', 'f5ls', 'f5ms'],
        'macc_exon': ['f5nn', 'f5on', 'f5pn'],
        'aacc_exon': ['f5nb', 'f5ob', 'f5pb'],
        'nacc_exon': ['f5nh', 'f5oh', 'f5ph'],
        'macc_impv': ['f5no', 'f5oo', 'f5po'],
        'macc_imps': ['f5np', 'f5op', 'f5pp'],
        'aacc_impn': ['f5nc', 'f5oc', 'f5pc'],
        'aacc_imps': ['f5nd', 'f5od', 'f5pd'],
        'aacc_defn': ['f5nf', 'f5of', 'f5pf'],
        'aacc_defs': ['f5ng', 'f5og', 'f5pg'],
        'nacc_impn': ['f5ni', 'f5oi', 'f5pi'],
        'nacc_imps': ['f5nj', 'f5oj', 'f5pj'],
        'nacc_defn': ['f5nl', 'f5ol', 'f5pl'],
        'nacc_defs': ['f5nm', 'f5om', 'f5pm'],
        'mncn_impo': ['f5ku', 'f5lu', 'f5mu'],
        'cncn_bene': ['f5sn', 'f5ns', 'f5os'],
        'cncn_defi': ['f5sp', 'f5nu', 'f5ou', 'f5sr'],  # TODO: check
        'mbnc_exon': ['f5hp', 'f5ip', 'f5jp'],
        'abnc_exon': ['f5qb', 'f5rb', 'f5sb'],
        'nbnc_exon': ['f5qh', 'f5rh', 'f5sh'],
        'mbnc_impo': ['f5hq', 'f5iq', 'f5jq'],
        'abnc_impo': ['f5qc', 'f5rc', 'f5sc'],
        'abnc_defi': ['f5qe', 'f5re', 'f5se'],
        'nbnc_impo': ['f5qi', 'f5ri', 'f5si'],
        'nbnc_defi': ['f5qk', 'f5rk', 'f5sk'],
        # 'ebic_impv' : ['f5ta','f5ua', 'f5va'],
        # 'ebic_imps' : ['f5tb','f5ub', 'f5vb'],
        'mbic_mvct': ['f5hu'],
        'macc_mvct': ['f5iu'],
        'mncn_mvct': ['f5ju'],
        'mbnc_mvct': ['f5kz'],
        'frag_pvct': ['f5hw', 'f5iw', 'f5jw'],
        'mbic_pvct': ['f5kx', 'f5lx', 'f5mx'],
        'macc_pvct': ['f5nx', 'f5ox', 'f5px'],
        'mbnc_pvct': ['f5hv', 'f5iv', 'f5jv'],
        'mncn_pvct': ['f5ky', 'f5ly', 'f5my'],
        'mbic_mvlt': ['f5kr', 'f5lr', 'f5mr'],
        'macc_mvlt': ['f5nr', 'f5or', 'f5pr'],
        'mncn_mvlt': ['f5kw', 'f5lw', 'f5mw'],
        'mbnc_mvlt': ['f5hs', 'f5is', 'f5js'],
        'frag_pvce': ['f5hx', 'f5ix', 'f5jx'],
        'arag_pvce': ['f5he', 'f5ie', 'f5je'],
        'nrag_pvce': ['f5hk', 'f5lk', 'f5jk'],
        'mbic_pvce': ['f5kq', 'f5lq', 'f5mq'],
        'abic_pvce': ['f5ke', 'f5le', 'f5me'],
        'nbic_pvce': ['f5kk', 'f5ik', 'f5mk'],
        'macc_pvce': ['f5nq', 'f5oq', 'f5pq'],
        'aacc_pvce': ['f5ne', 'f5oe', 'f5pe'],
        'nacc_pvce': ['f5nk', 'f5ok', 'f5pk'],
        'mncn_pvce': ['f5kv', 'f5lv', 'f5mv'],
        'cncn_pvce': ['f5so', 'f5nt', 'f5ot'],
        'mbnc_pvce': ['f5hr', 'f5ir', 'f5jr'],
        'abnc_pvce': ['f5qd', 'f5rd', 'f5sd'],
        'nbnc_pvce': ['f5qj', 'f5rj', 'f5sj'],
        'demenage': ['f1ar', 'f1br', 'f1cr', 'f1dr', 'f1er'],  # (moving allowance) 2006 only
        # Deductible charges
        'f6de': ['f6de'],  # deductible CSG computed on capital income
        # Alimony payments
        'f6gi': ['f6gi'],  # alimony paid to adult children (1st child)
        'f6gj': ['f6gj'],  # alimony paid to adult children (2nd child)
        'f6el': ['f6el'],  # other alimony paid to adult children (1st child)
        # BUG fixed: this entry used to repeat the key 'f6el', silently
        # dropping the line above (Python keeps only the last duplicate key).
        'f6em': ['f6em'],  # other alimony paid to adult children (2nd child)
        'f6gp': ['f6gp'],  # other alimony paid (court decision before 2006)
        'f6gu': ['f6gu'],  # other alimony paid
        'f6dd': ['f6dd'],  # miscellaneous deductions
        # PERP retirement savings
        'f6rs': ['f6rs', 'f6rt', 'f6ru'],
        'f6ss': ['f6ss', 'f6st', 'f6su'],
        'f6ps': ['f6ps', 'f6pt', 'f6pu'],
        'f6qr': ['f6qr'],
        'f6qw': ['f6qw'],
        'f6qs': ['f6qs', 'f6qt', 'f6qu'],
        # Tax reductions
        'f7ud': ['f7ud'],  # donations
        'f7ue': ['f7ue'],  # donations
        'f7uf': ['f7uf'],  # other donations
        'f7xs': ['f7xs'],  # carry-over from previous years
        'f7xt': ['f7xt'],  # carry-over from previous years
        'f7xu': ['f7xu'],  # carry-over from previous years
        'f7xw': ['f7xw'],  # carry-over from previous years
        'f7xy': ['f7xy'],  # carry-over from previous years
        'f7ac': ['f7ac', 'f7ae', 'f7ag'],  # union dues
        'f7ad': ['f7ad', 'f7af', 'f7ah'],
        # Children continuing their studies
        'f7ea': ['f7ea'],
        'f7ec': ['f7ec'],
        'f7ef': ['f7ef'],
        'f7eb': ['f7eb'],
        'f7ed': ['f7ed'],
        'f7eg': ['f7eg'],
        # Home employee
        'f7db': ['f7db'],
        'f7df': ['f7df'],
        'f7dq': ['f7dq'],
        'f7dg': ['f7dg'],
        'f7dl': ['f7dl'],
        'f7gz': ['f7gz'],  # survivor annuity premium, disability savings contract
        'f7cd': ['f7cd'],
        'f7ce': ['f7ce'],
        # Expenses for the environmental quality of the main home
        'f7we': ['f7we'],
        'f7wh': ['f7wh'],
        'f7wk': ['f7wk'],
        'f7wf': ['f7wf'],
        'f7wg': ['f7wg'],
        'f7wj': ['f7wj'],
        'f7wi': ['f7wi'],
        'f7vz': ['f7vz'],
        'f7vx': ['f7vx'],
        # Miscellaneous
        'f8by': ['f8by', 'f8cy'],  # local elected officials
        'f8ut': ['f8ut'],
        # Foreign income
        'f8ti': ['f8ti'],
        'f8tl': ['f8tl'],
        'f8tk': ['f8tk'],
        'f8fv': ['f8fv'],
        'f8tt': ['f8tt'],
        'f8uu': ['f8uu'],
        }
    vars_sets = [set(var_list) for var_list in var_dict.values()]
    all_vars = set().union(*vars_sets)
    eligible_vars = all_vars.intersection(set(foyer.columns))
    log.info(u"From {} variables, we keep {} eligibles variables".format(
        len(all_vars),
        len(eligible_vars),
        ))
    qui = ['vous', 'conj', 'pac1', 'pac2', 'pac3']
    foy_ind = DataFrame()
    # BUG fixed: var_dict.iteritems() is Python 2 only; the file targets Python 3.
    for individual_var, foyer_vars in var_dict.items():
        try:
            selection = foyer[foyer_vars + ["noindiv"]].copy()
        except KeyError:
            # Testing if at least one variable of foyer_vars is in the eligible list
            presence = [x in eligible_vars for x in foyer_vars]
            if not any(presence):
                log.info("{} is not present".format(individual_var))
                continue
            # Shrink the list to the columns actually available
            foyer_vars_cleaned = [
                var for var, present in zip(foyer_vars, presence) if present
                ]
            selection = foyer[foyer_vars_cleaned + ["noindiv"]].copy()
        # Reshape from wide (one column per person) to long (one row per person)
        selection.rename(columns=dict(zip(foyer_vars, qui)), inplace=True)
        selection.set_index("noindiv", inplace=True)
        selection.columns.name = "quifoy"
        selection = selection.stack()
        selection.name = individual_var
        selection = selection.reset_index()  # a Series needs reset_index to become a DataFrame
        selection = selection.set_index(["quifoy", "noindiv"])
        selection = selection[selection[individual_var] != 0].copy()
        if len(foy_ind) == 0:
            foy_ind = selection
        else:
            foy_ind = concat([foy_ind, selection], axis=1, join='outer')
    foy_ind.reset_index(inplace=True)
    ind_vars_to_remove = Series(list(eligible_vars))
    temporary_store['ind_vars_to_remove_{}'.format(year)] = ind_vars_to_remove
    foy_ind.rename(columns={"noindiv": "idfoy"}, inplace=True)
    print_id(foy_ind)
    # Encode the within-foyer positions as integers; a single map replaces the
    # former chained-assignment writes (foy_ind['quifoy'][...] = n), which are
    # unreliable under modern pandas.
    foy_ind['quifoy'] = foy_ind['quifoy'].map(
        {'vous': 0, 'conj': 1, 'pac1': 2, 'pac2': 3, 'pac3': 4})
    assert foy_ind.quifoy.isin(
        range(5)).all(), 'présence de valeurs aberrantes dans quifoy'
    log.info('saving foy_ind')
    print_id(foy_ind)
    temporary_store['foy_ind_{}'.format(year)] = foy_ind
    return
def create_comparable_logement_data_frame(year):
    """Build, from the Logement survey, a rent table comparable with ERFS (legacy version).

    Recodes housing, diploma, household-type and urban-unit variables of the
    lgt_menage / lgt_logt / adresse tables into coarse categories, computes a
    weighted income decile (``deci``), and returns the columns needed for rent
    imputation plus the survey weight (``wprm``).

    NOTE(review): a function with the same name is redefined later in this
    module and shadows this one at import time — this appears to be the legacy
    version, kept verbatim.

    :param year: ERFS year; only 2003 and 2006-2009 set ``year_lgt`` (any other
        value leaves ``year_lgt`` undefined and raises NameError below).
    :return: DataFrame of kept imputation variables, one row per dwelling.
    """
    logement_adresse_variables = ["gzc2"]
    logement_menage_variables = [
        "maa1at",
        "magtr",
        "mcs8",
        "mdiplo",
        "mrcho",
        "mrret",
        "mrsal",
        "mrtns",
        "mtybd",
        "muc1",
        "qex",
        "sec1",
        ]
    if year == 2003:
        logement_menage_variables.extend(
            ["hnph2", "ident", "lmlm", "mnatior", "typse"])
        logement_adresse_variables.extend(["iaat", "ident", "tu99"])
    if year < 2010 and year > 2005:
        logement_menage_variables.extend(["idlog", "mnatio"])
        logement_adresse_variables.extend(["idlog"])  # no typse in 2006
        logement_logement_variables = [
            "hnph2", "iaat", "idlog", "lmlm", "tu99"
            ]  # no typse in 2006
    # Work on the logement survey
    # Menage table
    if year == 2003:
        year_lgt = 2003
    if year > 2005 and year < 2010:
        year_lgt = 2006
    logement_survey_collection = SurveyCollection.load(collection='logement')
    # NOTE(review): keyed by ``year`` while the rewritten version below uses
    # ``year_lgt`` — confirm which survey name actually exists for 2006-2009.
    logement_survey = logement_survey_collection.surveys['logement_{}'.format(
        year)]
    log.info("Preparing logement menage table")
    # Lgtmen = load_temp(name = "indivim",year = year)  # dummy extra step
    Lgtmen = logement_survey.get_values(table="lgt_menage",
                                        variables=logement_menage_variables)
    Lgtmen.rename(columns={'idlog': 'ident'}, inplace=True)
    # Missing income components are treated as zero before summing
    Lgtmen['mrcho'].fillna(0, inplace=True)
    Lgtmen['mrret'].fillna(0, inplace=True)
    Lgtmen['mrsal'].fillna(0, inplace=True)
    Lgtmen['mrtns'].fillna(0, inplace=True)
    Lgtmen['revtot'] = Lgtmen['mrcho'] + Lgtmen['mrret'] + Lgtmen[
        'mrsal'] + Lgtmen[
            'mrtns']  # TODO: drop negative incomes? mrtns: 118 negative values out of 42845 in 2006
    count_NA('revtot', Lgtmen)
    # Income per consumption unit (x10)
    Lgtmen['nvpr'] = 10.0 * Lgtmen['revtot'] / Lgtmen['muc1']
    count_NA('qex', Lgtmen)
    # Weighted decile cut points of nvpr, using qex as weights
    dec, values = mark_weighted_percentiles(
        Lgtmen['nvpr'].values,
        numpy.arange(1, 11),
        Lgtmen['qex'].values,
        2,
        return_quantiles=True,
        )
    values.sort()
    # Decile index 1..10: one increment per threshold exceeded
    Lgtmen['deci'] = (1 + (Lgtmen.nvpr > values[1]) +
                      (Lgtmen.nvpr > values[2]) + (Lgtmen.nvpr > values[3]) +
                      (Lgtmen.nvpr > values[4]) + (Lgtmen.nvpr > values[5]) +
                      (Lgtmen.nvpr > values[6]) + (Lgtmen.nvpr > values[7]) +
                      (Lgtmen.nvpr > values[8]) + (Lgtmen.nvpr > values[9]))
    del dec, values
    assert Lgtmen['deci'].isin(range(
        1, 11)).all(), "Logement decile are out of range'"
    gc.collect()
    if year_lgt == 2006:
        # 2006 layout: dwelling-level variables live in a separate table
        log.info('Preparing logement logement table')
        lgtlgt = logement_survey.get_values(
            table="lgt_logt", variables=logement_logement_variables)
        lgtlgt.rename(columns={'idlog': 'ident'}, inplace=True)
        Lgtmen = Lgtmen.merge(lgtlgt,
                              left_on='ident',
                              right_on='ident',
                              how='inner')
        del lgtlgt
    # Keep only the occupancy statuses of interest (sec1 codes)
    data = Lgtmen[Lgtmen['sec1'].isin([21, 22, 23, 24, 30])]
    del Lgtmen
    gc.collect()
    if year_lgt == 2006:
        data.rename(columns={'mnatio': 'mnatior'}, inplace=True)
    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    # Recode sec1 occupancy codes into logt in {3, 4, 5}
    # (chained assignment below relies on legacy pandas view-write behavior)
    data['tmp'] = data['sec1'].astype("int64")
    data['tmp'][data['sec1'].isin([21, 22, 23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['logt'] = data['tmp']
    count_NA('logt', data)
    data = (data[data['logt'].notnull()])
    Lgtmen = data
    # Adresse table
    log.info(u"Préparation de la table adresse de l'enquête logement")
    Lgtadr = logement_survey.get_values(table="adresse",
                                        variables=logement_adresse_variables)
    Lgtadr.rename(columns={'idlog': 'ident'}, inplace=True)
    log.info(u"Fusion des tables logement et ménage de l'enquête logement")
    Logement = Lgtmen.merge(Lgtadr, on='ident', how='inner')
    # Clip number of rooms to [1, 6]
    Logement.hnph2[Logement['hnph2'] >= 6] = 6
    Logement.hnph2[Logement['hnph2'] < 1] = 1
    count_NA('hnph2', Logement)
    assert Logement['hnph2'].notnull().any(), "Some hnph2 are null"
    # Logement=(Logement[Logement['hnph2'].notnull()]) # commented out: hnph2 has 0 NAs
    # Same 'within' step here and below (cf. the R code)
    # TODO: 07 is turned into 7 here because Python treats 0n literals
    # as octal numbers ( < 08 ); hopefully this does not matter.
    # Nationality recoded to 1 (French) / 2 (other)
    Logement.mnatior[Logement['mnatior'].isin([0, 1])] = 1
    Logement.mnatior[Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10,
                                               11])] = 2
    count_NA('mnatior', Logement)
    assert_variable_in_range('mnatior', [1, 3], Logement)
    # Building-age classes collapsed to 1..6
    Logement.iaat[Logement.iaat.isin([1, 2, 3, 4, 5])] = 1
    Logement.iaat[Logement.iaat == 6] = 2
    Logement.iaat[Logement.iaat == 7] = 3
    Logement.iaat[Logement.iaat == 8] = 4
    Logement.iaat[Logement.iaat == 9] = 5
    Logement.iaat[Logement.iaat == 10] = 6
    count_NA('iaat', Logement)
    assert_variable_in_range('iaat', [1, 7], Logement)
    # Diploma collapsed to 1..4
    Logement.mdiplo[Logement.mdiplo == 1] = 1
    Logement.mdiplo[Logement.mdiplo.isin([2, 3, 4])] = 2
    Logement.mdiplo[Logement.mdiplo.isin([5, 6, 7, 8])] = 3
    Logement.mdiplo[Logement.mdiplo == 9] = 4
    count_NA('mdiplo', Logement)
    assert_variable_in_range('mdiplo', [1, 5], Logement)
    # Household type collapsed to 1..7
    Logement.mtybd[Logement['mtybd'] == 110] = 1
    Logement.mtybd[Logement['mtybd'] == 120] = 2
    Logement.mtybd[Logement['mtybd'] == 200] = 3
    Logement.mtybd[Logement['mtybd'].isin([311, 321, 401])] = 4
    Logement.mtybd[Logement['mtybd'].isin([312, 322, 402])] = 5
    Logement.mtybd[Logement['mtybd'].isin([313, 323, 403])] = 6
    Logement.mtybd[Logement['mtybd'] == 400] = 7
    count_NA('mtybd', Logement)
    assert_variable_in_range('mtybd', [1, 8], Logement)
    # Urban-unit size collapsed to 1..5
    Logement['tu99_recoded'] = Logement['tu99'].copy()
    count_NA('tu99', Logement)
    Logement.tu99_recoded[Logement['tu99'] == 0] = 1
    Logement.tu99_recoded[Logement['tu99'].isin([1, 2, 3])] = 2
    Logement.tu99_recoded[Logement['tu99'].isin([4, 5, 6])] = 3
    Logement.tu99_recoded[Logement['tu99'] == 7] = 4
    Logement.tu99_recoded[Logement['tu99'] == 8] = 5
    count_NA('tu99_recoded', Logement)
    assert_variable_in_range('tu99_recoded', [1, 6], Logement)
    # Climate zone collapsed to 1..3
    Logement.gzc2[Logement['gzc2'] == 1] = 1
    Logement.gzc2[Logement['gzc2'].isin([2, 3, 4, 5, 6])] = 2
    Logement.gzc2[Logement['gzc2'] == 7] = 3
    count_NA('gzc2', Logement)
    assert_variable_in_range('gzc2', [1, 4], Logement)
    # Age group collapsed to 1..3
    Logement.magtr[Logement['magtr'].isin([1, 2])] = 1
    Logement.magtr[Logement['magtr'].isin([3, 4])] = 2
    Logement.magtr[Logement['magtr'] == 5] = 3
    count_NA('magtr', Logement)
    assert_variable_in_range('magtr', [1, 4], Logement)
    # Socio-professional category collapsed to 1..5
    Logement['mcs8'][Logement['mcs8'] == 1] = 1
    Logement['mcs8'][Logement['mcs8'] == 2] = 2
    Logement['mcs8'][Logement['mcs8'] == 3] = 3
    Logement['mcs8'][Logement['mcs8'].isin([4, 8])] = 4
    Logement['mcs8'][Logement['mcs8'].isin([5, 6, 7])] = 5
    count_NA('mcs8', Logement)
    assert_variable_in_range('mcs8', [1, 6], Logement)
    Logement['logloy'] = numpy.log(Logement['lmlm'].values)
    # Logement.dropna(
    #     axis = 0,
    #     subset = ['mdiplo', 'mtybd', 'magtr', 'mcs8', 'maa1at'],
    #     inplace = True)
    # Actual rent imputation inputs
    log.info('Compute imputed rents')
    kept_variables = [
        'lmlm', 'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded',
        'magtr', 'mcs8', 'deci', 'ident'
        ]
    Logt = Logement[kept_variables].copy()
    Logt['wprm'] = Logement['qex'].copy()  # survey weight
    return Logt
def create_comparable_logement_data_frame(temporary_store=None, year=None):
    """Build, from the Logement survey, a rent table comparable with ERFS.

    Recodes housing, diploma, household-type and urban-unit variables of the
    menage / logement / adresse tables into coarse categories, computes a
    weighted income decile (``deci``), and returns the columns needed for
    rent imputation (including the ``qex`` survey weight).

    :param temporary_store: required keyword; kept for signature symmetry with
        the other builders (not read in this body).
    :param year: ERFS year; 2003 maps to the 2003 Logement survey, any year
        > 2005 maps to the 2006 survey (``year_lgt``).
    :return: DataFrame of kept imputation variables, one row per dwelling.
    """
    assert temporary_store is not None
    assert year is not None
    logement_adresse_variables = ["gzc2"]
    logement_menage_variables = [
        "maa1at",
        "magtr",
        "mcs8",
        "mdiplo",
        "mrcho",
        "mrret",
        "mrsal",
        "mrtns",
        "mtybd",
        "muc1",
        "qex",
        "sec1",
        ]
    if year == 2003:
        logement_menage_variables.extend(
            ["hnph2", "ident", "lmlm", "mnatior", "typse"])
        logement_adresse_variables.extend(["iaat", "ident", "tu99"])
    if year > 2005:  # and year < 2010:
        logement_menage_variables.extend(["idlog", "mnatio"])
        logement_adresse_variables.extend(["idlog"])  # no typse in 2006
        logement_logement_variables = [
            "hnph2", "iaat", "idlog", "lmlm", "tu99"
            ]  # no typse in 2006
    # Work on the logement survey
    # Menage table
    if year == 2003:
        year_lgt = 2003
    if year > 2005:  # and year < 2010:
        year_lgt = 2006
    logement_survey_collection = SurveyCollection.load(collection='logement')
    logement_survey = logement_survey_collection.get_survey(
        'logement_{}'.format(year_lgt))
    log.info("Preparing logement menage table")
    # Table name varies between survey editions; fall back to "menage1"
    try:
        logement_menage = logement_survey.get_values(
            table="menage", variables=logement_menage_variables)
    except Exception:
        logement_menage = logement_survey.get_values(
            table="menage1", variables=logement_menage_variables)
    logement_menage.rename(columns={'idlog': 'ident'}, inplace=True)
    # Missing income components are treated as zero before summing
    for revenus in ['mrcho', 'mrret', 'mrsal', 'mrtns']:
        logement_menage[revenus].fillna(0, inplace=True)
    logement_menage['revtot'] = (logement_menage.mrcho +
                                 logement_menage.mrret +
                                 logement_menage.mrsal +
                                 logement_menage.mrtns)
    # TODO: drop negative incomes? mrtns: 118 negative values out of 42845 in 2006
    assert logement_menage.revtot.notnull().all()
    # Income per consumption unit (x10)
    logement_menage[
        'nvpr'] = 10.0 * logement_menage['revtot'] / logement_menage['muc1']
    assert logement_menage.qex.notnull().all()
    assert (logement_menage.qex > 0).all()
    # Weighted decile cut points of nvpr, using qex as weights
    dec, values = mark_weighted_percentiles(
        logement_menage['nvpr'].values,
        numpy.arange(1, 11),
        logement_menage['qex'].values,
        2,
        return_quantiles=True,
        )
    values.sort()
    # Decile index 1..10: one increment per threshold exceeded
    logement_menage['deci'] = (1 + (logement_menage.nvpr > values[1]) +
                               (logement_menage.nvpr > values[2]) +
                               (logement_menage.nvpr > values[3]) +
                               (logement_menage.nvpr > values[4]) +
                               (logement_menage.nvpr > values[5]) +
                               (logement_menage.nvpr > values[6]) +
                               (logement_menage.nvpr > values[7]) +
                               (logement_menage.nvpr > values[8]) +
                               (logement_menage.nvpr > values[9]))
    del dec, values
    assert logement_menage['deci'].isin(range(
        1, 11)).all(), "Logement decile are out of range'"
    gc.collect()
    if year_lgt == 2006:
        # 2006 layout: dwelling-level variables live in a separate table
        log.info('Preparing logement logement table')
        try:
            lgtlgt = logement_survey.get_values(
                table="lgt_logt", variables=logement_logement_variables)
        except Exception:
            lgtlgt = logement_survey.get_values(
                table="logement", variables=logement_logement_variables)
        lgtlgt.rename(columns={'idlog': 'ident'}, inplace=True)
        logement_menage = logement_menage.merge(lgtlgt,
                                                left_on='ident',
                                                right_on='ident',
                                                how='inner')
        del lgtlgt
    # Keep only the occupancy statuses of interest (sec1 codes)
    data = logement_menage.loc[logement_menage.sec1.isin([21, 22, 23, 24,
                                                          30])].copy()
    del logement_menage
    gc.collect()
    if year_lgt == 2006:
        data.rename(columns={'mnatio': 'mnatior'}, inplace=True)
    data = data.loc[data.mnatior.notnull()].copy()
    data = data.loc[data.sec1.notnull()].copy()
    # Recode sec1 occupancy codes into statut_occupation in {3, 4, 5}
    data['tmp'] = data.sec1.astype("int")
    data.loc[data.sec1.isin([21, 22, 23]), 'tmp'] = 3
    data.loc[data.sec1 == 24, 'tmp'] = 4
    data.loc[data.sec1 == 30, 'tmp'] = 5
    data['statut_occupation'] = data.tmp
    count_NA('statut_occupation', data)
    logement_menage = data[data.statut_occupation.notnull()].copy()
    # Adresse table
    log.info(u"Préparation de la table adresse de l'enquête logement")
    logement_adresse = logement_survey.get_values(
        table="adresse", variables=logement_adresse_variables)
    logement_adresse.rename(columns={'idlog': 'ident'}, inplace=True)
    log.info(u"Fusion des tables logement et ménage de l'enquête logement")
    Logement = logement_menage.merge(logement_adresse,
                                     on='ident',
                                     how='inner')
    # Clip number of rooms to [1, 6]
    Logement.loc[Logement.hnph2 >= 6, 'hnph2'] = 6
    Logement.loc[Logement.hnph2 < 1, 'hnph2'] = 1
    count_NA('hnph2', Logement)
    assert Logement.hnph2.notnull().any(), "Some hnph2 are null"
    # Logement=(Logement[Logement['hnph2'].notnull()]) # commented out: hnph2 has 0 NAs
    # Same 'within' step here and below (cf. the R code)
    # TODO: 07 is turned into 7 here because Python treats 0n literals
    # as octal numbers ( < 08 ); hopefully this does not matter.
    # Nationality recoded to 1 (French) / 2 (other)
    Logement.loc[Logement.mnatior.isin([0, 1]), 'mnatior'] = 1
    Logement.loc[Logement.mnatior.isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
                 'mnatior'] = 2
    count_NA('mnatior', Logement)
    assert_variable_in_range('mnatior', [1, 3], Logement)
    # Building-age classes collapsed into iaat_bis (1..6)
    Logement['iaat_bis'] = 0
    Logement.loc[Logement.iaat.isin([1, 2, 3, 4, 5]),
                 'iaat_bis'] = 1  # before 1967
    Logement.loc[Logement.iaat == 6, 'iaat_bis'] = 2  # 1968 - 1974
    Logement.loc[Logement.iaat == 7, 'iaat_bis'] = 3  # 1975 - 1981
    Logement.loc[Logement.iaat == 8, 'iaat_bis'] = 4  # 1982 - 1989
    Logement.loc[Logement.iaat == 9, 'iaat_bis'] = 5  # 1990 - 1998
    Logement.loc[Logement.iaat == 10, 'iaat_bis'] = 6  # after 1999
    assert Logement.iaat_bis.isin(range(1, 7)).all()
    # Diploma collapsed to 0..4 (0 = missing)
    Logement.loc[Logement.mdiplo == 1, 'mdiplo'] = 1
    Logement.loc[Logement.mdiplo.isin([2, 3, 4]), 'mdiplo'] = 2
    Logement.loc[Logement.mdiplo.isin([5, 6, 7, 8]), 'mdiplo'] = 3
    Logement.loc[Logement.mdiplo == 9, 'mdiplo'] = 4
    Logement.loc[Logement.mdiplo.isnull(),
                 'mdiplo'] = 0  # TODO: assert Logement.mdiplo.isin(range(1, 5)).all()
    assert Logement.mdiplo.isin(range(0, 5)).all()
    Logement.mdiplo = Logement.mdiplo.astype('int')
    # Household type collapsed to 1..7
    Logement.loc[Logement.mtybd == 110, 'mtybd'] = 1
    Logement.loc[Logement.mtybd == 120, 'mtybd'] = 2
    Logement.loc[Logement.mtybd == 200, 'mtybd'] = 3
    Logement.loc[Logement.mtybd.isin([311, 321, 401]), 'mtybd'] = 4
    Logement.loc[Logement.mtybd.isin([312, 322, 402]), 'mtybd'] = 5
    Logement.loc[Logement.mtybd.isin([313, 323, 403]), 'mtybd'] = 6
    Logement.loc[Logement.mtybd == 400, 'mtybd'] = 7
    assert Logement.mtybd.isin(range(1, 8)).all()
    Logement.mtybd = Logement.mtybd.astype('int')
    # Urban-unit size collapsed to 1..5
    Logement['tu99_recoded'] = Logement.tu99.copy()
    count_NA('tu99', Logement)
    Logement.loc[Logement.tu99 == 0, 'tu99_recoded'] = 1
    Logement.loc[Logement.tu99.isin([1, 2, 3]), 'tu99_recoded'] = 2
    Logement.loc[Logement.tu99.isin([4, 5, 6]), 'tu99_recoded'] = 3
    Logement.loc[Logement.tu99 == 7, 'tu99_recoded'] = 4
    Logement.loc[Logement.tu99 == 8, 'tu99_recoded'] = 5
    count_NA('tu99_recoded', Logement)
    assert_variable_in_range('tu99_recoded', [1, 6], Logement)
    # Climate zone collapsed to 1..3
    Logement.loc[Logement.gzc2 == 1, 'gzc2'] = 1
    Logement.loc[Logement.gzc2.isin([2, 3, 4, 5, 6]), 'gzc2'] = 2
    Logement.loc[Logement.gzc2 == 7, 'gzc2'] = 3
    count_NA('gzc2', Logement)
    # TODO: assert_variable_in_range('gzc2', [1, 4], Logement)
    # Age group collapsed to 1..3
    Logement.loc[Logement.magtr.isin([1, 2]), 'magtr'] = 1
    Logement.loc[Logement.magtr.isin([3, 4]), 'magtr'] = 2
    Logement.loc[Logement.magtr == 5, 'magtr'] = 3
    assert Logement.magtr.isin(range(1, 4)).all()
    # Socio-professional category collapsed to 1..5 (codes 1-3 already final)
    # Logement.loc[Logement.mcs8 == 1, 'mcs8'] = 1
    # Logement.loc[Logement.mcs8 == 2, 'mcs8'] = 2
    # Logement.loc[Logement.mcs8 == 3, 'mcs8'] = 3
    Logement.loc[Logement.mcs8.isin([4, 8]), 'mcs8'] = 4
    Logement.loc[Logement.mcs8.isin([5, 6, 7]), 'mcs8'] = 5
    assert Logement.mcs8.isin(range(1, 6)).all()
    Logement['logloy'] = numpy.log(Logement['lmlm'].values)
    # Columns kept for the rent imputation step
    kept_variables = [
        'deci',
        'hnph2',
        'iaat_bis',
        'lmlm',
        'magtr',
        'mcs8',
        'mdiplo',
        'mtybd',
        'qex',
        'statut_occupation',
        'tu99_recoded',
        # 'ident',
        ]
    logement = Logement[kept_variables].copy()
    # logement.rename(columns = {'qex': 'wprm'}, inplace = True)
    return logement
def build_homogeneisation_revenus_menages(temporary_store=None, year=None):
    """Homogenize household income data across Budget des Familles editions.

    Computes a proxy of household disposable income, with and without imputed
    rents, for the 1995, 2000, 2005 or 2011 survey, and stores the result in
    ``temporary_store`` under ``revenus_{year}`` (and ``loyers_imputes_{year}``
    for 2000/2005/2011).  1995 amounts are converted from francs to euros.
    """
    assert temporary_store is not None
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection='budget_des_familles',
        config_files_directory=config_files_directory)
    survey = bdf_survey_collection.get_survey(
        'budget_des_familles_{}'.format(year))
    # **********************************************************************
    # ********* HOMOGENIZATION OF THE HOUSEHOLD INCOME DATA ****************
    # ********* COMPUTATION OF A PROXY OF DISPOSABLE INCOME ****************
    # **********************************************************************
    # The 1995 survey distinguishes housing tax from property tax.
    # Their relative shares are computed here to be applied to 2000 and 2005.
    if year == 1995:
        menrev = survey.get_values(
            table="menrev",
            variables=[
                'revtot', 'ir', 'irbis', 'imphab', 'impfon', 'revaid',
                'revsal', 'revind', 'revsec', 'revret', 'revcho', 'revfam',
                'revlog', 'revinv', 'revrmi', 'revpat', 'mena', 'ponderr'
                ],
            )
        menage = survey.get_values(table="socioscm",
                                   variables=['exdep', 'exrev', 'mena'])
        # NOTE(review): set_index is NOT inplace here, so its result is
        # discarded and the merge below joins on the default RangeIndex —
        # confirm this row-order join is intended (cf. the inplace=True
        # variant used for rev_disp further down).
        menage.set_index('mena')
        menrev = menrev.merge(menage, left_index=True, right_index=True)
        # Keep only the rows whose data quality is vouched for:
        # exdep = 1 if the household's expense data is properly filled
        # exrev = 1 if the household's income data is properly filled
        menrev = menrev[(menrev.exdep == 1) & (menrev.exrev == 1)]
        # Split of local taxes between housing tax and property tax
        menrev['foncier_hab'] = menrev.imphab + menrev.impfon
        menrev['part_IMPHAB'] = menrev.imphab / menrev.foncier_hab
        menrev['part_IMPFON'] = menrev.impfon / menrev.foncier_hab
        # Social income = pensions + unemployment + family + housing +
        # disability + minimum income benefits
        menrev['revsoc'] = (menrev.revret + menrev.revcho + menrev.revfam +
                            menrev.revlog + menrev.revinv + menrev.revrmi)
        for variable in [
                'revcho', 'revfam', 'revinv', 'revlog', 'revret', 'revrmi'
                ]:
            del menrev[variable]
        # Labor income = wages + self-employment + secondary activity
        menrev[
            'revact'] = menrev['revsal'] + menrev['revind'] + menrev['revsec']
        menrev.rename(columns=dict(
            revpat="revpat",
            impfon="impfon",
            imphab="imphab",
            revaid="somme_obl_recue",
            ), inplace=True)
        menrev['impot_revenu'] = menrev['ir'] + menrev['irbis']
        # Second pass: rebuild the same table keyed by 'mena' for rev_disp
        rev_disp = survey.get_values(
            table="menrev",
            variables=[
                'revtot', 'revret', 'revcho', 'revfam', 'revlog', 'revinv',
                'revrmi', 'imphab', 'impfon', 'revaid', 'revsal', 'revind',
                'revsec', 'revpat', 'mena', 'ponderr', 'ir', 'irbis'
                ],
            )
        rev_disp.set_index('mena', inplace=True)
        menage2 = survey.get_values(table="socioscm",
                                    variables=['exdep', 'exrev', 'mena'])
        menage2.set_index('mena', inplace=True)
        rev_disp = menage2.merge(rev_disp,
                                 left_index=True,
                                 right_index=True)
        rev_disp = rev_disp[(rev_disp.exrev == 1) & (rev_disp.exdep == 1)]
        rev_disp['revsoc'] = (rev_disp['revret'] + rev_disp['revcho'] +
                              rev_disp['revfam'] + rev_disp['revlog'] +
                              rev_disp['revinv'] + rev_disp['revrmi'])
        rev_disp['impot_revenu'] = rev_disp['ir'] + rev_disp['irbis']
        rev_disp.rename(columns=dict(revaid='somme_obl_recue', ),
                        inplace=True)
        rev_disp.somme_obl_recue = rev_disp.somme_obl_recue.fillna(0)
        rev_disp['revact'] = rev_disp['revsal'] + rev_disp[
            'revind'] + rev_disp['revsec']
        # Total income recomputed as labor + capital + social + mandatory aid
        rev_disp['revtot'] = rev_disp['revact'] + rev_disp[
            'revpat'] + rev_disp['revsoc'] + rev_disp['somme_obl_recue']
        rev_disp['revact'] = rev_disp['revsal'] + rev_disp[
            'revind'] + rev_disp['revsec']
        rev_disp.rename(columns=dict(
            ponderr="pondmen",
            mena="ident_men",
            revind="act_indpt",
            revsal="salaires",
            revsec="autres_rev",
            ), inplace=True)
        # Components absent from the 1995 edition are zero-filled (as strings,
        # matching the original behavior)
        rev_disp['autoverses'] = '0'
        rev_disp['somme_libre_recue'] = '0'
        rev_disp['autres_ress'] = '0'
        # Disposable income is revtot minus housing tax and income tax
        # (optionally CSG/CRDS).  revtot is the sum of labor, social,
        # capital and aid income.
        rev_disp[
            'rev_disponible'] = rev_disp.revtot - rev_disp.impot_revenu - rev_disp.imphab
        # Imputed rents come from the expenses built earlier in the pipeline
        loyers_imputes = temporary_store['depenses_bdf_{}'.format(year)]
        loyers_imputes.rename(
            columns={"0411": "loyer_impute"},
            inplace=True,
            )
        rev_dispbis = loyers_imputes.merge(rev_disp,
                                           left_index=True,
                                           right_index=True)
        rev_disp['rev_disp_loyerimput'] = rev_disp[
            'rev_disponible'] - rev_dispbis['loyer_impute']
        # Conversion from francs to euros
        for var in [
                'somme_obl_recue', 'act_indpt', 'revpat', 'salaires',
                'autres_rev', 'rev_disponible', 'impfon', 'imphab', 'revsoc',
                'revact', 'impot_revenu', 'revtot', 'rev_disp_loyerimput'
                ]:
            rev_disp[var] = rev_disp[var] / 6.55957  # CONVERSION EN EUROS
        temporary_store["revenus_{}".format(year)] = rev_disp
    elif year == 2000:
        # TODO: rather use the variables coming from the expenses table
        # (in temporary_store)
        rev_disp = survey.get_values(
            table="consomen",
            variables=[
                'c13141', 'c13111', 'c13121', 'c13131', 'pondmen', 'ident'
                ],
            )
        menage = survey.get_values(
            table="menage",
            variables=[
                'ident', 'revtot', 'revact', 'revsoc', 'revpat', 'rev70',
                'rev71', 'revt_d', 'pondmen', 'rev10', 'rev11', 'rev20',
                'rev21'
                ],
            ).sort_values(by=['ident'])
        menage.index = menage.index.astype(ident_men_dtype)
        rev_disp.index = rev_disp.index.astype(ident_men_dtype)
        revenus = menage.join(rev_disp, how="outer", rsuffix="rev_disp")
        revenus.fillna(0, inplace=True)
        revenus.rename(columns=dict(
            c13111="impot_res_ppal",
            c13141="impot_revenu",
            c13121="impot_autres_res",
            rev70="somme_obl_recue",
            rev71="somme_libre_recue",
            revt_d="autres_ress",
            ident="ident_men",
            rev10="act_indpt",
            rev11="autoverses",
            rev20="salaires",
            rev21="autres_rev",
            ), inplace=True)
        # 'pondmenrev_disp' is the rev_disp copy of pondmen created by the
        # join rsuffix above
        var_to_ints = [
            'pondmen', 'impot_autres_res', 'impot_res_ppal',
            'pondmenrev_disp', 'c13131'
            ]
        for var_to_int in var_to_ints:
            revenus.loc[revenus[var_to_int].isnull(), var_to_int] = 0
            revenus[var_to_int] = revenus[var_to_int].astype(int)
        # Shares (0.65 / 0.35) estimated on the 1995 survey (see above)
        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal +
                                    revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal +
                                    revenus.impot_autres_res)
        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["poste_coicop_421"]
        loyers_imputes = loyers_imputes[variables]
        loyers_imputes.rename(
            columns={"poste_coicop_421": "loyer_impute"},
            inplace=True,
            )
        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
        loyers_imputes.index = loyers_imputes.index.astype(ident_men_dtype)
        revenus.set_index('ident_men', inplace=True)
        revenus.index = revenus.index.astype(ident_men_dtype)
        assert set(revenus.index) == set(
            loyers_imputes.index
            ), 'revenus and loyers_imputes indexes are not equal'
        revenus = revenus.merge(loyers_imputes,
                                left_index=True,
                                right_index=True)
        revenus[
            'rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        # Negative disposable incomes are floored at zero
        revenus['rev_disponible'] = revenus['rev_disponible'] * (
            revenus['rev_disponible'] >= 0)
        revenus[
            'rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
        var_to_ints = ['loyer_impute']
        for var_to_int in var_to_ints:
            revenus[var_to_int] = revenus[var_to_int].astype(int)
        temporary_store["revenus_{}".format(year)] = revenus
    elif year == 2005:
        c05d = survey.get_values(
            table="c05d",
            variables=['c13111', 'c13121', 'c13141', 'pondmen', 'ident_men'],
            )
        rev_disp = c05d.sort_values(by=['ident_men'])
        del c05d
        menage = survey.get_values(
            table="menage",
            variables=[
                'ident_men', 'revtot', 'revact', 'revsoc', 'revpat',
                'rev700_d', 'rev701_d', 'rev999_d', 'rev100_d', 'rev101_d',
                'rev200_d', 'rev201_d'
                ],
            ).sort_values(by=['ident_men'])
        rev_disp.set_index('ident_men', inplace=True)
        menage.set_index('ident_men', inplace=True)
        # Align index dtypes before concatenating side by side
        menage.index = menage.index.astype('str')
        rev_disp.index = rev_disp.index.astype('str')
        assert menage.index.dtype == rev_disp.index.dtype, 'menage ({}) and revdisp ({}) dtypes differs'.format(
            menage.index.dtype, rev_disp.index.dtype)
        revenus = pandas.concat([menage, rev_disp], axis=1)
        assert len(menage.index) == len(revenus.index)
        revenus.rename(columns=dict(
            rev100_d="act_indpt",
            rev101_d="autoverses",
            rev200_d="salaires",
            rev201_d="autres_rev",
            rev700_d="somme_obl_recue",
            rev701_d="somme_libre_recue",
            rev999_d="autres_ress",
            c13111="impot_res_ppal",
            c13141="impot_revenu",
            c13121="impot_autres_res",
            ), inplace=True)
        # The shares (0.65 / 0.35) come from the 1995 BdF survey, which
        # distinguishes housing tax from property tax.  On 1995 data the
        # housing tax averages 65% of local taxes and the property tax 35%.
        # These rates are applied to the 2000 and 2005 surveys.
        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal +
                                    revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal +
                                    revenus.impot_autres_res)
        del revenus['impot_autres_res']
        del revenus['impot_res_ppal']
        # Compute disposable income with and without imputed rents
        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["poste_coicop_421"]
        loyers_imputes = loyers_imputes[variables]
        loyers_imputes.rename(
            columns={"poste_coicop_421": "loyer_impute"},
            inplace=True,
            )
        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
        loyers_imputes.index = loyers_imputes.index.astype('str')
        assert revenus.index.dtype == loyers_imputes.index.dtype
        assert set(revenus.index) == set(
            loyers_imputes.index
            ), '''revenus and loyers_imputes indexes are not equal.
In revenus and not in loyers_imputes: {}
In loyers_imputes and not in revenus: {}
'''.format(
            set(revenus.index) - set(loyers_imputes.index),
            set(loyers_imputes.index) - set(revenus.index))
        revenus = revenus.merge(loyers_imputes,
                                left_index=True,
                                right_index=True)
        revenus[
            'rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        # Negative disposable incomes are floored at zero
        revenus['rev_disponible'] = revenus['rev_disponible'] * (
            revenus['rev_disponible'] >= 0)
        revenus[
            'rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
        temporary_store["revenus_{}".format(year)] = revenus
    elif year == 2011:
        # Table names vary by distribution (upper vs lower case).
        # NOTE(review): bare except: clauses — they also swallow
        # KeyboardInterrupt/SystemExit; `except Exception:` would be safer.
        try:
            c05 = survey.get_values(
                table="C05",
                variables=[
                    'c13111', 'c13121', 'c13141', 'pondmen', 'ident_me'
                    ],
                )
            rev_disp = c05.sort_values(by=['ident_me'])
        except:
            c05 = survey.get_values(
                table="c05",
                variables=[
                    'c13111', 'c13121', 'c13141', 'pondmen', 'ident_me'
                    ],
                )
            rev_disp = c05.sort_values(by=['ident_me'])
        del c05
        try:
            menage = survey.get_values(
                table="MENAGE",
                variables=[
                    'ident_me', 'revtot', 'revact', 'revsoc', 'revpat',
                    'rev700', 'rev701', 'rev999', 'revindep', 'salaires'
                    ],
                ).sort_values(by=['ident_me'])
        except:
            menage = survey.get_values(
                table="menage",
                variables=[
                    'ident_me', 'revtot', 'revact', 'revsoc', 'revpat',
                    'rev700', 'rev701', 'rev999', 'revindep', 'salaires'
                    ],
                ).sort_values(by=['ident_me'])
        rev_disp.index = rev_disp.index.astype(ident_men_dtype)
        menage.index = menage.index.astype(ident_men_dtype)
        rev_disp.set_index('ident_me', inplace=True)
        menage.set_index('ident_me', inplace=True)
        revenus = pandas.concat([menage, rev_disp], axis=1)
        menage.index.name = 'ident_men'
        revenus.index.name = 'ident_men'
        revenus.rename(
            columns=dict(
                revindep="act_indpt",
                # TODO: locate these commented-out income variables in bdf 2011
                # rev101_d = "autoverses",
                salaires="salaires",
                # rev201_d = "autres_rev",
                rev700="somme_obl_recue",
                rev701="somme_libre_recue",
                rev999="autres_ress",
                c13111="impot_res_ppal",
                c13141="impot_revenu",
                c13121="impot_autres_res",
                ),
            inplace=True)
        # Shares (0.65 / 0.35) estimated on the 1995 survey (see above)
        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal +
                                    revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal +
                                    revenus.impot_autres_res)
        del revenus['impot_autres_res']
        del revenus['impot_res_ppal']
        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["poste_coicop_421"]
        loyers_imputes = loyers_imputes[variables]
        loyers_imputes.rename(
            columns={"poste_coicop_421": "loyer_impute"},
            inplace=True,
            )
        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
        revenus = revenus.merge(loyers_imputes,
                                left_index=True,
                                right_index=True)
        revenus[
            'rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        # Negative disposable incomes are floored at zero
        revenus['rev_disponible'] = revenus['rev_disponible'] * (
            revenus['rev_disponible'] >= 0)
        revenus[
            'rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
        temporary_store["revenus_{}".format(year)] = revenus
def build_depenses_homogenisees(temporary_store=None, year=None):
    """Build the menage consumption dataframes, homogenised across survey years.

    Loads the raw consumption table of the ``budget_des_familles`` survey for
    ``year`` (1995, 2000, 2005 and 2011 each have a dedicated branch), maps the
    year-specific expense nomenclature onto COICOP codes via the transfer
    matrix, then aggregates expenses per menage both by COICOP item and by the
    12 broad COICOP categories ("gros postes").

    Side effects: stores two dataframes in ``temporary_store`` under the keys
    ``depenses_{year}`` (columns renamed ``poste_coicop_*``) and
    ``depenses_by_grosposte_{year}`` (columns renamed ``coicop12_*``).

    Parameters:
        temporary_store: dict-like HDF store (required).
        year: survey year as int (required).
    """
    assert temporary_store is not None
    assert year is not None
    bdf_survey_collection = SurveyCollection.load(
        collection='budget_des_familles',
        config_files_directory=config_files_directory)
    survey = bdf_survey_collection.get_survey(
        'budget_des_familles_{}'.format(year))

    # Homogenisation of the expense databases across survey vintages.
    if year == 1995:
        socioscm = survey.get_values(table="socioscm")
        poids = socioscm[['mena', 'ponderrd', 'exdep', 'exrev']]
        # Keep only the records whose quality is certified:
        # exdep = 1 when the household's expense data are properly filled in,
        # exrev = 1 when the household's income data are properly filled in.
        poids = poids[(poids.exdep == 1) & (poids.exrev == 1)]
        del poids['exdep'], poids['exrev']
        poids.rename(columns={
            'mena': 'ident_men',
            'ponderrd': 'pondmen',
            }, inplace=True)
        poids.set_index('ident_men', inplace=True)
        conso = survey.get_values(table="depnom")
        conso = conso[["valeur", "montant", "mena", "nomen5"]]
        conso = conso.groupby(["mena", "nomen5"]).sum()
        conso = conso.reset_index()
        conso.rename(columns={
            'mena': 'ident_men',
            'nomen5': 'poste{}'.format(year),
            'valeur': 'depense',
            'montant': 'depense_avt_imput',
            }, inplace=True)
        # Convert francs to euros.
        conso.depense = conso.depense / 6.55957
        conso.depense_avt_imput = conso.depense_avt_imput / 6.55957
        conso_small = conso[[u'ident_men', u'poste1995', u'depense']]
        conso_unstacked = conso_small.set_index(
            ['ident_men', 'poste1995']).unstack('poste1995')
        conso_unstacked = conso_unstacked.fillna(0)
        # NOTE(review): MultiIndex.labels is the legacy (pre-0.24) pandas
        # spelling of MultiIndex.codes — this file targets that pandas era.
        levels = conso_unstacked.columns.levels[1]
        labels = conso_unstacked.columns.labels[1]
        conso_unstacked.columns = levels[labels]
        conso_unstacked.rename(index={0: 'ident_men'}, inplace=True)
        conso = conso_unstacked.merge(poids, left_index=True, right_index=True)
        conso = conso.reset_index()
    if year == 2000:
        conso = survey.get_values(table="consomen")
        conso.rename(
            columns={
                'ident': 'ident_men',
                'pondmen': 'pondmen',
                },
            inplace=True,
            )
        # Drop pre-aggregated totals; they are rebuilt below from the detail.
        for variable in ['ctotale', 'c99', 'c99999'] + \
                ["c0{}".format(i) for i in range(1, 10)] + \
                ["c{}".format(i) for i in range(10, 14)]:
            del conso[variable]
    if year == 2005:
        conso = survey.get_values(table="c05d")
    if year == 2011:
        # Table name casing differs between distributions of the survey.
        try:
            conso = survey.get_values(table="C05")
        except Exception:  # was a bare except: narrowed
            conso = survey.get_values(table="c05")
        conso.rename(
            columns={
                'ident_me': 'ident_men',
                },
            inplace=True,
            )
        del conso['ctot']

    # Grouping by coicop
    poids = conso[['ident_men', 'pondmen']].copy()
    poids.set_index('ident_men', inplace=True)
    conso.drop('pondmen', axis=1, inplace=True)
    conso.set_index('ident_men', inplace=True)
    matrice_passage_data_frame, selected_parametres_fiscalite_data_frame = get_transfert_data_frames(
        year)
    coicop_poste_bdf = matrice_passage_data_frame[[
        'poste{}'.format(year), 'posteCOICOP'
        ]]
    coicop_poste_bdf.set_index('poste{}'.format(year), inplace=True)
    coicop_by_poste_bdf = coicop_poste_bdf.to_dict()['posteCOICOP']
    del coicop_poste_bdf

    def reformat_consumption_column_coicop(coicop):
        # Turn a raw column label like 'c04213' into the int 4213; NaN when
        # the label is not of that shape.
        try:
            return int(coicop.replace('c', '').lstrip('0'))
        except (AttributeError, ValueError):  # was a bare except: narrowed
            return numpy.NaN

    # This step harmonises the 1995 frame, which is not laid out the same way
    # as the three other years.
    if year == 1995:
        coicop_labels = [
            normalize_code_coicop(coicop_by_poste_bdf.get(poste_bdf))
            for poste_bdf in conso.columns
            ]
    else:
        coicop_labels = [
            normalize_code_coicop(
                coicop_by_poste_bdf.get(
                    reformat_consumption_column_coicop(poste_bdf)))
            for poste_bdf in conso.columns
            ]
    tuples = zip(coicop_labels, conso.columns)
    conso.columns = pandas.MultiIndex.from_tuples(
        tuples, names=['coicop', 'poste{}'.format(year)])
    coicop_data_frame = conso.groupby(level=0, axis=1).sum()
    depenses = coicop_data_frame.merge(poids, left_index=True, right_index=True)

    # Build the 12 broad categories ("gros postes") later used for calibration.
    def select_gros_postes(coicop):
        # Map a COICOP label to its 2-digit broad-category number.
        try:
            coicop = unicode(coicop)  # Python 2 only
        except NameError:  # Python 3: unicode() is gone, keep value unchanged
            pass
        normalized_coicop = normalize_code_coicop(coicop)
        grosposte = normalized_coicop[0:2]
        return int(grosposte)

    grospostes = [
        select_gros_postes(coicop) for coicop in coicop_data_frame.columns
        ]
    tuples_gros_poste = zip(coicop_data_frame.columns, grospostes)
    coicop_data_frame.columns = pandas.MultiIndex.from_tuples(
        tuples_gros_poste, names=['coicop', 'grosposte'])
    depenses_by_grosposte = coicop_data_frame.groupby(level=1, axis=1).sum()
    depenses_by_grosposte = depenses_by_grosposte.merge(
        poids, left_index=True, right_index=True)

    # TODO : understand why it does not work:
    #   depenses.rename(columns = {u'0421': 'poste_coicop_421'}, inplace = True)
    # Normalise numeric column codes: strip one trailing '0'...
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        if code[-1:] == '0':
            depenses.rename(columns={code: code[:-1]}, inplace=True)
    # ... then strip one leading '0' ...
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        if code[0:1] == '0':
            depenses.rename(columns={code: code[1:]}, inplace=True)
    # ... and finally prefix the remaining numeric codes.
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        depenses.rename(columns={code: 'poste_coicop_' + code}, inplace=True)
    temporary_store['depenses_{}'.format(year)] = depenses

    depenses_by_grosposte.columns = depenses_by_grosposte.columns.astype(str)
    liste_grospostes = [
        column for column in depenses_by_grosposte.columns if column.isdigit()
        ]
    for grosposte in liste_grospostes:
        depenses_by_grosposte.rename(
            columns={grosposte: 'coicop12_' + grosposte}, inplace=True)
    temporary_store['depenses_by_grosposte_{}'.format(
        year)] = depenses_by_grosposte
def create_totals(year = None):
    """Create the consolidated individual-level tables (tot2/tot3) for one ERFS year.

    Steps (mirroring the original R methodology, see the in-line "Etape" logs):
      1. load the ``indivim`` table and merge back the individuals with imputed
         income (``quelfic == "FIP_IMP"``);
      2-3. assign a tax-unit id (``idfoy``) to every individual, including FIP
         individuals without a declaration and ``EE_NRT`` individuals;
      4. attach children (``lpr`` in 3/4) to the parent declaration with the
         higher ``zimpof``;
      5. drop tax units that have no "vous" declarant;
      6. build descriptive variables (``activite``, ``statut``, ``txtppb``,
         ``nbsala``, ``chpub``, ``cadre``);
      7. renumber dependents so (``idfoy``, ``quifoy``) pairs are unique;
      8. merge with the family table and store ``tot2_{year}`` / ``tot3_{year}``
         in the temporary store.

    Relies on legacy pandas APIs (``convert_objects``, ``duplicated(cols=...)``,
    chained assignment); do not modernise without end-to-end data checks.
    """
    assert year is not None
    temporary_store = TemporaryStore.create(file_name = "erfs")
    replace = create_replace(year)
    # Start from the ERFS individual table and rename its variables.
    log.info(u"Creating Totals")
    log.info(u"Etape 1 : Chargement des données")
    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    indivim = temporary_store['indivim_{}'.format(year)]
    assert not indivim.duplicated(['noindiv']).any(), "Présence de doublons"
    # Deals individuals with imputed income : some individuals are in 'erf individu table' but
    # not in the 'foyer' table. We need to create a foyer for them.
    # An individual is selected when any of the eight income variables differs
    # from its "o" (original) counterpart.
    selection = Series()
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        if len(selection) == 0:
            selection = test
        else:
            selection = (test) | (selection)
    indivi_i = indivim[selection].copy()
    indivi_i.rename(
        columns = {
            "ident": "idmen",
            "persfip": "quifoy",
            "zsali": "sali2",  # includes the non-taxable wages of insurance agents
            "zchoi": "choi2",
            "zrsti": "rsti2",
            "zalri": "alr2"
            },
        inplace = True,
        )
    indivi_i.quifoy = where(indivi_i.quifoy.isnull(), "vous", indivi_i.quifoy)
    indivi_i.quelfic = "FIP_IMP"
    # We merge them with the other individuals
    indivim.rename(
        columns = dict(
            ident = "idmen",
            persfip = "quifoy",
            zsali = "sali2",  # includes the non-taxable wages of insurance agents
            zchoi = "choi2",
            zrsti = "rsti2",
            zalri = "alr2",
            ),
        inplace = True,
        )
    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception("Individual ")
    indivim.set_index("noindiv", inplace = True)
    indivi_i.set_index("noindiv", inplace = True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)
    indivi.reset_index(inplace = True)

    log.info("Etape 2 : isolation des FIP")
    fip_imp = indivi.quelfic == "FIP_IMP"
    # idfoy = household id * 100 + the declaration number taken from the first
    # two characters of declar1.
    indivi["idfoy"] = (
        indivi.idmen.astype("int64") * 100 +
        (indivi.declar1.str[0:2]).convert_objects(convert_numeric=True)
        )
    indivi.loc[fip_imp, "idfoy"] = np.nan
    # Some FIP (at least among those with imputed income) do have a tax
    # declaration number (why?).
    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())
    indivi["idfoy"] = where(
        fip_has_declar,
        indivi.idmen * 100 + indivi.declar1.str[0:2].convert_objects(convert_numeric = True),
        indivi.idfoy)
    del fip_has_declar
    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    # FIP without a declaration get a synthetic idfoy = idmen * 100 + 50,
    # bumped by 1 until unique.
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"])
    indivi_fnd = indivi[["idfoy", "noindiv"]][fip_no_declar].copy()
    while any(indivi_fnd.duplicated(cols=["idfoy"])):
        indivi_fnd["idfoy"] = where(
            indivi_fnd.duplicated(cols=["idfoy"]),
            indivi_fnd["idfoy"] + 1,
            indivi_fnd["idfoy"]
            )
    # assert indivi_fnd["idfoy"].duplicated().value_counts()[False] == len(indivi_fnd["idfoy"].values), "Duplicates remaining"
    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"
    indivi.idfoy[fip_no_declar] = indivi_fnd.idfoy.copy()
    del indivi_fnd, fip_no_declar

    log.info(u"Etape 3 : Récupération des EE_NRT")
    # EE_NRT individuals form their own tax unit.
    nrt = indivi.quelfic == "EE_NRT"
    indivi.idfoy = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.quifoy[nrt] = "vous"
    del nrt
    # Reference person or their partner (lpr 1 or 2) in an EE/EE_CAF record
    # heads their own tax unit.
    pref_or_cref = indivi.lpr.isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi.idfoy = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    # TODO: hack to avoid assert error
    log.info("{}".format(indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().value_counts()))
    assert indivi.idfoy[indivi.lpr.dropna().isin([1, 2])].all()

    log.info(u"Etape 4 : Rattachement des enfants aux déclarations")
    assert not(indivi.noindiv.duplicated().any()), "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.noindiv[enf_ee].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert not(indivi.noindiv[enf_ee].duplicated().any()), "Some noindiv appear twice"
    # Candidate father/mother declarations for each child.
    pere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]
        })
    mere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]
        })
    foyer = data.get_values(variables = ["noindiv", "zimpof"], table = replace["foyer"])
    pere = pere.merge(foyer, how = "inner", on = "noindiv")
    mere = mere.merge(foyer, how = "inner", on = "noindiv")
    df = pere.merge(mere, how = "outer", on = "noindiv_enf", suffixes=('_p', '_m'))
    log.info(u" 4.1 : gestion des personnes dans 2 foyers")
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        # NOTE(review): fillna(..., inplace=True) returns None, so this assigns
        # None back to the column instead of filling it — confirm intent.
        df[col] = df[col].fillna(0, inplace = True)  # because groupby drops groups with NA in the index
    df = df.groupby(by = ["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace = True)
    df["which"] = ""
    # NOTE(review): each `where` below recomputes 'which' for ALL rows, so only
    # the last statement (mere when zimpof_m >= zimpof_p, else pere) actually
    # decides; the three earlier assignments are clobbered — verify intent.
    df.which = where((df.zimpof_m.notnull()) & (df.zimpof_p.isnull()), "mere", "")
    df.which = where((df.zimpof_p.notnull()) & (df.zimpof_m.isnull()), "pere", "")
    both = (df.zimpof_p.notnull()) & (df.zimpof_m.notnull())
    df.which = where(both & (df.zimpof_p > df.zimpof_m), "pere", "mere")
    df.which = where(both & (df.zimpof_m >= df.zimpof_p), "mere", "pere")
    assert df.which.notnull().all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere
    df.rename(columns = {"noindiv_enf": "noindiv"}, inplace = True)
    # NOTE(review): the second assignment overwrites the first for every row.
    df['idfoy'] = where(df.which == "pere", df.noindiv_p, df.noindiv_m)
    df['idfoy'] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)
    assert df["idfoy"].notnull().all()
    dropped = [col for col in df.columns if col not in ["idfoy", "noindiv"]]
    df.drop(dropped, axis = 1, inplace = True)
    assert not(df.duplicated().any())
    df.set_index("noindiv", inplace = True, verify_integrity = True)
    indivi.set_index("noindiv", inplace = True, verify_integrity = True)
    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (
        indivi["idfoy"].notnull().sum() + indivi["idfoy"].isnull().sum()
        )
    indivi.reset_index(inplace = True)
    assert not(indivi.duplicated().any())
    # MBJ: issue delt with when moving from R code to python
    # TODO: the enfants_fip must be added and a menage created for the adults.
    # Following the ERF 2003 methodological guide, page 135:
    # - drop FIP partners and FIP aged 25 and over;
    # - keep FIP children aged 19 to 24;
    # - drop FIP aged 18 and under, except FIP born in 2002 in a household at
    #   its 6th interview, since those are children born after the EEC survey
    #   date who will not be found in the following EEC waves.
    # log.info(u" 4.2 : On enlève les individus pour lesquels il manque le déclarant")
    fip = temporary_store['fipDat_{}'.format(year)]
    fip["declar"] = np.nan
    fip["agepf"] = np.nan
    fip.drop(["actrec", "year", "noidec"], axis = 1, inplace = True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(
        columns = dict(
            ident = "idmen",
            persfip = "quifoy",
            zsali = "sali2",  # includes the non-taxable wages of insurance agents
            zchoi = "choi2",
            zrsti = "rsti2",
            zalri = "alr2"),
        inplace = True)
    is_fip_19_25 = ((year - fip.naia - 1) >= 19) & ((year - fip.naia - 1) < 25)
    # TODO: BUT for the time being we keep them in thier vous menage so the following lines are commented
    # The idmen are of the form 60XXXX we use idmen 61XXXX, 62XXXX for the idmen of the kids over 18 and less than 25
    #   fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
    #   fip[is_fip_19_25 ,"lpr"] <- 1
    #   indivi <- rbind.fill(indivi,fip[is_fip_19_25,])
    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['age_en_mois'] = 12 * indivi.age + 12 - indivi.naim

    # quimen: 0 = reference person, 1 = partner, 2/3 = others (by lpr).
    indivi["quimen"] = 0
    indivi.quimen[indivi.lpr == 1] = 0
    indivi.quimen[indivi.lpr == 2] = 1
    indivi.quimen[indivi.lpr == 3] = 2
    indivi.quimen[indivi.lpr == 4] = 3
    indivi['not_pr_cpr'] = None  # Create a new row
    indivi.not_pr_cpr[indivi.lpr <= 2] = False
    indivi.not_pr_cpr[indivi.lpr > 2] = True
    assert indivi.not_pr_cpr.isin([True, False]).all()

    log.info(u" 4.3 : Creating non pr=0 and cpr=1 idmen's")
    indivi.reset_index(inplace = True)
    # Renumber quimen for non-pr/cpr members until (quimen, idmen) is unique.
    test1 = indivi[['quimen', 'idmen']][indivi.not_pr_cpr].copy()
    test1['quimen'] = 2
    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1
    print_id(indivi)
    indivi.update(test1)
    print_id(indivi)
    # indivi.set_index(['quimen']) #TODO: check relevance

    # TODO: problem with some idfoy that have no "vous"
    log.info(u"Etape 5 : Gestion des idfoy qui n'ont pas de vous")
    all_ind = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi.quifoy == 'vous', 'idfoy']
    without = all_ind[~(all_ind.idfoy.isin(with_.values))]
    log.info(u"On cherche si le déclarant donné par la deuxième déclaration est bien un vous")
    # TODO: the following should be delt with at the import of the tables
    indivi.replace(
        to_replace = {
            'declar2': {'NA': np.nan, '': np.nan}
            },
        inplace = True
        )
    has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull())
    decl2_idfoy = (
        indivi.loc[has_declar2, "idmen"].astype('int') * 100 +
        indivi.loc[has_declar2, "declar2"].str[0:2].astype('int')
        )
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del all_ind, with_, without, has_declar2
    log.info(u" 5.1 : Elimination idfoy restant")
    # Drop the remaining tax units without a "vous" declarant.
    idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    # Select the variables to keep for the following steps.
    myvars = [
        "actrec",
        "age",
        "age_en_mois",
        "chpub",
        "encadr",
        "idfoy",
        "idmen",
        "nbsala",
        "noi",
        "noindiv",
        "prosa",
        "quelfic",
        "quifoy",
        "quimen",
        "statut",
        "titc",
        "txtppb",
        "wprm",
        "rc1rev",
        "maahe",
        ]
    assert len(set(myvars).difference(set(indivi.columns))) == 0, \
        "Manquent les colonnes suivantes : {}".format(set(myvars).difference(set(indivi.columns)))
    indivi = indivi[myvars].copy()
    # TODO: the actrec of the FIP are not coded (this will be done at the end,
    # once the information from the declarations has been gathered).

    log.info(u"Etape 6 : Création des variables descriptives")
    log.info(u" 6.1 : variable activité")
    indivi['activite'] = None
    indivi['activite'][indivi.actrec <= 3] = 0
    indivi['activite'][indivi.actrec == 4] = 1
    indivi['activite'][indivi.actrec == 5] = 2
    indivi['activite'][indivi.actrec == 7] = 3
    indivi['activite'][indivi.actrec == 8] = 4
    indivi['activite'][indivi.age <= 13] = 2  # these are actually the actrec=9 cases
    log.info("{}".format(indivi['activite'].value_counts(dropna = False)))
    # TODO: MBJ problem avec les actrec
    # TODO: FIX AND REMOVE
    indivi.activite[indivi.actrec.isnull()] = 5
    indivi.titc[indivi.titc.isnull()] = 0
    assert indivi.titc.notnull().all(), u"Problème avec les titc"
    # There are 420 NaN for the variables statut, titc, etc.

    log.info(u" 6.2 : variable statut")
    # Recode statut from the raw 2-digit codes onto 1..11.
    indivi.statut[indivi.statut.isnull()] = 0
    indivi.statut = indivi.statut.astype('int')
    indivi.statut[indivi.statut == 11] = 1
    indivi.statut[indivi.statut == 12] = 2
    indivi.statut[indivi.statut == 13] = 3
    indivi.statut[indivi.statut == 21] = 4
    indivi.statut[indivi.statut == 22] = 5
    indivi.statut[indivi.statut == 33] = 6
    indivi.statut[indivi.statut == 34] = 7
    indivi.statut[indivi.statut == 35] = 8
    indivi.statut[indivi.statut == 43] = 9
    indivi.statut[indivi.statut == 44] = 10
    indivi.statut[indivi.statut == 45] = 11
    assert indivi.statut.isin(range(12)).all(), u"statut value over range"

    log.info(u" 6.3 : variable txtppb")
    indivi.txtppb.fillna(0, inplace = True)
    assert indivi.txtppb.notnull().all()
    indivi.nbsala.fillna(0, inplace = True)
    indivi['nbsala'] = indivi.nbsala.astype('int')
    indivi.nbsala[indivi.nbsala == 99] = 10
    assert indivi.nbsala.isin(range(11)).all()

    log.info(u" 6.4 : variable chpub et CSP")
    indivi.chpub.fillna(0, inplace = True)
    indivi.chpub = indivi.chpub.astype('int')
    indivi.chpub[indivi.chpub.isnull()] = 0
    assert indivi.chpub.isin(range(11)).all()
    indivi['cadre'] = 0
    indivi.prosa.fillna(0, inplace = True)
    assert indivi['prosa'].notnull().all()
    log.info("{}".format(indivi['encadr'].value_counts(dropna = False)))
    # encadr: 1 = yes, 2 = no
    indivi.encadr.fillna(2, inplace = True)
    indivi.encadr[indivi.encadr == 0] = 2
    assert indivi.encadr.notnull().all()
    assert indivi.encadr.isin([1, 2]).all()
    # cadre = 1 for prosa 7/8, or prosa 9 with a management role (encadr == 1).
    indivi['cadre'][indivi.prosa.isin([7, 8])] = 1
    indivi['cadre'][(indivi.prosa == 9) & (indivi.encadr == 1)] = 1
    assert indivi['cadre'].isin(range(2)).all()

    log.info(
        u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence")
    log.info(
        u"nb de doublons idfam/quifam {}".format(len(indivi[indivi.duplicated(cols=['idfoy', 'quifoy'])])))
    log.info(u"On crée les n° de personnes à charge")
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    # Convert quifoy from labels to numeric positions (vous=0, conj=1, pac=2).
    indivi['quifoy2'] = 2
    indivi.quifoy2[indivi.quifoy == 'vous'] = 0
    indivi.quifoy2[indivi.quifoy == 'conj'] = 1
    indivi.quifoy2[indivi.quifoy == 'pac'] = 2
    del indivi['quifoy']
    indivi['quifoy'] = indivi.quifoy2
    del indivi['quifoy2']
    print_id(indivi)
    # Renumber dependents (quifoy == 2) until (quifoy, idfoy) is unique.
    test2 = indivi[['quifoy', 'idfoy', 'noindiv']][indivi['quifoy'] == 2].copy()
    print_id(test2)
    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1
    print_id(test2)
    indivi = indivi.merge(test2, on = ['noindiv', 'idfoy'], how = "left")
    indivi['quifoy'] = indivi['quifoy_x']
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)
    del test2, fip
    log.info(
        u"nb de doublons idfam/quifam' {}".format(
            len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])
            )
        )
    print_id(indivi)

    log.info(u"Etape 8 : création des fichiers totaux")
    famille = temporary_store['famc_{}'.format(year)]
    log.info(u" 8.1 : création de tot2 & tot3")
    tot2 = indivi.merge(famille, on = 'noindiv', how = 'inner')
    # del famille
    # TODO: MBJ increase in number of menage/foyer when merging with family ...
    del famille
    control(tot2, debug = True, verbose = True)
    assert tot2.quifam.notnull().all()
    temporary_store['tot2_{}'.format(year)] = tot2
    del indivi
    log.info(u" tot2 saved")
    # NOTE(review): the merge result below is discarded (not assigned back to
    # tot2) — confirm whether this line is dead code.
    tot2.merge(foyer, how = 'left')
    tot2 = tot2[tot2.idmen.notnull()].copy()
    print_id(tot2)
    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(subset = 'noindiv')
    log.info("{}".format(len(tot3)))
    # Block to remove any unwanted duplicated pair
    control(tot3, debug = True, verbose = True)
    tot3 = tot3.drop_duplicates(subset = ['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(subset = ['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(subset = ['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(subset = ['noindiv'])
    control(tot3)

    log.info(u" 8.2 : On ajoute les variables individualisables")
    allvars = temporary_store['ind_vars_to_remove_{}'.format(year)]
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    log.info("{}".format(len(tot3)))
    assert not(tot3.duplicated(subset = ['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(lg_dup)
    temporary_store['tot3_{}'.format(year)] = tot3
    control(tot3)
    del tot2, allvars, tot3, vars2
    log.info(u"tot3 sauvegardé")
    gc.collect()
def build_homogeneisation_caracteristiques_sociales(temporary_store = None, year = None): u"""Homogénéisation des caractéristiques sociales des ménages """ assert temporary_store is not None assert year is not None # Load data bdf_survey_collection = SurveyCollection.load( collection = 'budget_des_familles', config_files_directory = config_files_directory) survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year)) # ****************************************************************************************************************** # * Etape n° 0-3 : HOMOGENEISATION DES CARACTERISTIQUES SOCIALES DES MENAGES # ****************************************************************************************************************** # ****************************************************************************************************************** if year == 1995: kept_variables = ['exdep', 'exrev', 'mena', 'v', 'ponderrd', 'nbpers', 'nbenf', 'typmen1', 'cohabpr', 'sexepr', 'agepr', 'agecj', 'matripr', 'occuppr', 'occupcj', 'nbact', 'sitlog', 'stalog', 'mena', 'nm14a', 'typmen1'] menage = survey.get_values( table = "socioscm", variables = kept_variables, ) # cette étape permet de ne garder que les données dont on est sûr de la qualité et de la véracité # exdep = 1 si les données sont bien remplies pour les dépenses du ménage # exrev = 1 si les données sont bien remplies pour les revenus du ménage menage = menage[(menage.exdep == 1) & (menage.exrev == 1)] menage.rename( columns = { 'v': 'vag', 'mena': 'ident_men', 'ponderrd': 'pondmen', 'nbpers': 'npers', 'nm14a': 'nenfants', 'nbenf': 'nenfhors', 'nbact': 'nactifs', 'cohabpr': 'couplepr', 'matripr': 'etamatri', 'typmen1': 'typmen' }, inplace = True, ) # la variable vag est utilisée dans les modèles QAIDS et AIDS comme variable temporelle afin d'attibuer # le bon prix mensuel menage.agecj = menage.agecj.fillna(0) menage.nenfhors = menage.nenfhors.fillna(0) menage.vag = menage.vag.astype('int') 
menage['nadultes'] = menage['npers'] - menage['nenfants'] menage['ocde10'] = 1 + 0.5 * numpy.maximum(0, menage['nadultes'] - 1) + 0.3 * menage['nenfants'] # harmonisation des types de ménage sur la nomenclature 2010 menage['typmen_'] = menage['typmen'] menage.typmen[menage.typmen_ == 1] = 1 menage.typmen[menage.typmen_ == 2] = 3 menage.typmen[menage.typmen_ == 3] = 4 menage.typmen[menage.typmen_ == 4] = 4 menage.typmen[menage.typmen_ == 5] = 4 menage.typmen[menage.typmen_ == 6] = 2 menage.typmen[menage.typmen_ == 7] = 5 del menage['typmen_'] var_to_ints = ['couplepr', 'etamatri'] for var_to_int in var_to_ints: menage[var_to_int] = menage[var_to_int].astype(int) # Methode : # 1. on nettoite les variables (i.e. changement de nom de format) # 2. Reformatage des variables (réattribution des catégories pour quelles soient identiques # pour les différentes années) menage["situacj"] = 0 menage.situacj[menage.occupcj == 1] = 1 menage.situacj[menage.occupcj == 3] = 3 menage.situacj[menage.occupcj == 2] = 4 menage.situacj[menage.occupcj == 5] = 5 menage.situacj[menage.occupcj == 6] = 5 menage.situacj[menage.occupcj == 7] = 6 menage.situacj[menage.occupcj == 8] = 7 menage.situacj[menage.occupcj == 4] = 8 menage["situapr"] = 0 menage.situapr[menage.occuppr == 1] = 1 menage.situapr[menage.occuppr == 3] = 3 menage.situapr[menage.occuppr == 2] = 4 menage.situapr[menage.occuppr == 5] = 5 menage.situapr[menage.occuppr == 6] = 5 menage.situapr[menage.occuppr == 7] = 6 menage.situapr[menage.occuppr == 8] = 7 menage.situapr[menage.occuppr == 4] = 8 menage["typlog"] = 0 menage.typlog[menage.sitlog == 1] = 1 menage.typlog[menage.sitlog != 1] = 2 menage['stalog'] = menage['stalog'].astype(int) individus = survey.get_values( table = "individu", ) variables = ['mena', 'v'] individus.rename( columns = {'mena': 'identmen'}, inplace = True, ) menage.set_index('ident_men', inplace = True) if year == 2000: menage = survey.get_values( table = "menage", variables = [ 'ident', 'pondmen', 'nbact', 
'nbenf1', 'nbpers', 'ocde10', 'sitlog', 'stalog', 'strate', 'typmen1', 'zeat', 'stalog', 'vag', 'sexepr', 'sexecj', 'agecj', 'napr', 'nacj', 'cs2pr', 'cs2cj', 'diegpr', 'dieppr', 'diespr', 'diegcj', 'diepcj', 'diescj', 'hod_nb', 'cohabpr', 'occupapr', 'occupacj', 'occupbpr', 'occupbcj', 'occupcpr', 'occupccj', 'typmen1' ] ) menage.rename( columns = { 'cohabpr': 'couplepr', 'hod_nb': 'nenfhors', 'ident': 'ident_men', 'nbact': 'nactifs', 'nbenf1': 'nenfants', 'nbpers': 'npers', 'rev81': 'poste_coicop_421', 'typmen1': 'typmen' }, inplace = True, ) menage.ocde10 = menage.ocde10 / 10 # on met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles # pour le modèle de demande menage.agecj = menage.agecj.fillna(0) assert menage.notnull().all().all(), 'The following variables contains NaN values: {}'.format( list(menage.isnull().any()[menage.isnull().any()].index)) menage['vag_'] = menage['vag'] menage.vag.loc[menage.vag_ == 1] = 9 menage.vag.loc[menage.vag_ == 2] = 10 menage.vag.loc[menage.vag_ == 3] = 11 menage.vag.loc[menage.vag_ == 4] = 12 menage.vag.loc[menage.vag_ == 5] = 13 menage.vag.loc[menage.vag_ == 6] = 14 menage.vag.loc[menage.vag_ == 7] = 15 menage.vag.loc[menage.vag_ == 8] = 16 del menage['vag_'] # harmonisation des types de ménage sur la nomenclature 2010 menage['typmen_'] = menage['typmen'] menage.typmen.loc[menage.typmen_ == 1] = 1 menage.typmen.loc[menage.typmen_ == 2] = 3 menage.typmen.loc[menage.typmen_ == 3] = 4 menage.typmen.loc[menage.typmen_ == 4] = 4 menage.typmen.loc[menage.typmen_ == 5] = 4 menage.typmen.loc[menage.typmen_ == 6] = 2 menage.typmen.loc[menage.typmen_ == 7] = 5 del menage['typmen_'] menage.couplepr = menage.couplepr.astype('int') menage["nadultes"] = menage['npers'] - menage['nenfants'] menage.typmen = menage.typmen.astype('int') # occupa : 1 si la personne travaille, 2 sinon. occupb : 1 si elle travaille effectivement, 2 si congé de # longue durée (négligé ici). 
occupc : de 2 à 8 selon le statut si ne travaille pas (étudiant, retraité, etc.) menage["situacj"] = 0 menage.situacj.loc[menage.occupacj == 1] = 1 menage.situacj.loc[menage.occupccj == 3] = 3 menage.situacj.loc[menage.occupccj == 2] = 4 menage.situacj.loc[menage.occupccj == 5] = 5 menage.situacj.loc[menage.occupccj == 6] = 5 menage.situacj.loc[menage.occupccj == 7] = 6 menage.situacj.loc[menage.occupccj == 8] = 7 menage.situacj.loc[menage.occupccj == 4] = 8 menage["situapr"] = 0 menage.situapr.loc[menage.occupapr == 1] = 1 menage.situapr.loc[menage.occupcpr == 3] = 3 menage.situapr.loc[menage.occupcpr == 2] = 4 menage.situapr.loc[menage.occupcpr == 5] = 5 menage.situapr.loc[menage.occupcpr == 6] = 5 menage.situapr.loc[menage.occupcpr == 7] = 6 menage.situapr.loc[menage.occupcpr == 8] = 7 menage.situapr.loc[menage.occupcpr == 4] = 8 menage["natiocj"] = 0 menage["natiopr"] = 0 menage.natiocj.loc[menage.nacj == 1] = 1 menage.natiocj.loc[menage.nacj == 2] = 1 menage.natiocj.loc[menage.nacj == 3] = 2 menage.natiopr.loc[menage.napr == 1] = 1 menage.natiopr.loc[menage.napr == 2] = 1 menage.natiopr.loc[menage.napr == 3] = 2 menage["typlog"] = 0 menage.typlog.loc[menage.sitlog == 1] = 1 menage.typlog.loc[menage.sitlog != 1] = 2 # Homogénéisation des diplômes, choix d'équivalence entre les diplômes menage["dip14pr"] = 999999 menage.dip14pr.loc[menage.diegpr == 0] = 71 menage.dip14pr.loc[menage.diegpr == 2] = 70 menage.dip14pr.loc[menage.diegpr == 15] = 60 menage.dip14pr.loc[menage.diegpr == 18] = 60 menage.dip14pr.loc[menage.diegpr == 16] = 41 menage.dip14pr.loc[menage.diegpr == 17] = 41 menage.dip14pr.loc[menage.diegpr == 19] = 41 menage.dip14pr.loc[menage.dieppr == 23] = 50 menage.dip14pr.loc[menage.dieppr == 25] = 50 menage.dip14pr.loc[menage.dieppr == 27] = 50 menage.dip14pr.loc[menage.dieppr == 29] = 50 menage.dip14pr.loc[menage.dieppr == 34] = 43 menage.dip14pr.loc[menage.dieppr == 32] = 42 menage.dip14pr.loc[menage.dieppr == 36] = 42 menage.dip14pr.loc[menage.diespr 
== 41] = 30 menage.dip14pr.loc[menage.diespr == 42] = 31 menage.dip14pr.loc[menage.diespr == 43] = 31 menage.dip14pr.loc[menage.diespr == 44] = 33 menage.dip14pr.loc[menage.diespr == 46] = 20 menage.dip14pr.loc[menage.diespr == 48] = 12 menage.dip14pr.loc[menage.diespr == 47] = 10 menage.set_index('ident_men', inplace = True) # Recodage des catégories zeat menage.zeat.loc[menage.zeat == 7] = 6 menage.zeat.loc[menage.zeat == 8] = 7 menage.zeat.loc[menage.zeat == 9] = 8 assert menage.zeat.isin(range(1, 9)).all() individus = survey.get_values( table = "individus", variables = ['ident', 'matri', 'lien', 'anais'] ) individus = individus.loc[individus.lien == 1].copy() individus.rename( columns = {'ident': 'ident_men', 'matri': 'etamatri'}, inplace = True, ) variables_to_destring = ['anais'] for variable_to_destring in variables_to_destring: individus[variable_to_destring] = individus[variable_to_destring].astype('int').copy() individus['agepr'] = year - individus.anais individus.set_index('ident_men', inplace = True) assert menage.notnull().all().all(), 'The following variables contains NaN values: {}'.format( list(menage.isnull().any()[menage.isnull().any()].index)) menage = menage.merge(individus, left_index = True, right_index = True) if year == 2005: menage = survey.get_values(table = "menage") # données socio-démographiques socio_demo_variables = ['agpr', 'agcj', 'couplepr', 'decuc', 'ident_men', 'nactifs', 'nenfants', 'nenfhors', 'npers', 'ocde10', 'pondmen', 'sexecj', 'sexepr', 'typmen5', 'vag', 'zeat', 'cs24pr'] socio_demo_variables += [column for column in menage.columns if column.startswith('dip14')] socio_demo_variables += [column for column in menage.columns if column.startswith('natio7')] # activité professionnelle activite_prof_variables = ['situacj', 'situapr'] activite_prof_variables += [column for column in menage.columns if column.startswith('cs42')] # logement logement_variables = ['htl', 'strate'] menage = menage[socio_demo_variables + 
activite_prof_variables + logement_variables] menage.rename( columns = { # "agpr": "agepr", "agcj": "agecj", "typmen5": "typmen", "cs24pr": "cs_pr" }, inplace = True, ) del menage['agpr'] menage['nadultes'] = menage.npers - menage.nenfants for person in ['pr', 'cj']: menage['natio' + person] = (menage['natio7' + person] > 2) # TODO: changer de convention ? del menage['natio7' + person] menage.agecj = menage.agecj.fillna(0) menage.nenfhors = menage.nenfhors.fillna(0) var_to_ints = ['ocde10', 'decuc', 'nactifs', 'nenfants', 'npers', 'pondmen', 'nadultes'] assert menage.notnull().all().all(), 'The following variables contains NaN values: {}'.format( list(menage.isnull().any()[menage.isnull().any()].index)) menage.couplepr = menage.couplepr > 2 # TODO: changer de convention ? menage.ocde10 = menage.ocde10 / 10 menage.set_index('ident_men', inplace = True) # on met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles # pour le modèle de demande menage['vag_'] = menage['vag'] menage.vag.loc[menage.vag_ == 1] = 17 menage.vag.loc[menage.vag_ == 2] = 18 menage.vag.loc[menage.vag_ == 3] = 19 menage.vag.loc[menage.vag_ == 4] = 20 menage.vag.loc[menage.vag_ == 5] = 21 menage.vag.loc[menage.vag_ == 6] = 22 del menage['vag_'] # Recodage des catégories zeat menage.zeat.loc[menage.zeat == 7] = 6 menage.zeat.loc[menage.zeat == 8] = 7 menage.zeat.loc[menage.zeat == 9] = 8 assert menage.zeat.isin(range(1, 9)).all() stalog = survey.get_values(table = "depmen", variables = ['ident_men', 'stalog']) stalog['stalog'] = stalog.stalog.astype('int').copy() stalog['new_stalog'] = 0 stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1 stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2 stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3 stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4 stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5 stalog.stalog = stalog.new_stalog.copy() del stalog['new_stalog'] assert stalog.stalog.isin(range(1, 6)).all() 
stalog.set_index('ident_men', inplace = True) menage = menage.merge(stalog, left_index = True, right_index = True) menage['typlog'] = 2 menage.loc[menage.htl.isin(['1', '5']), 'typlog'] = 1 assert menage.typlog.isin([1, 2]).all() del menage['htl'] individus = survey.get_values(table = 'individu') # Il y a un problème sur l'année de naissance, # donc on le recalcule avec l'année de naissance et la vague d'enquête individus['agepr'] = year - individus.anais individus.loc[individus.vag == 6, ['agepr']] = year + 1 - individus.anais individus = individus[individus.lienpref == 00].copy() kept_variables = ['ident_men', 'etamatri', 'agepr'] individus = individus[kept_variables].copy() individus.etamatri.loc[individus.etamatri == 0] = 1 individus['etamatri'] = individus['etamatri'].astype('int') # MBJ TODO: define as a catagory ? individus.set_index('ident_men', inplace = True) menage = menage.merge(individus, left_index = True, right_index = True) individus = survey.get_values( table = 'individu', variables = ['ident_men', 'ident_ind', 'age', 'anais', 'vag', 'lienpref'], ) # Il y a un problème sur l'année de naissance, # donc on le recalcule avec l'année de naissance et la vague d'enquête individus['age'] = year - individus.anais individus.loc[individus.vag == 6, ['age']] = year + 1 - individus.anais # Garder toutes les personnes du ménage qui ne sont pas la personne de référence et le conjoint individus = individus[(individus.lienpref != 00) & (individus.lienpref != 01)].copy() individus.sort_values(by = ['ident_men', 'ident_ind'], inplace = True) # Inspired by http://stackoverflow.com/questions/17228215/enumerate-each-row-for-each-group-in-a-dataframe def add_col_numero(data_frame): data_frame['numero'] = numpy.arange(len(data_frame)) + 3 return data_frame individus = individus.groupby(by = 'ident_men').apply(add_col_numero) pivoted = individus.pivot(index = 'ident_men', columns = "numero", values = 'age') pivoted.columns = ["age{}".format(column) for column in 
pivoted.columns] menage = menage.merge(pivoted, left_index = True, right_index = True, how = 'outer') individus = survey.get_values( table = 'individu', variables = ['ident_men', 'ident_ind', 'agfinetu', 'lienpref'], ) individus.set_index('ident_men', inplace = True) pr = individus.loc[individus.lienpref == 00, 'agfinetu'].copy() conjoint = individus.loc[individus.lienpref == 01, 'agfinetu'].copy() conjoint.name = 'agfinetu_cj' agfinetu_merged = pandas.concat([pr, conjoint], axis = 1) menage = menage.merge(agfinetu_merged, left_index = True, right_index = True) temporary_store['donnes_socio_demog_{}'.format(year)] = menage # label var agepr "Age de la personne de référence au 31/12/${yearrawdata}" # label var agecj "Age du conjoint de la PR au 31/12/${yearrawdata}" # label var sexepr "Sexe de la personne de référence" # label var sexecj "Sexe du conjoint de la PR" # label var cs42pr "Catégorie socio-professionnelle de la PR" # label var cs42cj "Catégorie socio-professionnelle du conjoint de la PR" # label var ocde10 "Nombre d'unités de consommation (échelle OCDE)" # label var ident_men "Identifiant du ménage" # label var pondmen "Ponderation du ménage" # label var npers "Nombre total de personnes dans le ménage" # label var nadultes "Nombre d'adultes dans le ménage" # label var nenfants "Nombre d'enfants dans le ménage" # label var nenfhors "Nombre d'enfants vivant hors domicile" # label var nactifs "Nombre d'actifs dans le ménage" # label var couplepr "Vie en couple de la personne de référence" # label define typmen5 1 "Personne seule" 2 "Famille monoparentale" 3 "Couple sans enfant" # 4 "Couple avec enfants" 5 "Autre type de ménage (complexe)" # label values typmen5 typmen5 # label var typmen5 "Type de ménage (5 modalités)" # label var etamatri "Situation matrimoniale de la personne de référence" # label define matripr 1 "Célibataire" 2 "Marié(e)" 3 "Veuf(ve)" 4 "Divorcé(e)" # label values etamatri matripr # label define occupation 1 "Occupe un emploi" /// # 2 
"Apprenti" /// # 3 "Etudiant, élève, en formation" /// # 4 "Chômeur (inscrit ou non à l'ANPE)" /// # 5 "Retraité, préretraité ou retiré des affaires" /// # 6 "Au foyer" /// # 7 "Autre situation (handicapé)" /// # 8 "Militaire du contingent" # label values situapr occupation # label values situacj occupation # label var situapr "Situation d'activité de la personne de référence" # label var situacj "Situation d'activité du conjoint de la PR" # label define diplome 10 "Diplôme de 3ème cycle universitaire, doctorat" /// # 12 "Diplôme d'ingénieur, grande école" /// # 20 "Diplôme de 2nd cycle universitaire" /// # 30 "Diplôme de 1er cycle universitaire" /// # 31 "BTS, DUT ou équivalent" /// # 33 "Diplôme des professions sociales et de la santé niveau Bac +2" /// # 41 "Baccalauréat général, brevet supérieur, capacité en droit" /// # 42 "Baccalauréat technologique" /// # 43 "Baccalauréat professionnel" /// # 44 "Brevet professionnel ou de technicien" /// # 50 "CAP, BEP ou diplôme de même niveau" /// # 60 "Brevet des collèges, BEPC" /// # 70 "Certificat d'études primaires" /// # 71 "Aucun diplôme" # label values dip14pr diplome # label values dip14cj diplome # label var dip14pr "Diplôme le plus élevé de la PR" # label var dip14cj "Diplôme le plus élevé du conjoint de la PR" # label define nationalite 1 "Français, par naissance ou naturalisation" 2 "Etranger" # label values natiopr nationalite # label values natiocj nationalite # label var natiopr "Nationalité de la personne de référence" # label var natiocj "Nationalité du conjoint de la PR" # label define logement 1 "Maison" 2 "Appartement" # label values typlog logement # label var typlog "Type de logement" # label define statutlogement 1 "Propriétaire ou copropriétaire" /// # 2 "Accédant à la propriété (rembourse un prêt)" /// # 3 "Locataire" /// # 4 "Sous-locataire" /// # 5 "Logé gratuitement" # label values stalog statutlogement # label var stalog "Statut d'occupation du logement" # label define viecouple 1 "Vit en 
couple" 2 "Ne vit pas en couple" # label values couplepr viecouple # # /* Recodage des CSP en 12 et 8 postes à partir de classification de l'INSEE (2003, PCS niveaux 1 et 2) */ # gen cs24pr=00 # replace cs24pr=10 if cs42pr=="11" # replace cs24pr=10 if cs42pr=="12" # replace cs24pr=10 if cs42pr=="13" # replace cs24pr=21 if cs42pr=="21" # replace cs24pr=22 if cs42pr=="22" # replace cs24pr=23 if cs42pr=="23" # replace cs24pr=31 if cs42pr=="31" # replace cs24pr=32 if cs42pr=="33" # replace cs24pr=32 if cs42pr=="34" # replace cs24pr=32 if cs42pr=="35" # replace cs24pr=36 if cs42pr=="37" # replace cs24pr=36 if cs42pr=="38" # replace cs24pr=41 if cs42pr=="42" # replace cs24pr=41 if cs42pr=="43" # replace cs24pr=41 if cs42pr=="44" # replace cs24pr=41 if cs42pr=="45" # replace cs24pr=46 if cs42pr=="46" # replace cs24pr=47 if cs42pr=="47" # replace cs24pr=48 if cs42pr=="48" # replace cs24pr=51 if cs42pr=="52" # replace cs24pr=51 if cs42pr=="53" # replace cs24pr=54 if cs42pr=="54" # replace cs24pr=55 if cs42pr=="55" # replace cs24pr=56 if cs42pr=="56" # replace cs24pr=61 if cs42pr=="62" # replace cs24pr=61 if cs42pr=="63" # replace cs24pr=61 if cs42pr=="64" # replace cs24pr=61 if cs42pr=="65" # replace cs24pr=66 if cs42pr=="67" # replace cs24pr=66 if cs42pr=="68" # replace cs24pr=69 if cs42pr=="69" # replace cs24pr=71 if cs42pr=="71" # replace cs24pr=72 if cs42pr=="72" # replace cs24pr=73 if cs42pr=="74" # replace cs24pr=73 if cs42pr=="75" # replace cs24pr=76 if cs42pr=="77" # replace cs24pr=76 if cs42pr=="78" # replace cs24pr=81 if cs42pr=="81" # replace cs24pr=82 if cs42pr=="83" # replace cs24pr=82 if cs42pr=="84" # replace cs24pr=82 if cs42pr=="85" # replace cs24pr=82 if cs42pr=="86" # replace cs24pr=82 if cs42pr=="**" # replace cs24pr=82 if cs42pr=="00" # menage['cs24pr'] = 0 csp42s_by_csp24 = { 10: ["11", "12", "13"], 21: ["21"], 22: ["22"], 23: ["23"], 31: ["31"], 32: ["32", "33", "34", "35"], 36: ["37", "38"], 41: ["42", "43", "44", "45"], 46: ["46"], 47: ["47"], 48: 
["48"], 51: ["52", "53"], 54: ["54"], 55: ["55"], 56: ["56"], 61: ["62", "63", "64", "65"], 66: ["67", "68"], 69: ["69"], 71: ["71"], 72: ["72"], 73: ["74", "75"], 76: ["77", "78"], 81: ["81"], 82: ["83", "84", "85", "86", "**", "00"], } for csp24, csp42s in csp42s_by_csp24.items(): menage.loc[menage.cs42pr.isin(csp42s), 'cs24pr'] = csp24 assert menage.cs24pr.isin(csp42s_by_csp24.keys()).all() menage['cs8pr'] = numpy.floor(menage.cs24pr / 10) assert menage.cs8pr.isin(range(1, 9)).all() variables = [ 'pondmen', 'npers', 'nenfants', 'nenfhors', 'nadultes', 'nactifs', 'ocde10', 'typmen', 'sexepr', 'agepr', 'etamatri', 'couplepr', 'situapr', 'dip14pr', 'cs42pr', 'cs24pr', 'cs8pr', 'natiopr', 'sexecj', 'agecj', 'situacj', 'dip14cj', 'cs42cj', 'natiocj', 'typlog', 'stalog' ] + ["age{}".format(age) for age in range(3, 14)] for variable in variables: assert variable in menage.columns, "{} is not a column of menage data frame".format(variable) if year == 2011: variables = [ 'agecj', 'agepr', 'coeffuc', 'decuc1', 'ident_me', 'pondmen', 'npers', 'nenfants', 'nactifs', 'sexepr', 'sexecj', 'dip14cj', 'dip14pr', 'typmen5', 'cataeu', 'situapr', 'situacj', 'zeat', ] try: menage = survey.get_values(table = "MENAGE", variables = variables) except: menage = survey.get_values(table = "menage", variables = variables) menage.rename( columns = { 'ident_me': 'ident_men', 'coeffuc': 'ocde10', 'typmen5': 'typmen', 'decuc1': 'decuc', 'cataeu': 'strate' }, inplace = True, ) del variables menage.agecj = menage.agecj.fillna(0) # Ajout de la variable vag try: depmen = survey.get_values(table = "DEPMEN") except: depmen = survey.get_values(table = "depmen") depmen.rename(columns = {'ident_me': 'ident_men'}, inplace = True) vague = depmen[['vag', 'ident_men']].copy() stalog = depmen[['stalog', 'ident_men']].copy() del depmen menage.set_index('ident_men', inplace = True) vague.set_index('ident_men', inplace = True) menage = menage.merge(vague, left_index = True, right_index = True) # On met un 
numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles pour # le modèle de demande menage['vag_'] = menage['vag'].copy() menage.vag.loc[menage.vag_ == 1] = 23 menage.vag.loc[menage.vag_ == 2] = 24 menage.vag.loc[menage.vag_ == 3] = 25 menage.vag.loc[menage.vag_ == 4] = 26 menage.vag.loc[menage.vag_ == 5] = 27 menage.vag.loc[menage.vag_ == 6] = 28 del menage['vag_'] # Homogénéisation de la variable statut du logement qui prend des valeurs différentes pour 2011 stalog['stalog'] = stalog.stalog.astype('int').copy() stalog['new_stalog'] = 0 stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1 stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2 stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3 stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4 stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5 stalog.stalog = stalog.new_stalog.copy() del stalog['new_stalog'] assert stalog.stalog.isin(range(1, 6)).all() stalog.set_index('ident_men', inplace = True) menage = menage.merge(stalog, left_index = True, right_index = True) # Recodage des catégories zeat menage.loc[menage.zeat == 7, 'zeat'] = 6 menage.zeat.loc[menage.zeat == 8] = 7 menage.zeat.loc[menage.zeat == 9] = 8 assert menage.zeat.isin(range(0, 9)).all() menage.index.name = 'ident_men' # assert menage.index.name == 'ident_men' menage['role_menage'] = 0 temporary_store['donnes_socio_demog_{}'.format(year)] = menage
def create_enfants_a_naitre(temporary_store=None, year=None):
    """Build and store the table of children about to be born ("enfants à naître").

    Stacks the three complementary EEC individual tables of the ERFS survey,
    derives a few helper variables, and keeps only the babies born at the
    very end of ``year`` (October or later) or at the beginning of
    ``year + 1`` (May or earlier).  The result is saved in
    ``temporary_store`` under the key ``enfants_a_naitre_<year>``.
    """
    assert temporary_store is not None
    assert year is not None

    survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    erfs_survey = survey_collection.get_survey('erfs_{}'.format(year))

    # Columns extracted from each complementary table (NN = nouveaux nés).
    individual_vars = [
        'acteu', 'agepr', 'cohab', 'contra', 'forter', 'ident', 'lien',
        'lpr', 'mrec', 'naia', 'naim', 'noi', 'noicon', 'noimer', 'noindiv',
        'noiper', 'retrai', 'rga', 'rstg', 'sexe', 'stc', 'titc',
        ]
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    # Outer-merge the three complementary tables into a single frame.
    enfants_a_naitre = None
    for generic_name in ("eec_cmp_1", "eec_cmp_2", "eec_cmp_3"):
        part = erfs_survey.get_values(
            table=year_specific_by_generic[generic_name], variables=individual_vars)
        if enfants_a_naitre is None:
            enfants_a_naitre = part
        else:
            enfants_a_naitre = enfants_a_naitre.merge(part, how="outer")

    # Dtype sanity check of the extracted columns.
    # TODO: minimal dtype
    # TODO: shouldn't be here
    for variable_name in individual_vars:
        assert_dtype(enfants_a_naitre[variable_name], 'float')
    del part, individual_vars
    gc.collect()

    # Variable creation.
    enfants_a_naitre['declar1'] = ''
    enfants_a_naitre['noidec'] = 0
    enfants_a_naitre['ztsai'] = 0
    enfants_a_naitre['year'] = year
    # TODO: should be an integer but NaN are present
    enfants_a_naitre.year = enfants_a_naitre.year.astype("float32")
    # Age at the end of the year; babies born in July or later are one year
    # younger at that date.
    enfants_a_naitre['agepf'] = enfants_a_naitre.year - enfants_a_naitre.naia
    enfants_a_naitre.loc[enfants_a_naitre.naim >= 7, 'agepf'] -= 1
    enfants_a_naitre['actrec'] = 9
    enfants_a_naitre['quelfic'] = 'ENF_NN'
    enfants_a_naitre['persfip'] = ""

    # TODO: deal with agepf
    for series_name in ['actrec', 'noidec', 'ztsai']:
        assert_dtype(enfants_a_naitre[series_name], "int")

    # Selection: only the babies born late in the survey year or early in the
    # following year are kept.
    born_late_this_year = (
        (enfants_a_naitre.naia == enfants_a_naitre.year)
        & (enfants_a_naitre.naim >= 10)
        )
    born_early_next_year = (
        (enfants_a_naitre.naia == enfants_a_naitre.year + 1)
        & (enfants_a_naitre.naim <= 5)
        )
    enfants_a_naitre = enfants_a_naitre[born_late_this_year | born_early_next_year].copy()
    temporary_store["enfants_a_naitre_{}".format(year)] = enfants_a_naitre
@author: malkaguillot """ import pandas from openfisca_france_data import default_config_files_directory as config_files_directory from openfisca_france_data.input_data_builders.build_openfisca_survey_data.base \ import year_specific_by_generic_data_frame_name from openfisca_france_data.temporary import get_store from openfisca_survey_manager.survey_collections import SurveyCollection # En entrée : tables individus, foyer et sif de ERFS (testé sur 2009) year = 2009 year_specific_by_generic = year_specific_by_generic_data_frame_name(year) erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory) survey = erfs_survey_collection.get_survey('erfs_{}'.format(year)) foyer_all = survey.get_values(table = year_specific_by_generic["foyer"]) erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"]) temporary_store = get_store(file_name = 'erfs') sif = temporary_store['sif'] ind = erfind[['ident', 'noindiv', 'declar1', 'declar2', 'zsali', 'persfip', 'persfipd']] small_sif = sif[['noindiv', 'declar', 'causeXYZ']] foyer = foyer_all[['ident', 'noindiv', 'declar', 'sif', '_1aj', '_1bj', '_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy' ]] foyer = foyer.drop(['_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy'], axis=1) foyer_sif = pandas.merge(foyer, small_sif, on = ['declar', 'noindiv'])
def create_indivim_menagem(temporary_store=None, year=None):
    """Create the merged (ERF x EEC) household and individual tables.

    Merges the ERF and EEC household tables into ``menagem`` and the ERF and
    EEC individual tables into ``indivim``, recodes the activity variable
    ``actrec`` as recommended by INSEE, and stores both frames in
    ``temporary_store`` under ``menagem_<year>`` and ``indivim_<year>``.
    """
    assert temporary_store is not None
    assert year is not None
    # Load data.
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table=year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table=year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table=year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table=year_specific_by_generic["eec_indivi"])

    # Consistency check between the EEC and ERF sources: EEC rows without a
    # fiscal (ERF) counterpart.
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()
    # BUG FIX: the unmatched individuals must be looked up in the individual
    # table (eecind), not in the household table (eecmen) — the boolean mask
    # is built from eecind and cannot index eecmen.
    noappar_i = eecind[~(eecind.ident.isin(erfind.ident.values))].copy()
    noappar_i = noappar_i.drop_duplicates(subset='ident', take_last=True)
    # TODO: check that there should theoretically be no duplicates
    # NOTE(review): `take_last=True` is the pre-0.17 pandas spelling of
    # `keep='last'`; kept as-is for the pinned pandas version.
    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(
        len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()

    # Merge the labour-force survey (EEC) with the fiscal source (ERF).
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on=['noindiv', 'ident', 'noi'], how="inner")

    # Dtype sanity check.
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = ([
        'acteu', 'agepr', 'cohab', 'contra', 'encadr', 'forter', 'lien', 'mrec', 'naia', 'noicon', 'noimer',
        'noiper', 'prosa', 'retrai', 'rstg', 'statut', 'stc', 'titc', 'txtppb',
        ])
    for var in var_list:
        assert numpy.issubdtype(
            indivim[var].dtype, numpy.integer
            ), "Variable {} dtype is {} and should be an integer".format(
                var, indivim[var].dtype)

    ########################
    # Variable creation    #
    ########################
    # actrec: activity recoded as recommended by INSEE, p. 84 of the user guide.
    indivim["actrec"] = numpy.nan
    # Note: there is no code 6 — the INSEE recoded variable (see p. 84 of the
    # methodological guide) has none, and the same nomenclature is used here.
    # 3: fixed-term contract
    indivim.actrec.loc[indivim.acteu == 1] = 3
    # 8: housewife/househusband, other inactive
    indivim.actrec.loc[indivim.acteu == 3] = 8
    # 1: employed, non-salaried (own account, or working for a family member)
    filter1 = (indivim.acteu == 1) & (indivim.stc.isin([1, 3]))
    indivim.actrec.loc[filter1] = 1
    # 2: salaried, unlimited-duration contract
    filter2 = (indivim.acteu == 1) & (((indivim.stc == 2) & (indivim.contra == 1)) | (indivim.titc == 2))
    indivim.actrec.loc[filter2] = 2
    # 4: unemployed
    filter4 = (indivim.acteu == 2) | ((indivim.acteu == 3) & (indivim.mrec == 1))
    indivim.actrec.loc[filter4] = 4
    # 5: pupil, student, unpaid trainee
    filter5 = (indivim.acteu == 3) & ((indivim.forter == 2) | (indivim.rstg == 1))
    indivim.actrec.loc[filter5] = 5
    # 7: retired, early-retired, withdrawn from business (unchecked)
    filter7 = (indivim.acteu == 3) & ((indivim.retrai == 1) | (indivim.retrai == 2))
    indivim.actrec.loc[filter7] = 7
    # 9: probably children under 16  TODO: check that fact in database and questionnaire
    indivim.actrec.loc[indivim.acteu == 0] = 9

    indivim.actrec = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(1, 10)).all(), 'actrec values are outside the interval [1, 9]'
    # TODO: compare the result with results provided by Insee

    # tu99
    if year == 2009:
        erfind['tu99'] = None  # TODO: why ?

    # Tenant indicator.
    menagem["locataire"] = menagem.so.isin([3, 4, 5])
    assert_dtype(menagem.locataire, "bool")

    # Bring the reference person's diploma (ddipl) onto the household table.
    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menagem = menagem.merge(transfert)

    # Corrections.
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        '''
        if year == 2006:
            indivim.lien[indivim.noindiv == 603018905] = 2
            indivim.noimer[indivim.noindiv == 603018905] = 1
            log.info("{}".format(
                indivim[indivim.noindiv == 603018905].to_string()))

    _manually_remove_errors()

    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def build_homogeneisation_revenus_menages(year = None):
    """Homogenize household income data and build a disposable-income proxy.

    For the given Budget des Familles wave (1995, 2000, 2005 or 2011),
    extracts the household income tables, harmonizes their column names,
    computes a proxy of disposable income (with and without imputed rent),
    and stores the result in ``temporary_store`` under ``revenus_<year>``.

    NOTE(review): ``temporary_store`` is read and written below but is not a
    parameter of this function (sibling builders take it as an argument) —
    presumably it resolves at module level; confirm.
    """
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    # The 1995 base distinguishes the housing tax (taxe d'habitation) from
    # property taxes (impôts fonciers); their relative shares are computed
    # here so they can be applied to the 2000 and 2005 waves.
    if year == 1995:
        menrev = survey.get_values(
            table = "menrev",
            variables = [
                'revtot', 'ir', 'irbis', 'imphab', 'impfon', 'revaid', 'revsal', 'revind', 'revsec', 'revret',
                'revcho', 'revfam', 'revlog', 'revinv', 'revrmi', 'revpat', 'mena', 'ponderr'
                ],
            )
        menage = survey.get_values(
            table = "socioscm",
            variables = ['exdep', 'exrev', 'mena']
            )
        # NOTE(review): this set_index is not inplace, so its result is
        # discarded and the merge below joins on the default integer indexes;
        # the menage2 block further down uses inplace = True instead.
        # Confirm whether this is intended.
        menage.set_index('mena')
        menrev = menrev.merge(menage, left_index = True, right_index = True)
        # Keep only the households whose data are reliable:
        # exdep = 1 when the expense data are correctly filled in,
        # exrev = 1 when the income data are correctly filled in.
        menrev = menrev[(menrev.exdep == 1) & (menrev.exrev == 1)]
        # Split local taxes into housing-tax / property-tax shares.
        menrev['foncier_hab'] = menrev.imphab + menrev.impfon
        menrev['part_IMPHAB'] = menrev.imphab / menrev.foncier_hab
        menrev['part_IMPFON'] = menrev.impfon / menrev.foncier_hab
        # Social income: pensions, unemployment, family, housing, invalidity, RMI.
        menrev['revsoc'] = (
            menrev.revret + menrev.revcho + menrev.revfam + menrev.revlog + menrev.revinv + menrev.revrmi
            )
        for variable in ['revcho', 'revfam', 'revinv', 'revlog', 'revret', 'revrmi']:
            del menrev[variable]
        # Labour income: wages, self-employment income, secondary income.
        menrev['revact'] = menrev['revsal'] + menrev['revind'] + menrev['revsec']
        menrev.rename(
            columns = dict(
                revpat = "revpat",
                impfon = "impfon",
                imphab = "imphab",
                revaid = "somme_obl_recue",
                ),
            inplace = True
            )
        menrev['impot_revenu'] = menrev['ir'] + menrev['irbis']

        # Second extraction of the same income table, this time indexed by the
        # household id and joined with the quality flags.
        rev_disp = survey.get_values(
            table = "menrev",
            variables = ['revtot', 'revret', 'revcho', 'revfam', 'revlog', 'revinv', 'revrmi', 'imphab', 'impfon',
                'revaid', 'revsal', 'revind', 'revsec', 'revpat', 'mena', 'ponderr', 'ir','irbis'
                ],
            )
        rev_disp.set_index('mena', inplace=True)
        menage2 = survey.get_values(
            table = "socioscm",
            variables = ['exdep', 'exrev', 'mena']
            )
        menage2.set_index('mena', inplace = True)
        rev_disp = menage2.merge(rev_disp, left_index = True, right_index = True)
        rev_disp = rev_disp[(rev_disp.exrev == 1) & (rev_disp.exdep == 1)]
        rev_disp['revsoc'] = rev_disp['revret'] + rev_disp['revcho'] + rev_disp['revfam'] + rev_disp['revlog'] + rev_disp['revinv'] + rev_disp['revrmi']
        rev_disp['impot_revenu'] = rev_disp['ir'] + rev_disp['irbis']
        rev_disp.rename(
            columns = dict(
                revaid = 'somme_obl_recue',
                ),
            inplace = True
            )
        rev_disp.somme_obl_recue = rev_disp.somme_obl_recue.fillna(0)
        rev_disp['revact'] = rev_disp['revsal'] + rev_disp['revind'] + rev_disp['revsec']
        # Total income: labour + property + social income + compulsory transfers received.
        rev_disp['revtot'] = rev_disp['revact'] + rev_disp['revpat'] + rev_disp['revsoc'] + rev_disp['somme_obl_recue']
        # NOTE(review): revact is recomputed identically here — redundant with
        # the assignment two lines above.
        rev_disp['revact'] = rev_disp['revsal'] + rev_disp['revind'] + rev_disp['revsec']
        rev_disp.rename(
            columns = dict(
                ponderr = "pondmen",
                mena = "ident_men",
                revind = "act_indpt",
                revsal = "salaires",
                revsec = "autres_rev",
                ),
            inplace = True
            )
        # NOTE(review): these placeholders are the string '0', not numeric 0.
        rev_disp['autoverses'] = '0'
        rev_disp['somme_libre_recue'] = '0'
        rev_disp['autres_ress'] = '0'

        # Disposable income = revtot minus income tax and housing tax
        # (revtot being the sum of labour, social, property and aid income).
        rev_disp['rev_disponible'] = rev_disp.revtot - rev_disp.impot_revenu - rev_disp.imphab
        loyers_imputes = temporary_store['depenses_bdf_{}'.format(year)]
        loyers_imputes.rename(
            columns = {"0411": "loyer_impute"},
            inplace = True,
            )
        rev_dispbis = loyers_imputes.merge(rev_disp, left_index = True, right_index = True)
        # NOTE(review): the imputed rent is subtracted here whereas the
        # 2000/2005/2011 branches add it — confirm the sign.
        rev_disp['rev_disp_loyerimput'] = rev_disp['rev_disponible'] - rev_dispbis['loyer_impute']
        # Conversion from francs to euros.
        for var in ['somme_obl_recue', 'act_indpt', 'revpat', 'salaires', 'autres_rev', 'rev_disponible', 'impfon', 'imphab', 'revsoc', 'revact', 'impot_revenu', 'revtot', 'rev_disp_loyerimput'] :
            rev_disp[var] = rev_disp[var] / 6.55957
        temporary_store["revenus_{}".format(year)] = rev_disp

    elif year == 2000:
        # TODO: rather fetch the variables coming from the expenses table (in temporary_store)
        consomen = survey.get_values(
            table = "consomen",
            variables = ['c13141', 'c13111', 'c13121', 'c13131', 'pondmen', 'ident'],
            )
        # NOTE(review): DataFrame.sort(columns=...) is the pre-0.17 pandas
        # spelling of sort_values; kept for the pinned pandas version.
        rev_disp = consomen.sort(columns = ['ident'])
        del consomen
        menage = survey.get_values(
            table = "menage",
            variables = ['ident', 'revtot', 'revact', 'revsoc', 'revpat', 'rev70', 'rev71', 'revt_d', 'pondmen',
                'rev10', 'rev11', 'rev20', 'rev21'],
            ).sort(columns = ['ident'])
        revenus = menage.join(rev_disp, how = "outer", rsuffix = "rev_disp")
        revenus.rename(
            columns = dict(
                c13111 = "impot_res_ppal",
                c13141 = "impot_revenu",
                c13121 = "impot_autres_res",
                rev70 = "somme_obl_recue",
                rev71 = "somme_libre_recue",
                revt_d= "autres_ress",
                ident = "ident_men",
                rev10 = "act_indpt",
                rev11 = "autoverses",
                rev20 = "salaires",
                rev21 = "autres_rev",
                ),
            inplace = True
            )
        # NOTE(review): 'pondmenrev_disp' is the duplicated weight column
        # produced by the join's rsuffix above — confirm it is meant to be
        # cast here rather than dropped.
        var_to_ints = ['pondmen','impot_autres_res','impot_res_ppal','pondmenrev_disp','c13131']
        for var_to_int in var_to_ints:
            revenus[var_to_int] = revenus[var_to_int].astype(int)
        # Weights (0.65 / 0.35) computed from the 1995 wave: housing tax is on
        # average 65% of local taxes, property taxes 35%.
        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        # Imputed rents (poste 0421) computed in the expenses build step.
        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["0421"]
        loyers_imputes = loyers_imputes[variables]
        loyers_imputes.rename(
            columns = {"0421": "loyer_impute"},
            inplace = True,
            )
        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
        loyers_imputes.index = loyers_imputes.index.astype('int')
        revenus = revenus.set_index('ident_men')
        revenus.index = revenus.index.astype('int')
        revenus = revenus.merge(loyers_imputes, left_index = True, right_index = True)
        # Disposable income, floored at zero.
        revenus['rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        revenus['rev_disponible'] = revenus['rev_disponible'] * (revenus['rev_disponible'] >= 0)
        revenus['rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
        var_to_ints = ['loyer_impute']
        for var_to_int in var_to_ints:
            revenus[var_to_int] = revenus[var_to_int].astype(int)
        temporary_store["revenus_{}".format(year)] = revenus

    elif year == 2005:
        c05d = survey.get_values(
            table = "c05d",
            variables = ['c13111', 'c13121', 'c13141', 'pondmen', 'ident_men'],
            )
        rev_disp = c05d.sort(columns = ['ident_men'])
        del c05d
        menage = survey.get_values(
            table = "menage",
            variables = ['ident_men', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700_d', 'rev701_d', 'rev999_d',
                'rev100_d', 'rev101_d', 'rev200_d', 'rev201_d'],
            ).sort(columns = ['ident_men'])
        rev_disp.set_index('ident_men', inplace = True)
        menage.set_index('ident_men', inplace = True)
        revenus = pandas.concat([menage, rev_disp], axis = 1)
        revenus.rename(
            columns = dict(
                rev100_d = "act_indpt",
                rev101_d = "autoverses",
                rev200_d = "salaires",
                rev201_d = "autres_rev",
                rev700_d = "somme_obl_recue",
                rev701_d = "somme_libre_recue",
                rev999_d = "autres_ress",
                c13111 = "impot_res_ppal",
                c13141 = "impot_revenu",
                c13121 = "impot_autres_res",
                ),
            inplace = True
            )
        # The 0.65 / 0.35 weights come from the BdF 1995 wave, which
        # distinguishes housing tax from property taxes: on average the
        # housing tax is 65% of local taxes and property taxes 35%.
        # These rates are applied to the 2000 and 2005 waves.
        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        del revenus['impot_autres_res']
        del revenus['impot_res_ppal']
        # Compute disposable income with and without imputed rent.
        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["0421"]
        loyers_imputes = loyers_imputes[variables]
        loyers_imputes.rename(
            columns = {"0421": "loyer_impute"},
            inplace = True,
            )
        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
        revenus = revenus.merge(loyers_imputes, left_index = True, right_index = True)
        revenus['rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        revenus['rev_disponible'] = revenus['rev_disponible'] * (revenus['rev_disponible'] >= 0)
        revenus['rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
        temporary_store["revenus_{}".format(year)] = revenus

    elif year == 2011:
        # Table names differ in case between distributions of the 2011 wave,
        # hence the try/except fallbacks below.
        try:
            c05 = survey.get_values(
                table = "C05",
                variables = ['c13111', 'c13121', 'c13141', 'pondmen', 'ident_me'],
                )
        except:
            c05 = survey.get_values(
                table = "c05",
                variables = ['c13111', 'c13121', 'c13141', 'pondmen', 'ident_me'],
                )
        rev_disp = c05.sort(columns = ['ident_me'])
        del c05
        try:
            menage = survey.get_values(
                table = "MENAGE",
                variables = ['ident_me', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700', 'rev701', 'rev999',
                    'revindep', 'salaires'],
                ).sort(columns = ['ident_me'])
        except:
            menage = survey.get_values(
                table = "menage",
                variables = ['ident_me', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700', 'rev701', 'rev999',
                    'revindep', 'salaires'],
                ).sort(columns = ['ident_me'])
        # variables = ['ident_me', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700', 'rev701', 'rev999', 'revindep', 'rev101_d', 'salaires', 'rev201'],
        rev_disp.set_index('ident_me', inplace = True)
        menage.set_index('ident_me', inplace = True)
        revenus = pandas.concat([menage, rev_disp], axis = 1)
        revenus.rename(
            columns = dict(
                revindep = "act_indpt",
                # TODO: find these commented-out income variables in bdf 2011
                # rev101_d = "autoverses",
                salaires = "salaires",
                # rev201_d = "autres_rev",
                rev700 = "somme_obl_recue",
                rev701 = "somme_libre_recue",
                rev999 = "autres_ress",
                c13111 = "impot_res_ppal",
                c13141 = "impot_revenu",
                c13121 = "impot_autres_res",
                ),
            inplace = True
            )
        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        del revenus['impot_autres_res']
        del revenus['impot_res_ppal']
        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["0421"]
        loyers_imputes = loyers_imputes[variables]
        loyers_imputes.rename(
            columns = {"0421": "loyer_impute"},
            inplace = True,
            )
        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
        revenus = revenus.merge(loyers_imputes, left_index = True, right_index = True)
        revenus['rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        revenus['rev_disponible'] = revenus['rev_disponible'] * (revenus['rev_disponible'] >= 0)
        revenus['rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
        temporary_store["revenus_{}".format(year)] = revenus
def build_imputation_loyers_proprietaires(temporary_store = None, year = None):
    """Impute rents for owner-occupied dwellings and merge them into expenses.

    For BdF 1995 the imputed rent comes from a hot-deck procedure (external
    Stata result file, with a fallback on the survey-provided table); for
    2000, 2005 and 2011 the INSEE-computed imputed-rent variable is used
    directly. The per-household imputed rents are stored in
    ``temporary_store['loyers_imputes_{year}']`` and merged into the expense
    table, which is saved back as ``temporary_store['depenses_bdf_{year}']``.

    :param temporary_store: store of intermediate tables; must already
        contain ``depenses_{year}``. Required.
    :param year: survey year; one of 1995, 2000, 2005, 2011. Required.
    """
    assert temporary_store is not None
    assert year is not None
    # Guard early: any other year would leave loyers_imputes undefined below.
    assert year in (1995, 2000, 2005, 2011), \
        "Unsupported budget_des_familles year: {}".format(year)
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    if year == 1995:
        imput00 = survey.get_values(table = "socioscm")
        # Keep only households whose data quality is certified:
        #   exdep = 1 when the expense variables are correctly filled
        #   exrev = 1 when the income variables are correctly filled
        # (The original code applied this identical filter twice; once is enough.)
        imput00 = imput00[(imput00.exdep == 1) & (imput00.exrev == 1)]
        kept_variables = ['mena', 'stalog', 'surfhab', 'confort1', 'confort2', 'confort3',
            'confort4', 'ancons', 'sitlog', 'nbphab', 'rg', 'cc']
        imput00 = imput00[kept_variables]
        imput00.rename(columns = {'mena': 'ident_men'}, inplace = True)
        # TODO: continue variable cleaning
        var_to_filnas = ['surfhab']
        for var_to_filna in var_to_filnas:
            imput00[var_to_filna] = imput00[var_to_filna].fillna(0)
        var_to_ints = ['sitlog', 'confort1', 'stalog', 'surfhab', 'ident_men', 'ancons', 'nbphab']
        for var_to_int in var_to_ints:
            imput00[var_to_int] = imput00[var_to_int].astype(int)
        depenses = temporary_store['depenses_{}'.format(year)]
        depenses.reset_index(inplace = True)
        depenses_small = depenses[['ident_men', '04110', 'pondmen']].copy()
        depenses_small.ident_men = depenses_small.ident_men.astype('int')
        imput00 = depenses_small.merge(imput00, on = 'ident_men').set_index('ident_men')
        imput00.rename(columns = {'04110': 'loyer_reel'}, inplace = True)
        # Indicator: rent is observed (positive) and the occupant is a tenant
        imput00['observe'] = (imput00.loyer_reel > 0) & (imput00.stalog.isin([3, 4]))
        imput00['maison_appart'] = imput00.sitlog == 1
        # Dwelling-size category, 1 to 8, from surface thresholds (m2)
        imput00['catsurf'] = (
            1
            + (imput00.surfhab > 15)
            + (imput00.surfhab > 30)
            + (imput00.surfhab > 40)
            + (imput00.surfhab > 60)
            + (imput00.surfhab > 80)
            + (imput00.surfhab > 100)
            + (imput00.surfhab > 150)
            )
        assert imput00.catsurf.isin(range(1, 9)).all()
        # TODO: check what is done here, notably the catsurf = 2 value omitted in the Stata code
        # NOTE(review): each of these four statements overwrites the previous one,
        # and `imput00.maison = ...` sets a DataFrame *attribute*, not a column,
        # so only the last condition survives and `maison` is never read below.
        # Kept as-is pending clarification of the original Stata intent.
        imput00.maison = 1 - ((imput00.cc == 5) & (imput00.catsurf == 1) & (imput00.maison_appart == 1))
        imput00.maison = 1 - ((imput00.cc == 5) & (imput00.catsurf == 3) & (imput00.maison_appart == 1))
        imput00.maison = 1 - ((imput00.cc == 5) & (imput00.catsurf == 8) & (imput00.maison_appart == 1))
        imput00.maison = 1 - ((imput00.cc == 4) & (imput00.catsurf == 1) & (imput00.maison_appart == 1))
        try:
            parser = SafeConfigParser()
            config_local_ini = os.path.join(config_files_directory, 'config_local.ini')
            config_ini = os.path.join(config_files_directory, 'config.ini')
            parser.read([config_ini, config_local_ini])
            directory_path = os.path.normpath(
                parser.get("openfisca_france_indirect_taxation", "assets")
                )
            hotdeck = pandas.read_stata(os.path.join(directory_path, 'hotdeck_result.dta'))
        except Exception:
            # Missing config entry or Stata file: fall back on the hot-deck
            # results shipped with the survey collection.
            hotdeck = survey.get_values(table = 'hotdeck_result')
        imput00.reset_index(inplace = True)
        hotdeck.ident_men = hotdeck.ident_men.astype('int')
        imput00 = imput00.merge(hotdeck, on = 'ident_men')
        # When the real rent is observed, the imputed rent must be zero.
        # .loc assignment avoids the chained-assignment (SettingWithCopy) hazard
        # of the original `imput00.loyer_impute[imput00.observe] = 0`.
        imput00.loc[imput00.observe, 'loyer_impute'] = 0
        imput00.reset_index(inplace = True)
        loyers_imputes = imput00[['ident_men', 'loyer_impute']].copy()
        assert loyers_imputes.loyer_impute.notnull().all()
        loyers_imputes.rename(columns = dict(loyer_impute = '0411'), inplace = True)

    # For BdF 2000 and 2005, use the imputed rents computed by INSEE
    if year == 2000:
        # Keep the imputed rents (available in the households table)
        loyers_imputes = survey.get_values(table = "menage", variables = ['ident', 'rev81'])
        loyers_imputes.rename(
            columns = {
                'ident': 'ident_men',
                'rev81': 'poste_coicop_421',
                },
            inplace = True,
            )
    if year == 2005:
        # Keep the imputed rents (available in the households table)
        loyers_imputes = survey.get_values(table = "menage")
        kept_variables = ['ident_men', 'rev801_d']
        loyers_imputes = loyers_imputes[kept_variables]
        loyers_imputes.rename(columns = {'rev801_d': 'poste_coicop_421'}, inplace = True)
    if year == 2011:
        # Table-name casing differs between data exports
        try:
            loyers_imputes = survey.get_values(table = "MENAGE")
        except Exception:
            loyers_imputes = survey.get_values(table = "menage")
        kept_variables = ['ident_me', 'rev801']
        loyers_imputes = loyers_imputes[kept_variables]
        loyers_imputes.rename(
            columns = {'rev801': 'poste_coicop_421', 'ident_me': 'ident_men'},
            inplace = True,
            )

    # Join with the expenses-by-COICOP table
    loyers_imputes.set_index('ident_men', inplace = True)
    temporary_store['loyers_imputes_{}'.format(year)] = loyers_imputes
    depenses = temporary_store['depenses_{}'.format(year)]
    depenses.index = depenses.index.astype('int64')
    loyers_imputes.index = loyers_imputes.index.astype('int64')
    assert set(depenses.index) == set(loyers_imputes.index)
    assert len(set(depenses.columns).intersection(set(loyers_imputes.columns))) == 0
    depenses = depenses.merge(loyers_imputes, left_index = True, right_index = True)

    # ****************************************************************************************************************
    # Step 0-1-3: save the homogenised expense tables in the right place
    # ****************************************************************************************************************
    # Save in temporary store
    temporary_store['depenses_bdf_{}'.format(year)] = depenses