def store_input_data_frame(data_frame = None, collection = None, survey = None, table = None):
    """Insert a data frame as a table of a survey and save the survey collection.

    The collection is loaded by name; if loading fails, a new collection with
    that name is created. The survey is fetched from the collection when a
    survey with that name is already registered, otherwise it is created. The
    data frame is inserted as ``table`` and the collection is dumped to
    ``<collections_directory>/<collection>.json``.

    Args:
        data_frame: Data to store. Required.
        collection: Name of the survey collection. Required.
        survey: Name of the survey inside the collection. Required.
        table: Name of the table to write; defaults to ``"input"``.

    Raises:
        AssertionError: If ``data_frame``, ``collection`` or ``survey`` is None.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(collection = collection)
    except Exception as error:
        # Best effort: any loading failure falls back to a brand new collection,
        # but the reason is no longer discarded silently.
        log.debug("Unable to load collection {} ({}), creating a new one".format(collection, error))
        openfisca_survey_collection = SurveyCollection(name = collection)

    log.debug("In collection {} the following survey are present: {}".format(collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    if table is None:
        table = "input"

    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    available_survey_names = [survey_.name for survey_ in openfisca_survey_collection.surveys]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
    survey.insert_table(name = table, data_frame = data_frame)
    # Replace any survey already registered under this name: appending
    # unconditionally used to list an existing survey twice in the collection.
    openfisca_survey_collection.surveys = [
        kept_survey for kept_survey in openfisca_survey_collection.surveys
        if kept_survey.name != survey_name
        ]
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, '{}.json'.format(collection))
    log.debug("In collection {} the following surveyx are present: {}".format(collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def run_all(year = None, filename = "test", check = False):
    """Run the input-data build steps for ``year`` and save the result.

    The data frame returned by ``final.final`` is inserted as table ``"input"``
    of the survey ``openfisca_data_<year>``; that survey is appended to a newly
    instantiated ``"openfisca"`` collection which is then dumped to
    ``openfisca.json`` in the collections directory.

    Args:
        year: Year to process. Required.
        filename: Not used by this function.
        check: Forwarded to ``final.final``.

    Raises:
        AssertionError: If ``year`` is None.
    """
    assert year is not None

    # Build steps, called one after the other with the year only.
    # (imputation_loyer.imputation_loyer is not part of the sequence.)
    for step in (
            pre_processing.create_indivim_menage_en_mois,
            pre_processing.create_enfants_a_naitre,
            fip.create_fip,
            famille.famille,
            foyer.sif,
            foyer.foyer_all,
            rebuild.create_totals,
            rebuild.create_final,
            invalides.invalide,
            ):
        step(year = year)
    input_table = final.final(year = year, check = check)

    # Save the resulting table in the "openfisca" collection
    collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    h5_path = os.path.join(os.path.dirname(output_directory), "{}.h5".format(survey_name))
    new_survey = Survey(name = survey_name, hdf5_file_path = h5_path)
    new_survey.insert_table(name = "input", data_frame = input_table)
    collection.surveys.append(new_survey)
    json_directory = collection.config.get('collections', 'collections_directory')
    collection.dump(json_file_path = os.path.join(json_directory, 'openfisca.json'))
Exemple #3
0
def build_survey_collection(name=None,
                            erase_collection_json=False,
                            overwrite_surveys=False,
                            data_directory_path_by_year=None,
                            source_format='sas'):
    """Register one survey per year in a collection and fill its HDF store.

    For each ``(year, directory)`` pair, the data files found in the directory
    are registered as survey ``<name>_<year>``, the collection is dumped to
    ``<collections_directory>/<name>.json`` and the HDF files of the surveys
    whose name ends with the year are filled.

    Args:
        name: Name of the collection. Required.
        erase_collection_json: When true, start from a new collection instead
            of loading the existing one.
        overwrite_surveys: Forwarded as ``overwrite`` to ``fill_hdf``.
        data_directory_path_by_year: Mapping from year to a data directory.
            A path that is not an existing directory is taken relative to the
            ``input_directory`` option of the ``data`` config section. Required.
        source_format: Key used to pick the files returned by
            ``create_data_file_by_format`` and passed to ``fill_hdf``.

    Returns:
        The survey collection.

    Raises:
        AssertionError: If a required argument is None or a data directory
            cannot be found.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    # The previous check (`keys() is None`) could never trigger.
    if not data_directory_path_by_year:
        log.error("A list of years to process is needed")

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name=name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            survey_collection = SurveyCollection(
                name=name, config_files_directory=config_files_directory)

    # .items() works on both Python 2 and Python 3 (unlike .iteritems()).
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            input_data_directory = survey_collection.config.get(
                'data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory,
                                               data_directory_path)
            # Check the resolved path (the input directory was checked twice before).
            assert os.path.isdir(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        # Parenthesised form prints the same on Python 2 and is valid Python 3.
        print(data_file_by_format)
        survey_name = '{}_{}'.format(name, year)
        # NOTE(review): the files are always passed as `sas_files`, whatever
        # `source_format` is — confirm this is intended for non-SAS formats.
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format[source_format],
        )
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory,
                                            "{}.json".format(name))
        survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        survey_collection.fill_hdf(source_format=source_format,
                                   surveys=surveys,
                                   overwrite=overwrite_surveys)
    return survey_collection
Exemple #4
0
def build_merged_dataframes(temporary_store = None, year = None):
    """Load the ERFS-FPR tables of ``year``, merge them and keep the result in the store.

    Four tables are read from the survey ``erfs_fpr_<year>`` of the
    ``erfs_fpr`` collection and passed to ``merge_tables``. The resulting
    household table is stored under ``menages_<year>`` and the individual
    table under ``individus_<year>_post_01``.

    Args:
        temporary_store: Mapping-like store receiving the merged tables. Required.
        year: Survey year. Required.

    Raises:
        AssertionError: If ``temporary_store`` or ``year`` is None.
    """
    assert temporary_store is not None
    assert year is not None
    log.debug("Chargement des tables des enquêtes")

    collection = SurveyCollection.load(collection = "erfs_fpr")
    yr = str(year)[-2:]  # two-digit year, e.g. 12 for 2012
    # Years whose FPR tables carry the "_retropole" suffix
    retropole_years = [2012]

    survey = collection.get_survey(f"erfs_fpr_{year}")
    eec_menage = survey.get_values(table = f"fpr_mrf{yr}e{yr}t4")
    eec_individu = survey.get_values(table = f"fpr_irf{yr}e{yr}t4")

    if year in retropole_years:
        individu_table, menage_table = f"fpr_indiv_{year}_retropole", f"fpr_menage_{year}_retropole"
    else:
        individu_table, menage_table = f"fpr_indiv_{year}", f"fpr_menage_{year}"
    fpr_individu = survey.get_values(table = individu_table)
    fpr_menage = survey.get_values(table = menage_table)

    individus, menages = merge_tables(fpr_menage, eec_menage, eec_individu, fpr_individu, year)

    temporary_store[f"menages_{year}"] = menages
    # Drop the household-level references before storing the individual table
    del eec_menage, fpr_menage, menages
    gc.collect()

    temporary_store[f"individus_{year}_post_01"] = individus
    del eec_individu, fpr_individu
Exemple #5
0
def test_survey():
    """Build a survey from the packaged SAS test file and fill its HDF store.

    The test data live in ``openfisca_survey_manager/tests/data_files`` of the
    installed ``openfisca-survey-manager`` distribution. The test makes no
    assertion: it only checks that the calls run and prints the survey tables.
    """
    name = 'fake'
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
    )

    survey_collection = SurveyCollection(name=name,
                                         config_files_directory=data_dir,
                                         json_file_path=os.path.join(
                                             data_dir, 'fake.json'))

    saved_fake_survey_hdf5_file_path = os.path.join(data_dir, 'fake.hdf5')
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    survey = Survey(
        hdf5_file_path=saved_fake_survey_hdf5_file_path,
        name='fake_survey',
        sas_files=[saved_fake_survey_file_path],
        survey_collection=survey_collection,
    )
    survey.insert_table(name='help')
    survey.fill_hdf(source_format='sas')
    # Parenthesised form: same output on Python 2, and valid syntax on Python 3
    # (the bare `print x` statement is a SyntaxError there).
    print(survey.tables)
def get_input_data_frame(year):
    """Return the ``input`` table of the ``eipp_data_<year>`` survey with its index reset.

    The survey is looked up by name in the ``surveys`` container of the
    ``eipp`` collection.
    """
    collection = SurveyCollection.load(collection="eipp")
    survey_name = "eipp_data_{}".format(year)
    # NOTE(review): `surveys` is indexed by name here, so it is presumably a
    # mapping in the library version this snippet targets — confirm.
    data_frame = collection.surveys[survey_name].get_values(table="input")
    data_frame.reset_index(inplace=True)
    return data_frame
def build(year=None):
    """Run the ERFS-FPR build steps for ``year`` and store the input data frame.

    After the steps have run, the table ``input_<year>`` is read from the
    ``erfs_fpr`` temporary store and handed to ``store_input_data_frame`` for
    the collection ``openfisca_erfs_fpr``.

    Args:
        year: Year to process. Required.

    Raises:
        AssertionError: If ``year`` is None.
    """
    assert year is not None

    preprocessing.build_merged_dataframes(year=year)

    # imputation_loyer.imputation_loyer is not run; only the merge of the
    # Stata file located in the configured Stata directory is performed.
    collection = SurveyCollection(name='openfisca')
    stata_file_path = os.path.join(
        collection.config.get('data', 'stata_directory'),
        'log_men_ERFS.dta',
    )
    imputation_loyer.merge_imputation_loyer(stata_file=stata_file_path, year=year)

    variables_individuelles.build_variables_individuelles(year=year)
    famille.build_famille(year=year)
    final.create_input_data_frame(year=year)

    # Fetch the final table and save it in a collection
    store = get_store(file_name='erfs_fpr')
    input_table = store['input_{}'.format(year)]
    store_input_data_frame(
        data_frame=input_table,
        collection="openfisca_erfs_fpr",
        survey="openfisca_erfs_fpr_data_{}".format(year),
    )
def get_input_data_frame(year):
    """Return the ``input`` table of the indirect-taxation survey of ``year``.

    The table is read from the survey ``openfisca_indirect_taxation_data_<year>``
    of the ``openfisca_indirect_taxation`` collection; its index is reset in
    place before it is returned.
    """
    collection = SurveyCollection.load(
        collection = "openfisca_indirect_taxation",
        config_files_directory = config_files_directory,
        )
    survey_name = "openfisca_indirect_taxation_data_{}".format(year)
    survey = collection.get_survey(survey_name)
    data_frame = survey.get_values(table = "input")
    data_frame.reset_index(inplace = True)
    return data_frame
Exemple #9
0
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name, survey_label = None,
        table_label = None, table_name = None):
    """Insert a data frame as a table of a survey and dump the survey collection.

    Args:
        input_dataframe: Data inserted in the survey table.
        entity: Entity name, used to build the default table name and label.
        period: Value converted with ``periods.period``; used to build the
            default table name and label.
        collection: Name of the survey collection.
        survey_name: Name of the survey to fetch or create in the collection.
        survey_label: Label given to the survey when it has to be created.
        table_label: Label of the table; a default is derived from the entity
            and the period.
        table_name: Name of the table; defaults to ``<entity>_<period>``.

    Raises:
        AssertionError: If the configured collections directory does not exist.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        # Fall back on the configuration shipped with the package test data
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )

    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # The lookup failed on an assertion: create the survey instead
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )

    if survey.hdf5_file_path is None:
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # Logger.warn is a deprecated alias of Logger.warning
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')

    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any survey registered under the same name rather than duplicating it
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def build_bdf_survey_collection(years=None, erase=False, overwrite=False):
    """Register the Budget des Familles surveys of ``years`` and fill their HDF stores.

    For each year, the Stata files found in
    ``<input directory>/budget_des_familles/<year>`` are registered as survey
    ``budget_des_familles_<year>``, the collection is dumped to
    ``budget_des_familles.json`` in the collections directory and the HDF
    files of the surveys whose name ends with the year are filled.

    Args:
        years: Iterable of years to process. When None, an error is logged and
            no year is processed.
        erase: When true, start from a new collection instead of loading the
            existing one.
        overwrite: Forwarded to ``fill_hdf``.

    Returns:
        The ``budget_des_familles`` survey collection.
    """
    if years is None:
        log.error("A list of years to process is needed")
        # Nothing to process; iterating over None used to raise a TypeError
        # after the error had been logged.
        years = []

    if erase:
        bdf_survey_collection = SurveyCollection(
            name="budget_des_familles",
            config_files_directory=config_files_directory)
    else:
        try:
            bdf_survey_collection = SurveyCollection.load(
                collection='budget_des_familles',
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            bdf_survey_collection = SurveyCollection(
                name="budget_des_familles",
                config_files_directory=config_files_directory)

    input_data_directory = bdf_survey_collection.config.get(
        'data', 'input_directory')
    # NOTE(review): user-specific directory layout hard-coded on the login
    # name — consider moving this to the configuration file.
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(
            os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)

    for year in years:
        data_directory_path = os.path.join(
            input_data_directory, 'budget_des_familles/{}'.format(year))
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'budget_des_familles_{}'.format(year)

        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=bdf_survey_collection,
            stata_files=data_file_by_format['stata'],
        )

        collections_directory = bdf_survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory,
                                            "budget_des_familles" + ".json")
        bdf_survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in bdf_survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        bdf_survey_collection.fill_hdf(source_format='stata',
                                       surveys=surveys,
                                       overwrite=overwrite)
    return bdf_survey_collection
def get_input_data_frame(year):
    """Return the ``input`` table of the indirect-taxation survey of ``year``.

    The survey ``openfisca_indirect_taxation_data_<year>`` is taken from the
    ``openfisca_indirect_taxation`` collection; the index of its ``input``
    table is reset in place before the table is returned.
    """
    survey_name = "openfisca_indirect_taxation_data_{}".format(year)
    collection = SurveyCollection.load(collection="openfisca_indirect_taxation")
    data_frame = collection.get_survey(survey_name).get_values(table="input")
    data_frame.reset_index(inplace=True)
    return data_frame
def store_input_data_frame(data_frame = None, collection = None, survey = None):
    """Store a data frame as the ``input`` table of a new survey and dump the collection.

    A new collection named ``collection`` is instantiated, a survey named
    ``survey`` is created with the data frame as its ``input`` table, and the
    collection is dumped to ``<collections_directory>/<collection>.json``.

    Args:
        data_frame: Data to store. Required.
        collection: Name of the survey collection. Required.
        survey: Name of the survey to create. Required.

    Raises:
        AssertionError: If any argument is None.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    openfisca_survey_collection = SurveyCollection(name = collection, config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = survey
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    # Name the JSON file after the collection: the file name used to be
    # hard-coded to 'openfisca_erfs_fpr.json' whatever `collection` was.
    json_file_path = os.path.join(collections_directory, '{}.json'.format(collection))
    openfisca_survey_collection.dump(json_file_path = json_file_path)
Exemple #13
0
def test_add_survey_to_collection():
    """Check that a survey added to a collection shows up in its JSON form.

    The test returns immediately when ``is_travis`` or ``is_circleci`` is set.
    """
    if is_travis or is_circleci:
        return

    collection_name = 'fake'
    survey_name = 'fake_survey'
    # Test data shipped with the installed openfisca-survey-manager distribution
    package_location = pkg_resources.get_distribution('openfisca-survey-manager').location
    data_dir = os.path.join(
        package_location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
    )
    survey_collection = SurveyCollection(name=collection_name)
    sas_file_path = os.path.join(data_dir, 'help.sas7bdat')
    add_survey_to_collection(
        survey_name=survey_name,
        survey_collection=survey_collection,
        sas_files=[sas_file_path],
        stata_files=[],
    )
    registered_survey_names = list(survey_collection.to_json()['surveys'].keys())
    assert survey_name in registered_survey_names
    def build_erf_data_frames(self):
        """Fetch the requested ERF variables and store one data frame per entity.

        The variables listed in ``self.columns_to_fetch`` (plus ``ident``,
        ``wprm``, ``quelfic`` and ``noi``) are translated with ``get_of2erf``,
        located in the ``eec_indivi``, ``erf_indivi`` and ``erf_menage``
        tables of the ERFS survey and read from every table that holds them.
        Columns are renamed with ``get_erf2of`` and ``ident`` becomes
        ``idmen``. The result is stored in
        ``self.erf_data_frame_by_entity_key_plural`` under the keys
        ``menages`` and ``individus``.

        Raises:
            AssertionError: If the ``erf_menage`` data frame has duplicated rows.
        """
        # TODO: remove this
        self.columns_to_fetch = ['af']
        variables = self.columns_to_fetch
        erf_survey_collection = SurveyCollection.load(
            collection = "erfs", config_files_directory = config_files_directory)
        # NOTE(review): `year` is not defined inside this method; it is
        # presumably a module-level name or meant to be an attribute — confirm.
        erf_survey = erf_survey_collection.get_survey("erfs_{}".format(year))
        year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
        generic_by_year_specific = dict(zip(year_specific_by_generic.values(), year_specific_by_generic.keys()))

        erf_variables = list(set(variables + ["ident", "wprm", "quelfic", "noi"]))
        of2erf = get_of2erf()
        for index, variable in enumerate(erf_variables):
            if variable in of2erf:
                erf_variables[index] = of2erf[variable]
        data_frame_by_table = dict(eec_indivi = None, erf_indivi = None, erf_menage = None)
        erf_variables_by_generic_table = dict(eec_indivi = [], erf_indivi = [], erf_menage = [])

        # Year-specific names of the tables we are interested in (same for every variable)
        candidate_tables = set(
            year_specific_by_generic[key] for key in erf_variables_by_generic_table
            )
        year_specific_tables_by_erf_variable = dict(
            (
                erf_variable,
                set(erf_survey.find_tables(variable = erf_variable)).intersection(candidate_tables),
                )
            for erf_variable in erf_variables
            )
        # .items() works on both Python 2 and Python 3 (unlike .iteritems())
        for variable, year_specific_tables in year_specific_tables_by_erf_variable.items():
            if len(year_specific_tables) < 1:
                log.info("No tables are present for variable {}".format(variable))
                continue
            if len(year_specific_tables) > 1:
                # Only report "multiple tables" when there really is more than one
                log.info("Variable {} is present in multiple tables : {}".format(variable, year_specific_tables))
            for table in year_specific_tables:
                log.info("Variable {} is retrieved from table {}".format(variable, table))
                erf_variables_by_generic_table[generic_by_year_specific[table]].append(variable)

        erf2of = get_erf2of()

        for table, table_variables in erf_variables_by_generic_table.items():
            if table_variables:
                data_frame_by_table[table] = erf_survey.get_values(
                    variables = table_variables, table = year_specific_by_generic[table]
                    )
                data_frame_by_table[table].rename(columns = erf2of, inplace = True)
                data_frame_by_table[table].rename(columns = {'ident': 'idmen'}, inplace = True)

        assert not data_frame_by_table["erf_menage"].duplicated().any(), "Duplicated idmen in erf_menage"
        self.erf_data_frame_by_entity_key_plural = dict(
            menages = data_frame_by_table["erf_menage"],
            individus = data_frame_by_table["erf_indivi"].merge(data_frame_by_table["eec_indivi"])
            )
def build_homogeneisation_vehicules(temporary_store = None, year = None):
    """Compute vehicule numbers by type.

    The vehicle table of the ``budget_des_familles_<year>`` survey is reduced
    to one row per ``ident_men`` holding the total number of vehicles, the
    number of petrol (``carbu == 1``) and diesel (``carbu == 2``) vehicles and
    the share of petrol vehicles. The result is stored in ``temporary_store``
    under ``automobile_<year>``.

    Years 2000, 2005 and 2011 are handled; nothing is stored for 1995. For any
    other year no table is loaded and the aggregation step fails.

    Args:
        temporary_store: Mapping-like store receiving the result. Required.
        year: Survey year. Required.

    Raises:
        AssertionError: If ``temporary_store`` or ``year`` is None.
    """
    assert temporary_store is not None
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    if year == 1995:
        # The 1995 BdF survey holds no information on the type of fuel used by vehicles.
        vehicule = None

    elif year == 2000:
        vehicule = survey.get_values(table = "depmen")
        kept_variables = ['ident', 'carbu01', 'carbu02']
        # Work on a copy so that the in-place edits below do not target a slice
        vehicule = vehicule[kept_variables].copy()
        vehicule.rename(
            columns = {'ident': 'ident_men', 'carbu01': 'carbu1', 'carbu02': 'carbu2'},
            inplace = True,
            )
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = 1 * (vehicule['carbu1'] == 1) + 1 * (vehicule['carbu2'] == 1)
        vehicule["veh_diesel"] = 1 * (vehicule['carbu1'] == 2) + 1 * (vehicule['carbu2'] == 2)
        vehicule.index = vehicule.index.astype(ident_men_dtype)

    elif year == 2005:
        vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_men', 'carbu']
        vehicule = vehicule[kept_variables].copy()
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)

    elif year == 2011:
        # The table name is tried in upper case first, then in lower case
        try:
            vehicule = survey.get_values(table = "AUTOMOBILE")
        except Exception:
            vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_me', 'carbu']
        vehicule = vehicule[kept_variables].copy()
        vehicule.rename(columns = {'ident_me': 'ident_men'}, inplace = True)
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)

    # Compute the number of cars by category and save
    if year != 1995:
        # Select the columns with a list: tuple selection on a groupby is no
        # longer accepted by recent pandas versions.
        vehicule = vehicule.groupby(by = 'ident_men')[["veh_tot", "veh_essence", "veh_diesel"]].sum()
        vehicule["pourcentage_vehicule_essence"] = 0.0
        # Single .loc call instead of chained assignment, which may write to a copy
        vehicule.loc[vehicule.veh_tot != 0, "pourcentage_vehicule_essence"] = (
            vehicule.veh_essence / vehicule.veh_tot
            )
        # Save in temporary store
        temporary_store['automobile_{}'.format(year)] = vehicule
def test_add_survey_to_collection():
    """Check that the added survey is the only one listed in the collection's JSON form."""
    name = 'fake'
    survey_name = 'fake_survey'
    survey_collection = SurveyCollection(name = name)

    # Test data shipped with the installed openfisca-survey-manager distribution
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    add_survey_to_collection(survey_name = survey_name,
                             survey_collection = survey_collection,
                             sas_files = [saved_fake_survey_file_path],
                             stata_files = [])
    ordered_dict = survey_collection.to_json()
    # On Python 3, keys() is a view that never compares equal to a list:
    # materialise it so the assertion can actually pass.
    assert list(ordered_dict['surveys'].keys()) == [survey_name]
def run_all(year_calage = 2007, year_data_list = None):
    """Build the indirect-taxation input data for ``year_calage`` and store it.

    The survey year used is picked by ``find_nearest_inferior`` among
    ``year_data_list``. The tables produced by the build steps are read from
    ``temporary_store``, concatenated column-wise, and the result is inserted
    as table ``"input"`` of the survey
    ``openfisca_indirect_taxation_data_<year_calage>`` in the
    ``openfisca_indirect_taxation`` collection.

    Args:
        year_calage: Calibration year.
        year_data_list: Years for which survey data exist; defaults to
            ``[1995, 2000, 2005, 2011]``.
    """
    # A fresh list per call instead of a mutable default shared between calls
    if year_data_list is None:
        year_data_list = [1995, 2000, 2005, 2011]

    # Which survey year should be used for the calibration?
    year_data = find_nearest_inferior(year_data_list, year_calage)

    # Four parallel steps homogenising the source data.
    # NOTE(review): `temporary_store` is not defined in this function; it is
    # presumably a module-level store shared with the build steps — confirm.
    # Consumption expenditures:
    build_depenses_homogenisees(year = year_data)
    build_imputation_loyers_proprietaires(year = year_data)

    build_depenses_calees(year_calage, year_data)
    build_menage_consumption_by_categorie_fiscale(year_calage, year_data)
    categorie_fiscale_data_frame = temporary_store["menage_consumption_by_categorie_fiscale_{}".format(year_calage)]
    depenses_calees_by_grosposte = temporary_store["depenses_calees_by_grosposte_{}".format(year_calage)]

    # Vehicles:
    build_homogeneisation_vehicules(year = year_data)
    vehicule = temporary_store['automobile_{}'.format(year_data)]

    # Socio-demographic variables:
    build_homogeneisation_caracteristiques_sociales(year = year_data)
    menage = temporary_store['donnes_socio_demog_{}'.format(year_data)]

    # Income variables:
    build_homogeneisation_revenus_menages(year = year_data)
    revenus = temporary_store["revenus_{}".format(year_data)]

    # Data frame resulting from these four steps
    data_frame = pandas.concat(
        [revenus, vehicule, categorie_fiscale_data_frame, menage, depenses_calees_by_grosposte], axis = 1)

    data_frame.index.name = "ident_men"
    data_frame.reset_index(inplace = True)
    # Remove duplicated columns causing a bug with HDFStore
    # according to https://github.com/pydata/pandas/issues/6240
    # using the solution from stackoverflow
    # http://stackoverflow.com/questions/16938441/how-to-remove-duplicate-columns-from-a-dataframe-using-python-pandas
    data_frame = data_frame.T.groupby(level = 0).first().T

    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection.load(
        collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)

    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage)
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    openfisca_survey_collection.dump()
def get_input_data_frame(year):
    """Return the ``input`` table of the ``openfisca_data_<year>`` survey.

    The columns ``sali``, ``choi`` and ``rsti`` are renamed ``sal``, ``cho``
    and ``rst``, and the index is reset, both in place.
    """
    collection = SurveyCollection.load(
        collection = "openfisca",
        config_files_directory = config_files_directory,
        )
    survey = collection.get_survey("openfisca_data_{}".format(year))
    data_frame = survey.get_values(table = "input")
    new_name_by_old_name = {'sali': 'sal', 'choi': 'cho', 'rsti': 'rst'}
    data_frame.rename(columns = new_name_by_old_name, inplace = True)
    data_frame.reset_index(inplace = True)
    return data_frame
Exemple #19
0
def build(year=None, check=False):
    """Run the ERFS input-data build steps for ``year`` and save the result.

    Once the steps have run, the table ``input_<year>`` is read from the
    ``erfs`` temporary store and inserted as table ``"input"`` of the survey
    ``openfisca_data_<year>``. That survey is appended to a newly instantiated
    ``"openfisca"`` collection, which is dumped to ``openfisca.json`` in the
    collections directory.

    Args:
        year: Year to process. Required.
        check: Forwarded to ``final.final``.

    Raises:
        AssertionError: If ``year`` is None.
    """
    assert year is not None

    # Build steps, called one after the other with the year only.
    # (imputation_loyer.imputation_loyer is not part of the sequence.)
    for step in (
            pre_processing.create_indivim_menagem,
            pre_processing.create_enfants_a_naitre,
            fip.create_fip,
            famille.famille,
            foyer.sif,
            foyer.foyer_all,
            rebuild.create_totals_first_pass,
            rebuild.create_totals_second_pass,
            rebuild.create_final,
            invalides.invalide,
    ):
        step(year=year)
    final.final(year=year, check=check)

    store = get_store(file_name='erfs')
    input_table = store['input_{}'.format(year)]

    # Save the table in the "openfisca" collection
    collection = SurveyCollection(
        name="openfisca", config_files_directory=config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    h5_path = os.path.join(
        os.path.dirname(output_directory), "{}.h5".format(survey_name))
    new_survey = Survey(name=survey_name, hdf5_file_path=h5_path)
    new_survey.insert_table(name="input", data_frame=input_table)
    collection.surveys.append(new_survey)
    json_directory = collection.config.get(
        'collections', 'collections_directory')
    collection.dump(
        json_file_path=os.path.join(json_directory, 'openfisca.json'))
def get_input_data_frame(year):
    """Return the ``input`` table of the ``openfisca_data_<year>`` survey.

    The columns ``sali``, ``choi`` and ``rsti`` are renamed ``sal``, ``cho``
    and ``rst``, and the index is reset, both in place.
    """
    collection = SurveyCollection.load(
        collection="openfisca",
        config_files_directory=config_files_directory,
    )
    survey_name = "openfisca_data_{}".format(year)
    data_frame = collection.get_survey(survey_name).get_values(table="input")
    data_frame.rename(
        columns={'sali': 'sal', 'choi': 'cho', 'rsti': 'rst'},
        inplace=True,
    )
    data_frame.reset_index(inplace=True)
    return data_frame
def test_add_survey_to_collection():
    """Check that a survey added to a collection shows up in its JSON form.

    The test returns immediately when ``is_travis`` or ``is_circleci`` is set.
    """
    if is_travis or is_circleci:
        return

    collection_name = 'fake'
    survey_name = 'fake_survey'
    # Test data shipped with the installed openfisca-survey-manager distribution
    package_location = pkg_resources.get_distribution('openfisca-survey-manager').location
    data_dir = os.path.join(
        package_location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    survey_collection = SurveyCollection(name = collection_name)
    sas_file_path = os.path.join(data_dir, 'help.sas7bdat')
    add_survey_to_collection(
        survey_name = survey_name,
        survey_collection = survey_collection,
        sas_files = [sas_file_path],
        stata_files = [],
        )
    registered_survey_names = list(survey_collection.to_json()['surveys'].keys())
    assert survey_name in registered_survey_names
 def create(cls, year = None, rebuild_input_data = False):
     """Instantiate ``cls`` and initialise it from the stored input data of ``year``.

     The ``input`` table of the survey ``openfisca_erfs_fpr_data_<year>`` is
     read from the ``openfisca`` collection, its index is dropped, and it is
     passed to ``init_from_data_frame``. When ``rebuild_input_data`` is true,
     ``cls.build_input_data`` is called first.

     Raises:
         AssertionError: If ``year`` is None.
     """
     assert year is not None
     if rebuild_input_data:
         cls.build_input_data(year = year)

     collection = SurveyCollection.load(
         collection = "openfisca",
         config_files_directory = config_files_directory,
         )
     survey = collection.get_survey("openfisca_erfs_fpr_data_{}".format(year))
     stored_table = survey.get_values(table = "input")
     input_data_frame = stored_table.reset_index(drop = True)
     scenario = cls()
     return scenario.init_from_data_frame(input_data_frame = input_data_frame, year = year)
def run_all(year = None, check = False):
    """Run the ERFS input-data build steps for ``year`` and save the result.

    Once the steps have run, the table ``input_<year>`` is read from the
    ``erfs`` temporary store and inserted as table ``"input"`` of the survey
    ``openfisca_data_<year>``. That survey is appended to a newly instantiated
    ``"openfisca"`` collection, which is dumped to ``openfisca.json`` in the
    collections directory.

    Args:
        year: Year to process. Required.
        check: Forwarded to ``final.final``.

    Raises:
        AssertionError: If ``year`` is None.
    """
    assert year is not None

    # Build steps, called one after the other with the year only.
    # (imputation_loyer.imputation_loyer is not part of the sequence.)
    for step in (
            pre_processing.create_indivim_menagem,
            pre_processing.create_enfants_a_naitre,
            fip.create_fip,
            famille.famille,
            foyer.sif,
            foyer.foyer_all,
            rebuild.create_totals_first_pass,
            rebuild.create_totals_second_pass,
            rebuild.create_final,
            invalides.invalide,
            ):
        step(year = year)
    final.final(year = year, check = check)

    store = get_store(file_name = 'erfs')
    input_table = store['input_{}'.format(year)]

    # Save the table in the "openfisca" collection
    collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    h5_path = os.path.join(os.path.dirname(output_directory), "{}.h5".format(survey_name))
    new_survey = Survey(name = survey_name, hdf5_file_path = h5_path)
    new_survey.insert_table(name = "input", data_frame = input_table)
    collection.surveys.append(new_survey)
    json_directory = collection.config.get('collections', 'collections_directory')
    collection.dump(json_file_path = os.path.join(json_directory, 'openfisca.json'))
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name, survey_label = None,
        table_label = None, table_name = None):
    """Insert ``input_dataframe`` as a table of a survey and dump the collection.

    The collection is loaded by name; when its configuration lacks the
    expected option an empty collection is created instead, and when the
    configuration section is missing (test setup) the collection is built from
    the data files shipped with the ``openfisca-survey-manager`` distribution.
    The survey is fetched from the collection, or created when ``get_survey``
    raises ``AssertionError``.  Any survey of the same name already listed in
    the collection is replaced by the updated one before the collection is
    dumped to ``<collections_directory>/<collection>.json``.

    :param input_dataframe: data to insert.
    :param entity: entity name, used in the default table name and label.
    :param period: converted with ``periods.period``.
    :param collection: name of the survey collection.
    :param survey_name: name of the survey receiving the table.
    :param survey_label: label given to a newly created survey.
    :param table_label: defaults to a sentence built from entity and period.
    :param table_name: defaults to ``<entity>_<period>``.
    :raises AssertionError: when the collections directory does not exist.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )

    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )

    if survey.hdf5_file_path is None:
        # No file attached yet: put it in the configured output directory,
        # creating that directory when needed.
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # Logger.warn is a deprecated alias; Logger.warning is the supported name.
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')

    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any previous entry for this survey instead of listing it twice.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def build_survey_collection(
        config_files_directory: str,
        collection_name = None,
        replace_metadata = False,
        replace_data = False,
        data_directory_path_by_survey_suffix = None,
        source_format = 'sas',
        ):
    """Build or update a survey collection from raw data directories.

    For every ``(suffix, directory)`` pair a survey named
    ``<collection_name>_<suffix>`` is added to the collection, the collection
    is dumped to ``<collections_directory>/<collection_name>.json`` and the
    surveys whose name ends with the suffix are passed to ``fill_hdf``.

    :param config_files_directory: directory holding the configuration files.
    :param collection_name: name of the collection (required).
    :param replace_metadata: start from an empty collection instead of loading
        the existing one.
    :param replace_data: forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_survey_suffix: mapping of survey suffix to
        the directory holding its data files (required).
    :param source_format: kept for interface compatibility; the value is
        replaced by the first format for which data files were found.
    :returns: the survey collection.
    :raises AssertionError: on a missing required argument, an invalid data
        directory or a missing collections directory.
    :raises IndexError: when a data directory holds no usable data file.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None

    if replace_metadata:
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except configparser.NoOptionError:
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format.get('sas'),
            stata_files = data_file_by_format.get('stata'),
            )

        # Formats for which at least one file was found.
        valid_source_format = [
            _format for _format, data_files in data_file_by_format.items() if data_files
            ]
        log.info("Valid source formats are: {}".format(valid_source_format))
        if not valid_source_format:
            # Same exception type as the bare ``[0]`` lookup used to raise,
            # but with a message that points at the offending directory.
            raise IndexError(
                "No data file of a supported format found in {}".format(data_directory_path))
        source_format = valid_source_format[0]
        log.info("Using the following format: {}".format(source_format))
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
        collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))
        survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def get_input_data_frame(year):
    """Return the "input" table of survey ``openfisca_data_<year>``.

    The survey is looked up in the "openfisca" collection.  The index of the
    table is reset and six short column names are replaced by their long
    names.
    """
    long_name_by_short_name = {
        'alr': 'pensions_alimentaires_percues',
        'choi': 'chomage_imposable',
        'cho_ld': 'chomeur_longue_duree',
        'fra': 'frais_reels',
        'rsti': 'retraite_imposable',
        'sali': 'salaire_imposable',
        }
    collection = SurveyCollection.load(collection = "openfisca")
    survey = collection.get_survey("openfisca_data_{}".format(year))
    frame = survey.get_values(table = "input")
    frame = frame.reset_index(drop = True)
    frame.rename(columns = long_name_by_short_name, inplace = True)
    return frame
def get_input_data_frame(year):
    """Load the "input" table for ``year`` from the "openfisca" collection.

    The table comes from the survey named ``openfisca_data_<year>``; its index
    is reset and the short column names listed below are renamed.
    """
    survey_name = "openfisca_data_{}".format(year)
    survey = SurveyCollection.load(collection="openfisca").get_survey(survey_name)
    data = survey.get_values(table="input").reset_index(drop=True)

    # short name -> long name
    renaming = dict([
        ('alr', 'pensions_alimentaires_percues'),
        ('choi', 'chomage_imposable'),
        ('cho_ld', 'chomeur_longue_duree'),
        ('fra', 'frais_reels'),
        ('rsti', 'retraite_imposable'),
        ('sali', 'salaire_imposable'),
    ])
    data.rename(columns=renaming, inplace=True)
    return data
def build_merged_dataframes(temporary_store = None, year = None):
    """Read the four ERFS-FPR tables of ``year``, merge them and store the result.

    ``merge_tables`` returns an individual-level and a household-level frame;
    they are written to ``temporary_store`` under ``individus_<year>_post_01``
    and ``menages_<year>``.

    :param temporary_store: mapping-like store receiving the frames (required).
    :param year: survey year (required).
    """
    assert temporary_store is not None
    assert year is not None
    log.debug("Chargement des tables des enquêtes")
    collection = SurveyCollection.load(collection = 'erfs_fpr')
    short_year = str(year)[-2:]  # e.g. "12" for 2012
    erfs_fpr_survey = collection.get_survey('erfs_fpr_{}'.format(year))

    def read_table(table_name):
        return erfs_fpr_survey.get_values(table = table_name)

    fpr_menage = read_table('fpr_menage_{}_retropole'.format(year))
    eec_menage = read_table('fpr_mrf{}e{}t4'.format(short_year, short_year))
    eec_individu = read_table('fpr_irf{}e{}t4'.format(short_year, short_year))
    fpr_individu = read_table('fpr_indiv_{}_retropole'.format(year))

    individus, menages = merge_tables(fpr_menage, eec_menage, eec_individu, fpr_individu, year)

    temporary_store['menages_{}'.format(year)] = menages
    # Drop the household frames before the individual frame is written.
    del eec_menage, fpr_menage, menages
    gc.collect()

    temporary_store['individus_{}_post_01'.format(year)] = individus
    del eec_individu, fpr_individu
def build_other_menage_variables(year = None):
    """Return selected household variables of the Budget des Familles survey.

    Columns are picked from tables "menage" and "c05d" of survey
    ``budget_des_familles_<year>`` and the two selections are merged with the
    default settings of ``DataFrame.merge``.

    :param year: survey year (required).
    :returns: the merged data frame.
    """
    assert year is not None

    collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory
        )
    bdf_survey = collection.get_survey('budget_des_familles_{}'.format(year))

    c05d_columns = [u'ident_men', u'pondmen']
    menage_columns = [u'ident_men', u'pondmen', u'revtot', u'revtotuc', u'decuc']

    c05d = bdf_survey.get_values(table = "c05d")[c05d_columns]
    menage = bdf_survey.get_values(table = "menage")[menage_columns]

    return menage.merge(c05d, copy = True)
def build_homogeneisation_vehicules(year = None):
    """Compute vehicle numbers by type for each household.

    Reads the vehicle information of survey ``budget_des_familles_<year>``
    (table "depmen" for 2000, "automobile" for 2005 and 2011), builds the
    columns ``veh_tot``, ``veh_essence`` and ``veh_diesel``, sums them by
    ``ident_men`` and writes the result to ``temporary_store`` under
    ``automobile_<year>``.

    Only the years 2000, 2005 and 2011 are handled; for any other year no
    table is read and the function fails when it reaches the aggregation.

    :param year: survey year (required).
    """
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    if year == 2000:
        vehicule = survey.get_values(table = "depmen")
        kept_variables = ['ident', 'carbu01', 'carbu02']
        # Work on an independent copy so the new columns are not written into a slice.
        vehicule = vehicule[kept_variables].copy()
        vehicule.rename(
            columns = {'ident': 'ident_men', 'carbu01': 'carbu1', 'carbu02': 'carbu2'},
            inplace = True,
            )
        # Must be the integer 1 (it used to be the string '1'): the column is
        # summed below and a string cannot be counted.
        vehicule["veh_tot"] = 1
        # NOTE(review): fuel codes are compared as strings for 2000 but as
        # integers for 2005/2011 -- confirm against the column types of "depmen".
        vehicule["veh_essence"] = 1*(vehicule['carbu1'] == '1') + 1*(vehicule['carbu2'] == '1')
        vehicule["veh_diesel"] = 1*(vehicule['carbu1'] == '2') + 1*(vehicule['carbu2'] == '2')

    elif year == 2005:
        vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_men', 'carbu']
        vehicule = vehicule[kept_variables].copy()
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)

    elif year == 2011:
        vehicule = survey.get_values(table = "automobile")
        kept_variables = ['ident_me', 'carbu']
        vehicule = vehicule[kept_variables].copy()
        vehicule.rename(columns = {'ident_me': 'ident_men'}, inplace = True)
        vehicule["veh_tot"] = 1
        vehicule["veh_essence"] = (vehicule['carbu'] == 1)
        vehicule["veh_diesel"] = (vehicule['carbu'] == 2)

    # Compute the number of cars by category.  The columns are selected with a
    # list: selecting several columns with a bare tuple is rejected by recent
    # pandas versions.
    vehicule = vehicule.groupby(by = 'ident_men')[["veh_tot", "veh_essence", "veh_diesel"]].sum()

    # Save in temporary store
    temporary_store['automobile_{}'.format(year)] = vehicule
def build_clean_aliss_data_frame():
    """Return table 'Base_ALISS_2011' with ``type`` split into ``age`` and ``revenus``.

    Each of the sixteen household types is mapped to a pair of codes in
    ``range(4)``: rows whose ``type`` starts with the given label receive the
    matching ``age`` and ``revenus`` values.  Both columns start at the
    sentinel 99, so the final assertions fail if any row matched no label.
    The ``type`` column is removed from the returned frame.

    :returns: the cleaned data frame.
    :raises AssertionError: when a row has a ``type`` matching none of the labels.
    """
    year = 2011
    aliss_survey_collection = SurveyCollection.load(
        collection = 'aliss', config_files_directory = config_files_directory
        )
    survey = aliss_survey_collection.get_survey('aliss_{}'.format(year))

    aliss = survey.get_values(table = 'Base_ALISS_2011')
    aliss['age'] = 99
    aliss['revenus'] = 99

    # (label prefix of ``type``, age code, revenus code)
    triplets = [
        ('1 : Jeune/Ais', 0, 3),
        ('2 : Jeune/MoyenSup', 0, 2),
        ('3 : Jeune/MoyenInf', 0, 1),
        ('4 : Jeune/Modeste', 0, 0),
        ('5 : Age Moyen/Ais', 1, 3),
        ('6 : Age Moyen/MoyenSup', 1, 2),
        ('7 : Age Moyen/MoyenInf', 1, 1),
        ('8 : Age Moyen/Modeste', 1, 0),
        ('9 : Age Sup/Ais', 2, 3),
        ('10 : Age Sup/MoyenSup', 2, 2),
        ('11 : Age Sup/MoyenInf', 2, 1),
        ('12 : Age Sup/Modeste', 2, 0),
        ('13 : Vieux/Ais', 3, 3),
        ('14 : Vieux/MoyenSup', 3, 2),
        ('15 : Vieux/MoyenInf', 3, 1),
        ('16 : Vieux/Modeste', 3, 0),
        ]

    for household_type, age, revenus in triplets:
        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("{} {} {}".format(household_type, age, revenus))
        selection = aliss.type.str.startswith(household_type)
        aliss.loc[selection, 'age'] = age
        aliss.loc[selection, 'revenus'] = revenus

    assert aliss.age.isin(range(4)).all()
    assert aliss.revenus.isin(range(4)).all()
    del aliss['type']

    return aliss
Exemple #32
0
def build_merged_dataframes(temporary_store=None, year=None):
    """Merge the ERFS-FPR household and individual tables of ``year``.

    Four tables are read from survey ``erfs_fpr_<year>`` and handed to
    ``merge_tables``.  The resulting household frame is stored under
    ``menages_<year>`` and the individual frame under
    ``individus_<year>_post_01`` in ``temporary_store``.

    :param temporary_store: mapping-like store receiving the frames (required).
    :param year: survey year (required).
    """
    assert temporary_store is not None
    assert year is not None
    log.debug("Chargement des tables des enquêtes")

    two_digits = str(year)[-2:]  # 12 for 2012
    menage_fpr_table = 'fpr_menage_{}_retropole'.format(year)
    menage_eec_table = 'fpr_mrf{}e{}t4'.format(two_digits, two_digits)
    individu_eec_table = 'fpr_irf{}e{}t4'.format(two_digits, two_digits)
    individu_fpr_table = 'fpr_indiv_{}_retropole'.format(year)

    erfs_fpr = SurveyCollection.load(collection='erfs_fpr')
    survey = erfs_fpr.get_survey('erfs_fpr_{}'.format(year))
    fpr_menage = survey.get_values(table=menage_fpr_table)
    eec_menage = survey.get_values(table=menage_eec_table)
    eec_individu = survey.get_values(table=individu_eec_table)
    fpr_individu = survey.get_values(table=individu_fpr_table)

    merged = merge_tables(fpr_menage, eec_menage, eec_individu, fpr_individu,
                          year)
    individus, menages = merged
    del merged

    temporary_store['menages_{}'.format(year)] = menages
    # Household frames are dropped before the individual frame is written.
    del eec_menage, fpr_menage, menages
    gc.collect()
    temporary_store['individus_{}_post_01'.format(year)] = individus
    del eec_individu, fpr_individu
def build_erfs_survey_collection(years = None, erase = False, overwrite = False):
    """Build or update the "erfs" survey collection for the given years.

    For each year the data files found under ``<input>/ERF/ERFS_<year>`` are
    registered as survey ``erfs_<year>``, the collection is dumped to
    ``<collections_directory>/erfs.json`` and the surveys whose name ends with
    the year are passed to ``fill_hdf`` with the 'sas' source format.

    :param years: iterable of years to process (required).
    :param erase: start from an empty collection instead of loading the
        existing one.
    :param overwrite: forwarded to ``fill_hdf``.
    :returns: the survey collection.
    :raises TypeError: when ``years`` is None.
    """
    if years is None:
        log.error("A list of years to process is needed")
        # Fail here rather than further down: iterating over None raised the
        # same exception type later, with a far less helpful message.
        raise TypeError("A list of years to process is needed")

    if erase:
        erfs_survey_collection = SurveyCollection(
            name = "erfs", config_files_directory = config_files_directory)
    else:
        try:
            erfs_survey_collection = SurveyCollection.load(
                collection = 'erfs', config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            erfs_survey_collection = SurveyCollection(
                name = "erfs", config_files_directory = config_files_directory)

    input_data_directory = erfs_survey_collection.config.get('data', 'input_directory')
    # NOTE(review): hard-coded per-user directory layout -- consider moving
    # this to the configuration file.
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)

    for year in years:
        data_directory_path = os.path.join(
            input_data_directory,
            'ERF/ERFS_{}'.format(year)
            )
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'erfs_{}'.format(year)

        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = erfs_survey_collection,
            sas_files = data_file_by_format['sas'],
            )

        collections_directory = erfs_survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "erfs" + ".json")
        erfs_survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in erfs_survey_collection.surveys if survey.name.endswith(str(year))]

        erfs_survey_collection.fill_hdf(source_format = 'sas', surveys = surveys, overwrite = overwrite)
    return erfs_survey_collection
Exemple #34
0
def build_clean_aliss_data_frame():
    """Return table 'Base_ALISS_2011' with ``type`` split into ``age`` and ``revenus``.

    Rows whose ``type`` starts with one of the sixteen labels below receive
    the matching ``age`` and ``revenus`` codes (both in ``range(4)``).  Both
    columns start at the sentinel 99, so the final assertions fail if a row
    matched no label.  The ``type`` column is removed from the returned frame.

    :returns: the cleaned data frame.
    :raises AssertionError: when a row has a ``type`` matching none of the labels.
    """
    year = 2011
    aliss_survey_collection = SurveyCollection.load(
        collection='aliss', config_files_directory=config_files_directory)
    survey = aliss_survey_collection.get_survey('aliss_{}'.format(year))

    aliss = survey.get_values(table='Base_ALISS_2011')
    aliss['age'] = 99
    aliss['revenus'] = 99

    # (label prefix of ``type``, age code, revenus code)
    triplets = [
        ('1 : Jeune/Ais', 0, 3),
        ('2 : Jeune/MoyenSup', 0, 2),
        ('3 : Jeune/MoyenInf', 0, 1),
        ('4 : Jeune/Modeste', 0, 0),
        ('5 : Age Moyen/Ais', 1, 3),
        ('6 : Age Moyen/MoyenSup', 1, 2),
        ('7 : Age Moyen/MoyenInf', 1, 1),
        ('8 : Age Moyen/Modeste', 1, 0),
        ('9 : Age Sup/Ais', 2, 3),
        ('10 : Age Sup/MoyenSup', 2, 2),
        ('11 : Age Sup/MoyenInf', 2, 1),
        ('12 : Age Sup/Modeste', 2, 0),
        ('13 : Vieux/Ais', 3, 3),
        ('14 : Vieux/MoyenSup', 3, 2),
        ('15 : Vieux/MoyenInf', 3, 1),
        ('16 : Vieux/Modeste', 3, 0),
    ]

    for household_type, age, revenus in triplets:
        # A single pre-formatted argument prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("{} {} {}".format(household_type, age, revenus))
        selection = aliss.type.str.startswith(household_type)
        aliss.loc[selection, 'age'] = age
        aliss.loc[selection, 'revenus'] = revenus

    assert aliss.age.isin(range(4)).all()
    assert aliss.revenus.isin(range(4)).all()
    del aliss['type']
    return aliss
 def create(cls, year = None, input_data_frame = None):
     """Build an instance of ``cls`` from input data for ``year``.

     When ``input_data_frame`` is None the "input" table of survey
     ``openfisca_data_<year>`` (collection "openfisca") is used.  In both
     cases the index is reset and the short column names are replaced by
     their long names before ``init_from_data_frame`` is called.

     :param year: year of the data (required).
     :param input_data_frame: optional data frame replacing the stored table.
     :returns: whatever ``cls().init_from_data_frame`` returns.
     """
     assert year is not None
     # short name -> long name
     renamed_columns = dict(
         alr = 'pensions_alimentaires_percues',
         choi = 'chomage_imposable',
         cho_ld = 'chomeur_longue_duree',
         fra = 'frais_reels',
         rsti = 'retraite_imposable',
         sali = 'salaire_imposable',
         )
     openfisca_survey_collection = SurveyCollection.load(
         collection = "openfisca", config_files_directory = config_files_directory)
     openfisca_survey = openfisca_survey_collection.get_survey("openfisca_data_{}".format(year))
     # Compare with None explicitly: the truth value of a DataFrame is
     # ambiguous and raises ValueError.
     if input_data_frame is None:
         input_data_frame = openfisca_survey.get_values(table = "input")
     # ``rename`` must not be called with inplace=True here: it would return
     # None and that None would be passed on as the input data.
     input_data_frame = (input_data_frame
         .reset_index(drop = True)
         .rename(columns = renamed_columns)
         )
     return cls().init_from_data_frame(
         input_data_frame = input_data_frame,
         year = year,
         )
def show(ctx, collection_name, survey_name = None, tables_names = None):
    """Print a survey collection and, optionally, one survey and some of its tables.

    The path of the collection JSON file is read from option
    ``collection_name`` of section "collections" in the configuration file
    named by ``ctx.obj['CONFIG_FILE']``.

    :param ctx: click context whose ``obj`` holds 'CONFIG_FILE'.
    :param collection_name: name of the collection to show.
    :param survey_name: when given, the survey of that name is printed too;
        a message is printed instead when the collection has no such survey.
    :param tables_names: names of the tables of the survey to dump as YAML.
    """
    parser = SafeConfigParser()
    parser.read(ctx.obj['CONFIG_FILE'])
    json_file_path = os.path.abspath(parser.get("collections", collection_name))
    survey_collection = SurveyCollection.load(json_file_path = json_file_path)
    click.echo(survey_collection)
    if survey_name is None:
        return

    # ``next`` with a default yields None for an unknown survey; indexing the
    # filtered list with [0] raised IndexError before the message below could
    # ever be printed.
    survey = next(
        (kept_survey for kept_survey in survey_collection.surveys if kept_survey.name == survey_name),
        None,
        )
    if survey is None:
        # ``surveys`` is iterated as a sequence of surveys, so the names are
        # collected from the items rather than through ``.keys()``.
        survey_names = [kept_survey.name for kept_survey in survey_collection.surveys]
        click.echo("{} is not an element of collection {} surveys ({})".format(
            survey_name, collection_name, str(survey_names).strip('[]')))
        return

    click.echo(survey)
    if tables_names:
        for table_name in tables_names:
            click.echo(yaml.safe_dump(
                {"table {}".format(table_name): survey.tables[table_name]},
                default_flow_style = False,
                ))
def build_survey_collection(name = None, erase_collection_json = False, overwrite_surveys = False,
        data_directory_path_by_year = None, source_format = 'sas'):
    """Build or update survey collection ``name`` from one data directory per year.

    For every ``(year, directory)`` pair a survey named ``<name>_<year>`` is
    added to the collection, the collection is dumped to
    ``<collections_directory>/<name>.json`` and the surveys whose name ends
    with the year are passed to ``fill_hdf``.  A directory that does not exist
    as given is looked up relative to the configured input directory.

    :param name: name of the collection (required).
    :param erase_collection_json: start from an empty collection instead of
        loading the existing one.
    :param overwrite_surveys: forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_year: mapping of year to data directory
        (required).
    :param source_format: key of the data files to register and format passed
        to ``fill_hdf``.
    :returns: the survey collection.
    :raises AssertionError: on a missing required argument or a directory
        that cannot be found.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    # ``keys()`` never returns None, so test for an empty mapping instead.
    if not data_directory_path_by_year:
        log.error("A list of years to process is needed")

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name = name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            survey_collection = SurveyCollection(
                name = name, config_files_directory = config_files_directory)

    # ``items()`` exists on both Python 2 and Python 3 dictionaries.
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            input_data_directory = survey_collection.config.get('data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
            # Check the joined path: the input directory was already checked above.
            assert os.path.isdir(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        # Single-argument call: same output with the Python 2 statement and
        # the Python 3 function.
        print(data_file_by_format)
        survey_name = '{}_{}'.format(name, year)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format[source_format],
            )
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(year))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = overwrite_surveys)
    return survey_collection
def coicop_from_aliss(year = 2011):
    """Derive COICOP codes and labels from the ``souscode`` values of ALISS.

    Each distinct ``souscode`` of table 'Base_ALISS_2011' yields one row: its
    first six characters are the ALISS code, the remainder is the label, and
    the COICOP code is '0' followed by the first four characters joined with
    dots.

    :param year: must be 2011.
    :returns: a data frame with columns ``code_coicop``, ``label`` and
        ``code_aliss``.
    :raises AssertionError: when ``year`` is not 2011 or two products map to
        the same COICOP code.
    """
    assert year == 2011
    collection = SurveyCollection.load(
        collection = 'aliss', config_files_directory = config_files_directory
        )
    aliss_survey = collection.get_survey('aliss_{}'.format(year))
    aliss = aliss_survey.get_values(table = 'Base_ALISS_2011')

    rows = [
        dict(
            code_coicop = '0' + '.'.join(raw_product[:4]),
            label = raw_product[6:],
            code_aliss = raw_product[:6],
            )
        for raw_product in aliss.souscode.unique()
        ]
    result = pandas.DataFrame(rows)

    has_duplicates = result.code_coicop.duplicated().any()
    assert not has_duplicates

    return result
def coicop_from_aliss(year=2011):
    """Build the ALISS-to-COICOP correspondence table.

    One row is produced per distinct ``souscode`` of table 'Base_ALISS_2011':
    ``code_aliss`` holds its first six characters, ``label`` the rest, and
    ``code_coicop`` is '0' followed by its first four characters separated by
    dots.

    :param year: must be 2011.
    :returns: the correspondence data frame.
    :raises AssertionError: when ``year`` is not 2011 or a COICOP code occurs
        more than once.
    """
    assert year == 2011

    def to_entry(souscode):
        # Split one raw product string into its three parts.
        return dict(
            code_coicop='0' + '.'.join(souscode[:4]),
            label=souscode[6:],
            code_aliss=souscode[:6],
        )

    survey = SurveyCollection.load(
        collection='aliss',
        config_files_directory=config_files_directory,
    ).get_survey('aliss_{}'.format(year))
    base_aliss = survey.get_values(table='Base_ALISS_2011')
    distinct_souscodes = base_aliss.souscode.unique()

    result = pandas.DataFrame([to_entry(souscode) for souscode in distinct_souscodes])
    assert not result.code_coicop.duplicated().any()
    return result
 def create(cls, year=None, input_data_frame=None):
     """Build an instance of ``cls`` from input data for ``year``.

     When ``input_data_frame`` is None the "input" table of survey
     ``openfisca_data_<year>`` (collection "openfisca") is used.  In both
     cases the index is reset and the short column names are replaced by
     their long names before ``init_from_data_frame`` is called.

     :param year: year of the data (required).
     :param input_data_frame: optional data frame replacing the stored table.
     :returns: whatever ``cls().init_from_data_frame`` returns.
     """
     assert year is not None
     # short name -> long name
     renamed_columns = dict(
         alr='pensions_alimentaires_percues',
         choi='chomage_imposable',
         cho_ld='chomeur_longue_duree',
         fra='frais_reels',
         rsti='retraite_imposable',
         sali='salaire_imposable',
     )
     openfisca_survey_collection = SurveyCollection.load(
         collection="openfisca",
         config_files_directory=config_files_directory)
     openfisca_survey = openfisca_survey_collection.get_survey(
         "openfisca_data_{}".format(year))
     # Compare with None explicitly: the truth value of a DataFrame is
     # ambiguous and raises ValueError.
     if input_data_frame is None:
         input_data_frame = openfisca_survey.get_values(table="input")
     # ``rename`` must not be called with inplace=True here: it would return
     # None and that None would be passed on as the input data.
     input_data_frame = input_data_frame.reset_index(drop=True).rename(
         columns=renamed_columns)
     return cls().init_from_data_frame(
         input_data_frame=input_data_frame,
         year=year,
     )
def build_survey_collection(collection_name = None, replace_metadata = False, replace_data = False,
        data_directory_path_by_survey_suffix = None, source_format = 'sas'):
    """Build or update a survey collection from one data directory per survey.

    For every ``(suffix, directory)`` pair a survey named
    ``<collection_name>_<suffix>`` is added to the collection, the collection
    is dumped to ``<collections_directory>/<collection_name>.json`` and the
    surveys whose name ends with the suffix are passed to ``fill_hdf``.

    :param collection_name: name of the collection (required).
    :param replace_metadata: start from an empty collection instead of loading
        the existing one.
    :param replace_data: forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_survey_suffix: mapping of survey suffix to
        data directory (required).
    :param source_format: key of the data files to register and format passed
        to ``fill_hdf``.
    :returns: the survey collection.
    :raises AssertionError: on a missing required argument, an invalid data
        directory or a missing collections directory.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None

    if replace_metadata:
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    # ``items()`` exists on both Python 2 and Python 3 dictionaries.
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        # Single-argument call: same output with the Python 2 statement and
        # the Python 3 function.
        print(data_file_by_format)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format[source_format],
            )
        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
        collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))
        survey_collection.dump(json_file_path = collection_json_path)
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def store_input_data_frame(data_frame=None,
                           collection=None,
                           survey=None,
                           table=None):
    """Insert ``data_frame`` as a table of a survey and dump the collection.

    The collection is loaded by name, or created empty when loading fails.
    The survey is taken from the collection when one of that name is already
    listed, otherwise a new one is created with an HDF5 file located next to
    the configured output directory.  The collection is finally dumped to
    ``<collections_directory>/<collection>.json``.

    :param data_frame: data to store (required).
    :param collection: name of the survey collection (required).
    :param survey: name of the survey (required).
    :param table: name of the table; defaults to "input".
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection=collection)
    except Exception:
        # Best effort: any loading failure falls back to an empty collection.
        openfisca_survey_collection = SurveyCollection(name=collection)

    log.debug("In collection {} the following survey are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    if table is None:
        table = "input"

    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                  "{}.h5".format(survey_name))
    available_survey_names = [
        survey_.name for survey_ in openfisca_survey_collection.surveys
    ]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
    survey.insert_table(name=table, data_frame=data_frame)
    # Replace any previous entry of the same name: appending unconditionally
    # listed an already registered survey twice in the dumped collection.
    openfisca_survey_collection.surveys = [
        kept_survey for kept_survey in openfisca_survey_collection.surveys
        if kept_survey.name != survey_name
    ]
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory,
                                  '{}.json'.format(collection))
    log.debug("In collection {} the following surveyx are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def build_homogeneisation_caracteristiques_sociales(temporary_store = None, year = None):
    """Harmonise the household social characteristics of a "Budget des familles" survey wave.

    The household table of the wave ``year`` is loaded, its variables are renamed and recoded so that
    they follow the same nomenclature for every wave, and the resulting data frame, indexed by
    ``ident_men``, is stored in ``temporary_store`` under the key ``'donnes_socio_demog_<year>'``.

    :param temporary_store: store supporting item assignment that receives the harmonised data frame
    :param year: survey wave, one of 1995, 2000, 2005 or 2011
    :raises AssertionError: when an argument is missing, when the wave is not supported, or when one of
        the consistency checks on the data fails
    """
    assert temporary_store is not None
    assert year is not None
    supported_years = (1995, 2000, 2005, 2011)
    assert year in supported_years, "year should be one of {}, got {}".format(supported_years, year)

    # Recodings shared by several waves, as ordered (raw value, harmonised value) pairs.
    # Household types are harmonised on the 2010 nomenclature.
    typmen_recoding = [(1, 1), (2, 3), (3, 4), (4, 4), (5, 4), (6, 2), (7, 5)]
    # Activity status of persons who do not hold a job.
    inactivity_recoding = [(3, 3), (2, 4), (5, 5), (6, 5), (7, 6), (8, 7), (4, 8)]

    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))
    # ******************************************************************************************************************
    # * Step 0-3: harmonisation of the social characteristics of households
    # ******************************************************************************************************************

    if year == 1995:
        kept_variables = [
            'exdep', 'exrev', 'mena', 'v', 'ponderrd', 'nbpers', 'nbenf', 'typmen1', 'cohabpr', 'sexepr',
            'agepr', 'agecj', 'matripr', 'occuppr', 'occupcj', 'nbact', 'sitlog', 'stalog', 'nm14a',
            ]
        menage = survey.get_values(
            table = "socioscm",
            variables = kept_variables,
            )
        # Keep only the households whose data are known to be reliable:
        # exdep = 1 when the expenditure data of the household are properly filled in
        # exrev = 1 when the income data of the household are properly filled in
        # The copy makes the filtered frame independent, so the assignments below never target a slice.
        menage = menage[(menage.exdep == 1) & (menage.exrev == 1)].copy()
        menage.rename(
            columns = {
                'v': 'vag',
                'mena': 'ident_men',
                'ponderrd': 'pondmen',
                'nbpers': 'npers',
                'nm14a': 'nenfants',
                'nbenf': 'nenfhors',
                'nbact': 'nactifs',
                'cohabpr': 'couplepr',
                'matripr': 'etamatri',
                'typmen1': 'typmen'
                },
            inplace = True,
            )
        # vag is used by the QAIDS and AIDS models as a time variable to assign the right monthly price
        menage['agecj'] = menage['agecj'].fillna(0)
        menage['nenfhors'] = menage['nenfhors'].fillna(0)
        menage['vag'] = menage['vag'].astype('int')

        menage['nadultes'] = menage['npers'] - menage['nenfants']
        menage['ocde10'] = 1 + 0.5 * numpy.maximum(0, menage['nadultes'] - 1) + 0.3 * menage['nenfants']

        # Harmonise the household types on the 2010 nomenclature. The raw codes are snapshotted first so
        # that a value recoded by one pair is not recoded again by a later pair.
        raw_typmen = menage['typmen'].copy()
        for raw_value, harmonised_value in typmen_recoding:
            menage.loc[raw_typmen == raw_value, 'typmen'] = harmonised_value

        for var_to_int in ('couplepr', 'etamatri'):
            menage[var_to_int] = menage[var_to_int].astype(int)

        # Method:
        #  1. clean the variables (i.e. change their names and formats)
        #  2. reformat the variables (reassign the categories so that they are identical across years)
        for target, source in (('situacj', 'occupcj'), ('situapr', 'occuppr')):
            menage[target] = 0
            for raw_value, harmonised_value in [(1, 1)] + inactivity_recoding:
                menage.loc[menage[source] == raw_value, target] = harmonised_value

        menage["typlog"] = 0
        menage.loc[menage.sitlog == 1, 'typlog'] = 1
        menage.loc[menage.sitlog != 1, 'typlog'] = 2

        menage['stalog'] = menage['stalog'].astype(int)

        # Every variable used for 1995 comes from the household table loaded above.
        menage.set_index('ident_men', inplace = True)

    if year == 2000:
        menage = survey.get_values(
            table = "menage",
            variables = [
                'ident', 'pondmen', 'nbact', 'nbenf1', 'nbpers', 'ocde10', 'sitlog', 'stalog', 'strate',
                'typmen1', 'zeat', 'vag', 'sexepr', 'sexecj', 'agecj', 'napr', 'nacj', 'cs2pr',
                'cs2cj', 'diegpr', 'dieppr', 'diespr', 'diegcj', 'diepcj', 'diescj', 'hod_nb', 'cohabpr',
                'occupapr', 'occupacj', 'occupbpr', 'occupbcj', 'occupcpr', 'occupccj',
                ]
            )
        menage.rename(
            columns = {
                'cohabpr': 'couplepr',
                'hod_nb': 'nenfhors',
                'ident': 'ident_men',
                'nbact': 'nactifs',
                'nbenf1': 'nenfants',
                'nbpers': 'npers',
                # 'rev81' is not among the variables requested above
                'rev81': 'poste_coicop_421',
                'typmen1': 'typmen'
                },
            inplace = True,
            )
        menage['ocde10'] = menage['ocde10'] / 10
        menage['agecj'] = menage['agecj'].fillna(0)

        assert menage.notnull().all().all(), 'The following variables contains NaN values: {}'.format(
            list(menage.isnull().any()[menage.isnull().any()].index))

        # Give every wave a distinct number to better track changes over time in the demand model:
        # waves 1 to 8 of this survey become 9 to 16.
        raw_vag = menage['vag'].copy()
        for wave in range(1, 9):
            menage.loc[raw_vag == wave, 'vag'] = wave + 8

        # Harmonise the household types on the 2010 nomenclature
        raw_typmen = menage['typmen'].copy()
        for raw_value, harmonised_value in typmen_recoding:
            menage.loc[raw_typmen == raw_value, 'typmen'] = harmonised_value

        menage['couplepr'] = menage['couplepr'].astype('int')
        menage["nadultes"] = menage['npers'] - menage['nenfants']

        menage['typmen'] = menage['typmen'].astype('int')

        # occupa: 1 if the person has a job, 2 otherwise. occupb: 1 if the person actually works, 2 if on
        # long-term leave (ignored here). occupc: from 2 to 8 depending on the status of a person who
        # does not work (student, retired, etc.)
        for person in ('cj', 'pr'):
            target = 'situa' + person
            menage[target] = 0
            menage.loc[menage['occupa' + person] == 1, target] = 1
            for raw_value, harmonised_value in inactivity_recoding:
                menage.loc[menage['occupc' + person] == raw_value, target] = harmonised_value

        for person in ('cj', 'pr'):
            menage['natio' + person] = 0
        for person in ('cj', 'pr'):
            for raw_value, harmonised_value in [(1, 1), (2, 1), (3, 2)]:
                menage.loc[menage['na' + person] == raw_value, 'natio' + person] = harmonised_value

        menage["typlog"] = 0
        menage.loc[menage.sitlog == 1, 'typlog'] = 1
        menage.loc[menage.sitlog != 1, 'typlog'] = 2

        # Harmonisation of the diplomas: chosen equivalences between diploma codes. The three source
        # variables are applied in this order, so a later one overrides an earlier one.
        menage["dip14pr"] = 999999
        diploma_recodings = [
            ('diegpr', [(0, 71), (2, 70), (15, 60), (18, 60), (16, 41), (17, 41), (19, 41)]),
            ('dieppr', [(23, 50), (25, 50), (27, 50), (29, 50), (34, 43), (32, 42), (36, 42)]),
            ('diespr', [(41, 30), (42, 31), (43, 31), (44, 33), (46, 20), (48, 12), (47, 10)]),
            ]
        for source, recoding in diploma_recodings:
            for raw_value, harmonised_value in recoding:
                menage.loc[menage[source] == raw_value, 'dip14pr'] = harmonised_value

        menage.set_index('ident_men', inplace = True)

        # Recode the zeat categories (7, 8, 9 become 6, 7, 8; ascending order avoids cascading)
        for raw_zeat in (7, 8, 9):
            menage.loc[menage.zeat == raw_zeat, 'zeat'] = raw_zeat - 1

        assert menage.zeat.isin(range(1, 9)).all()

        individus = survey.get_values(
            table = "individus",
            variables = ['ident', 'matri', 'lien', 'anais']
            )

        individus = individus.loc[individus.lien == 1].copy()
        individus.rename(
            columns = {'ident': 'ident_men', 'matri': 'etamatri'},
            inplace = True,
            )
        individus['anais'] = individus['anais'].astype('int')
        individus['agepr'] = year - individus.anais
        individus.set_index('ident_men', inplace = True)

        assert menage.notnull().all().all(), 'The following variables contains NaN values: {}'.format(
            list(menage.isnull().any()[menage.isnull().any()].index))

        menage = menage.merge(individus, left_index = True, right_index = True)

    if year == 2005:
        menage = survey.get_values(table = "menage")
        # socio-demographic data
        socio_demo_variables = ['agpr', 'agcj', 'couplepr', 'decuc', 'ident_men', 'nactifs', 'nenfants', 'nenfhors',
            'npers', 'ocde10', 'pondmen', 'sexecj', 'sexepr', 'typmen5', 'vag', 'zeat', 'cs24pr']
        socio_demo_variables += [column for column in menage.columns if column.startswith('dip14')]
        socio_demo_variables += [column for column in menage.columns if column.startswith('natio7')]
        # occupational activity
        activite_prof_variables = ['situacj', 'situapr']
        activite_prof_variables += [column for column in menage.columns if column.startswith('cs42')]
        # housing
        logement_variables = ['htl', 'strate']
        # The copy makes the selection independent, so the assignments below never target a slice.
        menage = menage[socio_demo_variables + activite_prof_variables + logement_variables].copy()
        menage.rename(
            columns = {
                # "agpr": "agepr",
                "agcj": "agecj",
                "typmen5": "typmen",
                "cs24pr": "cs_pr"
                },
            inplace = True,
            )
        # agepr is recomputed below from the individual table
        del menage['agpr']
        menage['nadultes'] = menage.npers - menage.nenfants
        for person in ['pr', 'cj']:
            menage['natio' + person] = (menage['natio7' + person] > 2)  # TODO: change the convention?
            del menage['natio7' + person]

        menage['agecj'] = menage['agecj'].fillna(0)
        menage['nenfhors'] = menage['nenfhors'].fillna(0)
        assert menage.notnull().all().all(), 'The following variables contains NaN values: {}'.format(
            list(menage.isnull().any()[menage.isnull().any()].index))

        menage['couplepr'] = menage['couplepr'] > 2  # TODO: change the convention?
        menage['ocde10'] = menage['ocde10'] / 10
        menage.set_index('ident_men', inplace = True)
        # Give every wave a distinct number to better track changes over time in the demand model:
        # waves 1 to 6 of this survey become 17 to 22.
        raw_vag = menage['vag'].copy()
        for wave in range(1, 7):
            menage.loc[raw_vag == wave, 'vag'] = wave + 16

        # Recode the zeat categories (7, 8, 9 become 6, 7, 8; ascending order avoids cascading)
        for raw_zeat in (7, 8, 9):
            menage.loc[menage.zeat == raw_zeat, 'zeat'] = raw_zeat - 1

        assert menage.zeat.isin(range(1, 9)).all()

        stalog = survey.get_values(table = "depmen", variables = ['ident_men', 'stalog'])
        stalog['stalog'] = stalog.stalog.astype('int').copy()
        stalog['new_stalog'] = 0
        stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1
        stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2
        stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3
        stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4
        stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5
        stalog.stalog = stalog.new_stalog.copy()
        del stalog['new_stalog']

        assert stalog.stalog.isin(range(1, 6)).all()
        stalog.set_index('ident_men', inplace = True)
        menage = menage.merge(stalog, left_index = True, right_index = True)
        menage['typlog'] = 2
        menage.loc[menage.htl.isin(['1', '5']), 'typlog'] = 1
        assert menage.typlog.isin([1, 2]).all()
        del menage['htl']

        individus = survey.get_values(table = 'individu')
        # There is a problem with the year of birth,
        # so the age is recomputed from the year of birth and the survey wave
        individus['agepr'] = year - individus.anais
        individus.loc[individus.vag == 6, ['agepr']] = year + 1 - individus.anais
        # lienpref == 0 selects the reference person
        individus = individus[individus.lienpref == 0].copy()
        kept_variables = ['ident_men', 'etamatri', 'agepr']
        individus = individus[kept_variables].copy()
        individus.loc[individus.etamatri == 0, 'etamatri'] = 1
        individus['etamatri'] = individus['etamatri'].astype('int')  # MBJ TODO: define as a category?
        individus.set_index('ident_men', inplace = True)
        menage = menage.merge(individus, left_index = True, right_index = True)

        individus = survey.get_values(
            table = 'individu',
            variables = ['ident_men', 'ident_ind', 'age', 'anais', 'vag', 'lienpref'],
            )
        # There is a problem with the year of birth,
        # so the age is recomputed from the year of birth and the survey wave
        individus['age'] = year - individus.anais
        individus.loc[individus.vag == 6, ['age']] = year + 1 - individus.anais
        # Keep every household member who is neither the reference person (0) nor the partner (1)
        individus = individus[(individus.lienpref != 0) & (individus.lienpref != 1)].copy()
        individus.sort_values(by = ['ident_men', 'ident_ind'], inplace = True)

        # Number the remaining members of each household 3, 4, 5, ... in ident_ind order
        individus['numero'] = individus.groupby(by = 'ident_men').cumcount() + 3
        pivoted = individus.pivot(index = 'ident_men', columns = "numero", values = 'age')
        pivoted.columns = ["age{}".format(column) for column in pivoted.columns]
        menage = menage.merge(pivoted, left_index = True, right_index = True, how = 'outer')

        individus = survey.get_values(
            table = 'individu',
            variables = ['ident_men', 'ident_ind', 'agfinetu', 'lienpref'],
            )
        individus.set_index('ident_men', inplace = True)
        pr = individus.loc[individus.lienpref == 0, 'agfinetu'].copy()
        conjoint = individus.loc[individus.lienpref == 1, 'agfinetu'].copy()
        conjoint.name = 'agfinetu_cj'
        agfinetu_merged = pandas.concat([pr, conjoint], axis = 1)
        menage = menage.merge(agfinetu_merged, left_index = True, right_index = True)

        # Codebook of the harmonised variables, translated from the Stata-style label definitions that
        # accompanied this step (PR = reference person of the household):
        #   agepr      age of the reference person on 31/12 of the survey year
        #   agecj      age of the PR's partner on 31/12 of the survey year
        #   sexepr     sex of the reference person
        #   sexecj     sex of the PR's partner
        #   cs42pr     socio-professional category of the PR
        #   cs42cj     socio-professional category of the PR's partner
        #   ocde10     number of consumption units (OECD scale)
        #   ident_men  household identifier
        #   pondmen    household weight
        #   npers      total number of persons in the household
        #   nadultes   number of adults in the household
        #   nenfants   number of children in the household
        #   nenfhors   number of children living outside the home
        #   nactifs    number of economically active persons in the household
        #   couplepr   whether the reference person lives in a couple
        #              1 "lives in a couple", 2 "does not live in a couple"
        #   typmen5    household type (5 categories)
        #              1 "single person", 2 "single-parent family", 3 "couple without children",
        #              4 "couple with children", 5 "other type of household (complex)"
        #   etamatri   marital status of the reference person
        #              1 "single", 2 "married", 3 "widowed", 4 "divorced"
        #   situapr    activity status of the reference person
        #   situacj    activity status of the PR's partner
        #              1 "holds a job"
        #              2 "apprentice"
        #              3 "student, pupil, in training"
        #              4 "unemployed (registered with the ANPE or not)"
        #              5 "retired, early-retired or withdrawn from business"
        #              6 "homemaker"
        #              7 "other situation (disabled)"
        #              8 "conscript"
        #   dip14pr    highest diploma of the PR
        #   dip14cj    highest diploma of the PR's partner
        #              10 "third-cycle university degree, doctorate"
        #              12 "engineering degree, grande ecole"
        #              20 "second-cycle university degree"
        #              30 "first-cycle university degree"
        #              31 "BTS, DUT or equivalent"
        #              33 "diploma of the social and health professions, Bac +2 level"
        #              41 "general baccalaureat, brevet superieur, capacite en droit"
        #              42 "technological baccalaureat"
        #              43 "vocational baccalaureat"
        #              44 "brevet professionnel or technician's certificate"
        #              50 "CAP, BEP or diploma of the same level"
        #              60 "brevet des colleges, BEPC"
        #              70 "certificat d'etudes primaires"
        #              71 "no diploma"
        #   natiopr    nationality of the reference person
        #   natiocj    nationality of the PR's partner
        #              1 "French, by birth or naturalisation", 2 "foreigner"
        #   typlog     type of dwelling
        #              1 "house", 2 "flat"
        #   stalog     occupancy status of the dwelling
        #              1 "owner or co-owner"
        #              2 "home buyer (repaying a loan)"
        #              3 "tenant"
        #              4 "subtenant"
        #              5 "housed free of charge"
        #
        # Recoding of the socio-professional categories from 42 to 24 levels, based on the INSEE
        # classification (2003, PCS levels 1 and 2). The mapping below follows the Stata-style
        # "replace cs24pr=... if cs42pr==..." statements that used to be listed here, except that the
        # code "32" is also mapped to 32 whereas those statements only listed "33", "34" and "35".
        menage['cs24pr'] = 0
        csp42s_by_csp24 = {
            10: ["11", "12", "13"],
            21: ["21"],
            22: ["22"],
            23: ["23"],
            31: ["31"],
            32: ["32", "33", "34", "35"],
            36: ["37", "38"],
            41: ["42", "43", "44", "45"],
            46: ["46"],
            47: ["47"],
            48: ["48"],
            51: ["52", "53"],
            54: ["54"],
            55: ["55"],
            56: ["56"],
            61: ["62", "63", "64", "65"],
            66: ["67", "68"],
            69: ["69"],
            71: ["71"],
            72: ["72"],
            73: ["74", "75"],
            76: ["77", "78"],
            81: ["81"],
            82: ["83", "84", "85", "86", "**", "00"],
            }
        # The cs42 codes of the different cs24 levels are disjoint, so the iteration order is irrelevant
        for csp24, csp42s in csp42s_by_csp24.items():
            menage.loc[menage.cs42pr.isin(csp42s), 'cs24pr'] = csp24
        assert menage.cs24pr.isin(list(csp42s_by_csp24)).all()

        # The 8-level category is the tens digit of the 24-level one
        menage['cs8pr'] = numpy.floor(menage.cs24pr / 10)
        assert menage.cs8pr.isin(range(1, 9)).all()

        variables = [
            'pondmen', 'npers', 'nenfants', 'nenfhors', 'nadultes', 'nactifs', 'ocde10', 'typmen',
            'sexepr', 'agepr', 'etamatri', 'couplepr', 'situapr', 'dip14pr', 'cs42pr', 'cs24pr', 'cs8pr', 'natiopr',
            'sexecj', 'agecj', 'situacj', 'dip14cj', 'cs42cj', 'natiocj', 'typlog', 'stalog'
            ] + ["age{}".format(age) for age in range(3, 14)]

        for variable in variables:
            assert variable in menage.columns, "{} is not a column of menage data frame".format(variable)

    if year == 2011:
        menage_variables = [
            'agecj',
            'agepr',
            'coeffuc',
            'decuc1',
            'ident_me',
            'pondmen',
            'npers',
            'nenfants',
            'nactifs',
            'sexepr',
            'sexecj',
            'dip14cj',
            'dip14pr',
            'typmen5',
            'cataeu',
            'situapr',
            'situacj',
            'zeat',
            ]

        # The table is looked up under its upper-case name first, then under its lower-case name
        try:
            menage = survey.get_values(table = "MENAGE", variables = menage_variables)
        except Exception:
            menage = survey.get_values(table = "menage", variables = menage_variables)

        menage.rename(
            columns = {
                'ident_me': 'ident_men',
                'coeffuc': 'ocde10',
                'typmen5': 'typmen',
                'decuc1': 'decuc',
                'cataeu': 'strate'
                },
            inplace = True,
            )
        menage['agecj'] = menage['agecj'].fillna(0)
        # Add the vag variable
        try:
            depmen = survey.get_values(table = "DEPMEN")
        except Exception:
            depmen = survey.get_values(table = "depmen")
        depmen.rename(columns = {'ident_me': 'ident_men'}, inplace = True)
        vague = depmen[['vag', 'ident_men']].copy()
        stalog = depmen[['stalog', 'ident_men']].copy()
        del depmen

        menage.set_index('ident_men', inplace = True)
        vague.set_index('ident_men', inplace = True)
        menage = menage.merge(vague, left_index = True, right_index = True)
        # Give every wave a distinct number to better track changes over time in the demand model:
        # waves 1 to 6 of this survey become 23 to 28.
        raw_vag = menage['vag'].copy()
        for wave in range(1, 7):
            menage.loc[raw_vag == wave, 'vag'] = wave + 22

        # Harmonise the dwelling occupancy status, which takes different values in 2011
        stalog['stalog'] = stalog.stalog.astype('int').copy()
        stalog['new_stalog'] = 0
        stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1
        stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2
        stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3
        stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4
        stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5
        stalog.stalog = stalog.new_stalog.copy()
        del stalog['new_stalog']
        assert stalog.stalog.isin(range(1, 6)).all()
        stalog.set_index('ident_men', inplace = True)
        menage = menage.merge(stalog, left_index = True, right_index = True)

        # Recode the zeat categories (7, 8, 9 become 6, 7, 8; ascending order avoids cascading)
        for raw_zeat in (7, 8, 9):
            menage.loc[menage.zeat == raw_zeat, 'zeat'] = raw_zeat - 1
        assert menage.zeat.isin(range(0, 9)).all()
        menage.index.name = 'ident_men'

    # Every branch must leave a frame indexed by the household identifier
    assert menage.index.name == 'ident_men'
    menage['role_menage'] = 0
    temporary_store['donnes_socio_demog_{}'.format(year)] = menage
def create_comparable_logement_data_frame(temporary_store=None, year=None):
    """Prepare the housing survey (enquête logement) table with recoded covariates.

    Loads the ``menage`` table (falling back to ``menage1``), the ``logement``
    table for the 2006 vintage, and the ``adresse`` table of the 'logement'
    survey collection. Rows are restricted to households whose ``sec1`` code is
    21, 22, 23, 24 or 30, weighted deciles of ``nvpr`` are computed, and the
    categorical variables are recoded.

    Parameters
    ----------
    temporary_store :
        Must not be None. It is not otherwise used by this function.
    year : int
        2003 selects the 2003 housing survey; any year after 2005 selects the
        2006 housing survey.

    Returns
    -------
    DataFrame
        A copy restricted to the columns ``deci``, ``hnph2``, ``iaat_bis``,
        ``lmlm``, ``magtr``, ``mcs8``, ``mdiplo``, ``mtybd``, ``qex``,
        ``statut_occupation`` and ``tu99_recoded``.

    Raises
    ------
    ValueError
        If ``year`` maps to no housing survey vintage (neither 2003 nor > 2005).
    AssertionError
        If an argument is None or one of the consistency checks on the data fails.
    """
    assert temporary_store is not None
    assert year is not None

    # Fail early and explicitly instead of hitting an unbound ``year_lgt`` later on.
    if year == 2003:
        year_lgt = 2003
    elif year > 2005:  # and year < 2010:
        year_lgt = 2006
    else:
        raise ValueError(
            "No housing survey is available for year {}: expected 2003 or a year after 2005".format(year))

    logement_adresse_variables = ["gzc2"]
    logement_menage_variables = [
        "maa1at",
        "magtr",
        "mcs8",
        "mdiplo",
        "mrcho",
        "mrret",
        "mrsal",
        "mrtns",
        "mtybd",
        "muc1",
        "qex",
        "sec1",
    ]
    if year_lgt == 2003:
        logement_menage_variables.extend(
            ["hnph2", "ident", "lmlm", "mnatior", "typse"])
        logement_adresse_variables.extend(["iaat", "ident", "tu99"])
    else:
        # No typse variable in 2006
        logement_menage_variables.extend(["idlog", "mnatio"])
        logement_adresse_variables.extend(["idlog"])
        logement_logement_variables = [
            "hnph2", "iaat", "idlog", "lmlm", "tu99"
        ]

    logement_survey_collection = SurveyCollection.load(collection='logement')
    logement_survey = logement_survey_collection.get_survey(
        'logement_{}'.format(year_lgt))

    # Household (menage) table
    log.info("Preparing logement menage table")
    try:
        logement_menage = logement_survey.get_values(
            table="menage", variables=logement_menage_variables)
    except Exception:
        # The table is named "menage1" in some stored versions of the survey
        logement_menage = logement_survey.get_values(
            table="menage1", variables=logement_menage_variables)

    logement_menage.rename(columns={'idlog': 'ident'}, inplace=True)

    # Assign the filled column back: an in-place fillna on a selected column is
    # not guaranteed to modify the parent frame.
    for revenus in ['mrcho', 'mrret', 'mrsal', 'mrtns']:
        logement_menage[revenus] = logement_menage[revenus].fillna(0)

    logement_menage['revtot'] = (logement_menage.mrcho +
                                 logement_menage.mrret +
                                 logement_menage.mrsal + logement_menage.mrtns)
    # TODO: drop negative incomes? mrtns: 118 negative incomes out of 42845 in 2006
    assert logement_menage.revtot.notnull().all()
    logement_menage['nvpr'] = (
        10.0 * logement_menage['revtot'] / logement_menage['muc1'])

    assert logement_menage.qex.notnull().all()
    assert (logement_menage.qex > 0).all()

    dec, values = mark_weighted_percentiles(
        logement_menage['nvpr'].values,
        numpy.arange(1, 11),
        logement_menage['qex'].values,
        2,
        return_quantiles=True,
    )
    values.sort()
    # Decile = 1 + number of thresholds values[1] .. values[9] strictly below nvpr
    logement_menage['deci'] = 1 + sum(
        (logement_menage.nvpr > values[rank]) for rank in range(1, 10))

    del dec, values
    assert logement_menage['deci'].isin(range(
        1, 11)).all(), "Logement decile are out of range'"
    gc.collect()

    if year_lgt == 2006:
        # Dwelling (logement) table, only needed for the 2006 vintage
        log.info('Preparing logement logement table')
        try:
            lgtlgt = logement_survey.get_values(
                table="lgt_logt", variables=logement_logement_variables)
        except Exception:
            lgtlgt = logement_survey.get_values(
                table="logement", variables=logement_logement_variables)

        lgtlgt.rename(columns={'idlog': 'ident'}, inplace=True)
        logement_menage = logement_menage.merge(lgtlgt,
                                                left_on='ident',
                                                right_on='ident',
                                                how='inner')
        del lgtlgt

    data = logement_menage.loc[logement_menage.sec1.isin([21, 22, 23, 24,
                                                          30])].copy()
    del logement_menage
    gc.collect()

    if year_lgt == 2006:
        data.rename(columns={'mnatio': 'mnatior'}, inplace=True)

    data = data.loc[data.mnatior.notnull()].copy()
    data = data.loc[data.sec1.notnull()].copy()
    data['tmp'] = data.sec1.astype("int")
    data.loc[data.sec1.isin([21, 22, 23]), 'tmp'] = 3
    data.loc[data.sec1 == 24, 'tmp'] = 4
    data.loc[data.sec1 == 30, 'tmp'] = 5
    data['statut_occupation'] = data.tmp
    count_NA('statut_occupation', data)
    logement_menage = data[data.statut_occupation.notnull()].copy()

    # Address (adresse) table
    log.info(u"Préparation de la table adresse de l'enquête logement")
    logement_adresse = logement_survey.get_values(
        table="adresse", variables=logement_adresse_variables)
    logement_adresse.rename(columns={'idlog': 'ident'}, inplace=True)

    log.info(u"Fusion des tables logement et ménage de l'enquête logement")
    logement = logement_menage.merge(logement_adresse, on='ident', how='inner')

    # Clip hnph2 to the 1..6 range
    logement.loc[logement.hnph2 >= 6, 'hnph2'] = 6
    logement.loc[logement.hnph2 < 1, 'hnph2'] = 1
    count_NA('hnph2', logement)
    # Every value must be present (the original ``.any()`` only required one)
    assert logement.hnph2.notnull().all(), "Some hnph2 are null"

    # Same "within" step as in the R code from here on.
    # TODO: codes written 07 in the source become 7 here (Python reads 0n as an
    # octal literal); hopefully this does not matter.
    logement.loc[logement.mnatior.isin([0, 1]), 'mnatior'] = 1
    logement.loc[logement.mnatior.isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
                 'mnatior'] = 2
    count_NA('mnatior', logement)
    assert_variable_in_range('mnatior', [1, 3], logement)

    logement['iaat_bis'] = 0
    logement.loc[logement.iaat.isin([1, 2, 3, 4, 5]),
                 'iaat_bis'] = 1  # before 1967
    logement.loc[logement.iaat == 6, 'iaat_bis'] = 2  # 1968 - 1974
    logement.loc[logement.iaat == 7, 'iaat_bis'] = 3  # 1975 - 1981
    logement.loc[logement.iaat == 8, 'iaat_bis'] = 4  # 1982 - 1989
    logement.loc[logement.iaat == 9, 'iaat_bis'] = 5  # 1990 - 1998
    logement.loc[logement.iaat == 10, 'iaat_bis'] = 6  # after 1999
    assert logement.iaat_bis.isin(range(1, 7)).all()

    # mdiplo is recoded in place, so the order of the statements matters
    # (code 1 stays 1).
    logement.loc[logement.mdiplo.isin([2, 3, 4]), 'mdiplo'] = 2
    logement.loc[logement.mdiplo.isin([5, 6, 7, 8]), 'mdiplo'] = 3
    logement.loc[logement.mdiplo == 9, 'mdiplo'] = 4
    logement.loc[logement.mdiplo.isnull(), 'mdiplo'] = 0
    # TODO: assert logement.mdiplo.isin(range(1, 5)).all()
    assert logement.mdiplo.isin(range(0, 5)).all()
    logement.mdiplo = logement.mdiplo.astype('int')

    logement.loc[logement.mtybd == 110, 'mtybd'] = 1
    logement.loc[logement.mtybd == 120, 'mtybd'] = 2
    logement.loc[logement.mtybd == 200, 'mtybd'] = 3
    logement.loc[logement.mtybd.isin([311, 321, 401]), 'mtybd'] = 4
    logement.loc[logement.mtybd.isin([312, 322, 402]), 'mtybd'] = 5
    logement.loc[logement.mtybd.isin([313, 323, 403]), 'mtybd'] = 6
    logement.loc[logement.mtybd == 400, 'mtybd'] = 7
    assert logement.mtybd.isin(range(1, 8)).all()
    logement.mtybd = logement.mtybd.astype('int')

    logement['tu99_recoded'] = logement.tu99.copy()
    count_NA('tu99', logement)
    logement.loc[logement.tu99 == 0, 'tu99_recoded'] = 1
    logement.loc[logement.tu99.isin([1, 2, 3]), 'tu99_recoded'] = 2
    logement.loc[logement.tu99.isin([4, 5, 6]), 'tu99_recoded'] = 3
    logement.loc[logement.tu99 == 7, 'tu99_recoded'] = 4
    logement.loc[logement.tu99 == 8, 'tu99_recoded'] = 5
    count_NA('tu99_recoded', logement)
    assert_variable_in_range('tu99_recoded', [1, 6], logement)

    # gzc2 code 1 stays 1
    logement.loc[logement.gzc2.isin([2, 3, 4, 5, 6]), 'gzc2'] = 2
    logement.loc[logement.gzc2 == 7, 'gzc2'] = 3
    count_NA('gzc2', logement)
    # TODO: assert_variable_in_range('gzc2', [1, 4], logement)

    # In-place recode: [1, 2] must be collapsed before [3, 4] is mapped to 2
    logement.loc[logement.magtr.isin([1, 2]), 'magtr'] = 1
    logement.loc[logement.magtr.isin([3, 4]), 'magtr'] = 2
    logement.loc[logement.magtr == 5, 'magtr'] = 3
    assert logement.magtr.isin(range(1, 4)).all()

    # mcs8 codes 1, 2 and 3 are kept unchanged
    logement.loc[logement.mcs8.isin([4, 8]), 'mcs8'] = 4
    logement.loc[logement.mcs8.isin([5, 6, 7]), 'mcs8'] = 5
    assert logement.mcs8.isin(range(1, 6)).all()

    # NOTE(review): logloy is computed but is not among the returned columns.
    logement['logloy'] = numpy.log(logement['lmlm'].values)
    kept_variables = [
        'deci',
        'hnph2',
        'iaat_bis',
        'lmlm',
        'magtr',
        'mcs8',
        'mdiplo',
        'mtybd',
        'qex',
        'statut_occupation',
        'tu99_recoded',
    ]
    return logement[kept_variables].copy()
        'deci',
        'hnph2',
        'iaat_bis',
        'lmlm',
        'magtr',
        'mcs8',
        'mdiplo',
        'mtybd',
        'qex',
        'statut_occupation',
        'tu99_recoded',
        # 'ident',
    ]

    logement = Logement[kept_variables].copy()
    # logement.rename(columns = {'qex': 'wprm'}, inplace = True)
    return logement


# Script entry point: rebuild the merged ERFS-FPR data frames for one year, then
# run the rent imputation merge on the Stata file found in the configured
# output directory.
if __name__ == '__main__':
    import sys
    # Send INFO-level (and above) log records to standard output
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
    year = 2012
    from openfisca_france_data.erfs_fpr.input_data_builder import step_01_preprocessing
    step_01_preprocessing.build_merged_dataframes(year=year)
    # The collection is only instantiated here to read its configuration
    openfisca_survey_collection = SurveyCollection(name='openfisca')
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    stata_file = os.path.join(output_data_directory, 'log_men_ERFS.dta')
    # merge_imputation_loyer is defined elsewhere in this file
    menages = merge_imputation_loyer(stata_file=stata_file, year=year)
# Exemple #46
# 0
def build_depenses_homogenisees(temporary_store=None, year=None):
    """Build household consumption tables by COICOP item and by COICOP division.

    Reads the 'budget_des_familles_<year>' survey, brings the expenditure table
    of each vintage to a common layout (one row per ``ident_men``, one column per
    survey item, plus the ``pondmen`` weight), then aggregates the item columns
    by COICOP code and by the first two digits of the normalized code.

    Two data frames are written to ``temporary_store``:

    * ``depenses_<year>``: one ``poste_coicop_<code>`` column per COICOP code,
      plus ``pondmen``;
    * ``depenses_by_grosposte_<year>``: one ``coicop12_<n>`` column per two-digit
      group, plus ``pondmen``.

    Parameters
    ----------
    temporary_store :
        Mapping-like store receiving the two data frames; must not be None.
    year : int
        Survey year; one of 1995, 2000, 2005 or 2011.

    Raises
    ------
    ValueError
        If ``year`` is not one of the supported survey years.
    AssertionError
        If an argument is None.
    """
    assert temporary_store is not None
    assert year is not None
    # Without this check an unsupported year only failed later on an unbound
    # ``conso`` variable.
    if year not in (1995, 2000, 2005, 2011):
        raise ValueError(
            "Unsupported budget des familles year {}: expected 1995, 2000, 2005 or 2011".format(year))

    bdf_survey_collection = SurveyCollection.load(
        collection='budget_des_familles',
        config_files_directory=config_files_directory)
    survey = bdf_survey_collection.get_survey(
        'budget_des_familles_{}'.format(year))

    # Homogenisation of the expenditure tables

    if year == 1995:
        socioscm = survey.get_values(table="socioscm")
        # Keep only households whose data are deemed reliable:
        # exdep == 1 when expenditures are properly filled in,
        # exrev == 1 when incomes are properly filled in.
        fiable = (socioscm.exdep == 1) & (socioscm.exrev == 1)
        poids = socioscm.loc[fiable, ['mena', 'ponderrd']].rename(columns={
            'mena': 'ident_men',
            'ponderrd': 'pondmen',
        })
        poids = poids.set_index('ident_men')

        conso = survey.get_values(table="depnom")
        conso = conso[["valeur", "montant", "mena", "nomen5"]]
        conso = conso.groupby(["mena", "nomen5"]).sum().reset_index()
        conso = conso.rename(columns={
            'mena': 'ident_men',
            'nomen5': 'poste{}'.format(year),
            'valeur': 'depense',
            'montant': 'depense_avt_imput',
        })

        # Conversion from francs to euros
        conso['depense'] = conso['depense'] / 6.55957
        conso['depense_avt_imput'] = conso['depense_avt_imput'] / 6.55957
        conso_small = conso[[u'ident_men', u'poste1995', u'depense']]

        # One row per household, one column per item
        conso_unstacked = conso_small.set_index(
            ['ident_men', 'poste1995']).unstack('poste1995')
        conso_unstacked = conso_unstacked.fillna(0)

        # Keep only the item level of the column MultiIndex. get_level_values
        # replaces the ``levels[labels]`` lookup: MultiIndex.labels no longer
        # exists in recent pandas.
        conso_unstacked.columns = conso_unstacked.columns.get_level_values(1)
        # NOTE(review): this relabels an index *value* equal to 0, it does not
        # name the index -- kept as in the original, confirm the intent.
        conso_unstacked.rename(index={0: 'ident_men'}, inplace=True)
        conso = conso_unstacked.merge(poids, left_index=True, right_index=True)
        conso = conso.reset_index()

    if year == 2000:
        conso = survey.get_values(table="consomen")
        conso.rename(columns={'ident': 'ident_men'}, inplace=True)
        # Drop the aggregated columns, only elementary items are kept
        agregats = ['ctotale', 'c99', 'c99999'] + \
            ["c0{}".format(i) for i in range(1, 10)] + \
            ["c{}".format(i) for i in range(10, 14)]
        for variable in agregats:
            del conso[variable]

    if year == 2005:
        conso = survey.get_values(table="c05d")

    if year == 2011:
        try:
            conso = survey.get_values(table="C05")
        except Exception:
            # The table name is lower-cased in some stored versions of the survey
            conso = survey.get_values(table="c05")
        conso.rename(columns={'ident_me': 'ident_men'}, inplace=True)
        del conso['ctot']

    # Grouping by coicop

    poids = conso[['ident_men', 'pondmen']].copy()
    poids.set_index('ident_men', inplace=True)
    conso.drop('pondmen', axis=1, inplace=True)
    conso.set_index('ident_men', inplace=True)

    matrice_passage_data_frame, selected_parametres_fiscalite_data_frame = get_transfert_data_frames(
        year)

    # Survey item -> COICOP code lookup
    poste_column = 'poste{}'.format(year)
    coicop_by_poste_bdf = matrice_passage_data_frame[
        [poste_column, 'posteCOICOP']
    ].set_index(poste_column)['posteCOICOP'].to_dict()

    def reformat_consumption_column_coicop(coicop):
        """Turn a column name such as 'c01111' into the integer 1111 (NaN on failure)."""
        try:
            return int(coicop.replace('c', '').lstrip('0'))
        except (AttributeError, TypeError, ValueError):
            return numpy.nan

    # The 1995 columns are already item codes; for the other years the column
    # names have to be converted before the lookup.
    if year == 1995:
        coicop_labels = [
            normalize_code_coicop(coicop_by_poste_bdf.get(poste_bdf))
            for poste_bdf in conso.columns
        ]
    else:
        coicop_labels = [
            normalize_code_coicop(
                coicop_by_poste_bdf.get(
                    reformat_consumption_column_coicop(poste_bdf)))
            for poste_bdf in conso.columns
        ]
    # Materialise the pairs: zip is a one-shot iterator on Python 3
    tuples = list(zip(coicop_labels, conso.columns))
    conso.columns = pandas.MultiIndex.from_tuples(
        tuples, names=['coicop', poste_column])
    coicop_data_frame = conso.groupby(level=0, axis=1).sum()

    depenses = coicop_data_frame.merge(poids,
                                       left_index=True,
                                       right_index=True)

    # Broad groups (first two digits of the code) on which the calibration is done
    def select_gros_postes(coicop):
        """Return the first two digits of the normalized COICOP code as an int."""
        try:
            coicop = unicode(coicop)
        except Exception:
            # ``unicode`` does not exist on Python 3: keep the value unchanged
            pass
        normalized_coicop = normalize_code_coicop(coicop)
        return int(normalized_coicop[0:2])

    grospostes = [
        select_gros_postes(coicop) for coicop in coicop_data_frame.columns
    ]
    tuples_gros_poste = list(zip(coicop_data_frame.columns, grospostes))
    coicop_data_frame.columns = pandas.MultiIndex.from_tuples(
        tuples_gros_poste, names=['coicop', 'grosposte'])

    depenses_by_grosposte = coicop_data_frame.groupby(level=1, axis=1).sum()
    depenses_by_grosposte = depenses_by_grosposte.merge(poids,
                                                        left_index=True,
                                                        right_index=True)

    # TODO : understand why it does not work: depenses.rename(columns = {u'0421': 'poste_coicop_421'}, inplace = True)

    # The three passes are kept sequential on purpose: each one works on the
    # column names produced by the previous one.
    # 1. strip one trailing zero
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        if code[-1:] == '0':
            depenses.rename(columns={code: code[:-1]}, inplace=True)
    # 2. strip one leading zero
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        if code[0:1] == '0':
            depenses.rename(columns={code: code[1:]}, inplace=True)
    # 3. prefix the remaining numeric names
    produits = [column for column in depenses.columns if column.isdigit()]
    for code in produits:
        depenses.rename(columns={code: 'poste_coicop_' + code}, inplace=True)

    temporary_store['depenses_{}'.format(year)] = depenses

    depenses_by_grosposte.columns = depenses_by_grosposte.columns.astype(str)
    liste_grospostes = [
        column for column in depenses_by_grosposte.columns if column.isdigit()
    ]
    for grosposte in liste_grospostes:
        depenses_by_grosposte.rename(
            columns={grosposte: 'coicop12_' + grosposte}, inplace=True)

    temporary_store['depenses_by_grosposte_{}'.format(
        year)] = depenses_by_grosposte
def create_fip(temporary_store = None, year = None):
    """Create the 'fipDat' table of dependents that only exist on tax forms.

    fip stands for "fichier d'imposition des personnes". Some individuals are
    declared as 'personne à charge' (pac) on tax forms but are not present in the
    erf or eec tables. They are added to ensure consistency between concepts.

    The function reads ``temporary_store['indivim_<year>']`` (and adds a
    ``noidec`` column to it) and writes two data frames to ``temporary_store``:

    * ``pacIndiv_<year>``: the pac of the tax forms matched with a ``noindiv``
      of the individual table, one row per ``noindiv``;
    * ``fipDat_<year>``: the pac of the tax forms matching no individual,
      attached to the household of the declarant ('vous').

    Parameters
    ----------
    temporary_store :
        Mapping-like store; must not be None.
    year : int
        ERFS survey year; must not be None.

    Raises
    ------
    AssertionError
        If an argument is None or one of the consistency checks on the data fails.
    """
    assert temporary_store is not None
    assert year is not None

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    # Plain str conversion: missing values become the text 'nan', which the
    # filter of step 1.2 relies on (its sliced year part reads 'an').
    foyer['anaisenf'] = foyer['anaisenf'].astype(str)
    # Each pac takes 5 characters: one letter and a 4-digit year of birth
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration":
    # one (declaration, type_pac, naia) triple of columns per pac slot.
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        multi_index_columns += [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia'),
            ]

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # The random content is only a placeholder: every column is overwritten below
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    # One row per (foyer, pac slot)
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    # sort_values replaces DataFrame.sort, which no longer exists in pandas
    fip = fip.sort_values(by = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Only work on F & G

    # keep = 'last' is the current spelling of the removed take_last = True
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], keep = 'last')
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: rows with a distinct (declaration, naia) pair and twins are kept,
    #       the others (declared both as F and G) are removed
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], keep = 'last')
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    # Rows that are neither F/G nor H/I are kept; force a boolean dtype so the
    # column can be used as a mask whatever dtype update() left behind.
    fip['to_keep'] = fip['to_keep'].fillna(True).astype(bool)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Matching key: (year of birth, first 29 characters of the declaration id).
    # The pairs are materialised because zip is a one-shot iterator on Python 3.
    pac['key1'] = list(zip(pac.naia, pac['declar1'].str[:29]))
    pac['key2'] = list(zip(pac.naia, pac['declar2'].str[:29]))
    indivifip['key'] = list(zip(indivifip.naia.values, indivifip['declaration'].str[:29].values))
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    # The fip are the tax-form pac matching no pac of the individual table
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))

    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
            # Bind pacInd to an empty frame that still has the expected columns;
            # it was left unbound here, so the assert below raised NameError.
            pacInd = pac_ind1
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))

    log.info(u"    2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    # Keep a single row per individual
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    # NOTE(review): DataFrame.info() prints its report and returns None, so the
    # log record itself reads "None".
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip = concat([fip1, fip2])

    # Fill the individual-level variables of the fip; the assignment order is
    # kept because it defines the column order of the stored table.
    fip['persfip'] = 'pac'
    fip['year'] = float(year)
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec of fip children aged 16 to 20: we cannot tell whether they are students or employees
    # TODO: the birth months of fip children cannot be recovered (Alexis: clearly not)

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    # Decrement noi of the duplicated (noi, ident) rows until every pair is unique
    dup = fip.duplicated(subset = ['noi', 'ident'])
    while dup.any():
        log.info("{}".format(dup.sum()))
        fip.loc[dup, 'noi'] = (fip.loc[dup, 'noi'].astype('int64') - 1).values
        dup = fip.duplicated(subset = ['noi', 'ident'])

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    log.info(u"fip sauvegardé")
# Exemple #48
# 0
def create_fip(year = None):
    """
    Creates a 'fipDat' table containing all these 'fip individuals'

    Some individuals are declared as 'personne à charge' (pac) on tax forms
    but are not present in the erf or eec tables.  They are added to ensure
    consistency between concepts.

    The function reads the ``declar`` and ``anaisenf`` variables of the
    "foyer" table of the ``erfs_{year}`` survey and the ``indivim_{year}``
    table of the "erfs" temporary store.  It writes two tables back into that
    temporary store:

    - ``pacIndiv_{year}``: one row per ``noindiv`` of the pac that could be
      matched with the tax forms, with their ``type_pac`` and ``naia``;
    - ``fipDat_{year}``: the pac found on the tax forms only.

    :param year: year of the survey to process (required).
    :raises AssertionError: if ``year`` is None, if an unknown pac type is
        met, or if ``naia`` has missing values after cleaning.
    """
    assert year is not None
    # fip: "fichier d'imposition des personnes" (personal income-tax file)

    temporary_store = TemporaryStore.create(file_name = "erfs")

    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing the letter code of each pac (F,G,H,I,J,N,R) followed by the year of birth
    # (example: 'F1990H1992'); each pac therefore uses 5 characters.
    # When a child is disabled, he appears twice in anaisenf (example: F1990G1990 is a single disabled child
    # born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    multi_index_columns = []
    for i in range(1, nb_pac_max + 1):
        multi_index_columns += [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia'),
            ]

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # The random values are only placeholders: every column is overwritten in the loop below.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    # NOTE: DataFrame.info() prints its report itself and returns None, so "None" is what gets logged here.
    log.info("{}".format(fip.info()))

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        # First character of the i-th 5-character chunk: the pac letter code
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        # Remaining four characters of the chunk: the year of birth
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    # One row per (foyer, pac_number)
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    # sort_values replaces the removed DataFrame.sort(columns=...)
    fip = fip.sort_values(by = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    # TODO: add box I: "of which children holding the disability card"
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), "Certains type de PAC sont inconnus"
    # TODO: find a more explicit message

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filter to work on F & G only

    # keep='last' replaces the removed take_last=True keyword
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], keep = 'last')
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep the rows having distinct declar/naia pairs and the twins,
    #       then we drop the others (those that are both F and G)
    log.info(u"longueur fip {}".format(len(fip)))

    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], keep = 'last')
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    # Rows that are neither F/G nor H/I were never flagged: keep them
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(len(fip[fip['to_keep']]), len(fip)))

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Matching keys are (year of birth, first 29 characters of the declaration identifier).
    # list(...) makes the assignment independent of zip returning a list (Python 2) or an iterator (Python 3).
    pac['key1'] = list(zip(pac.naia, pac['declar1'].str[:29]))
    pac['key2'] = list(zip(pac.naia, pac['declar2'].str[:29]))
    indivifip['key'] = list(zip(indivifip.naia.values, indivifip['declaration'].str[:29].values))
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    # The fip individuals are the pac of the tax forms matching neither key of the pac already known
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")

    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
            # Both merges are empty: use the empty frame (it still carries the expected columns) so that
            # pacInd is always defined for the statements below instead of raising a NameError.
            pacInd = pac_ind1
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))

    log.info(u"    2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    # Keep a single row per noindiv (the first one met)
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

    # We keep the fip in the menage of their parents because it is used to
    # build the famille. We should build an individual ident (ménage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec

    # Reference R code this section was ported from:
    # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
    # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
    # individec1 <- upData(individec1,rename=c(declar1="declar"))
    # fip1       <- merge(fip,individec1)
    # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    # NOTE: as above, DataFrame.info() returns None, so "None" is what gets logged here.
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

    # Reference R code (marked there as "TODO: declar2 are not handled for now"):
    # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
    # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
    # individec2 <- upData(individec2,rename=c(declar2="declar"))
    # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    # 'declaration' is the only column shared by both frames; naming it keeps this merge in line with fip1
    fip2 = fip.merge(individec2, on = 'declaration')
    log.info(u"    2.4 : fip2 created")

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG; no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']  # placeholder, recomputed below from ident and noi
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    # lpr is 3 up to 20 years old and 4 above
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['agepr'] = None
    # actrec is 9 up to 15 years old and 5 above
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

    # TODO: actrec issue for fip children between 16 and 20 years old: we do not know whether they are
    #       students or employees
    # TODO: issue with the (birth) months of the fip children: check whether these values can be recovered
    #       (Alexis: clearly not)

    # Reassigning noi for fip children if they are more than one per foyer fiscal
    # Reference R code:
    # while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
    #   dup <- duplicated( fip[, c("noi","ident")])
    #   tmp <- fip[dup,"noi"]
    #   fip[dup, "noi"] <- (tmp-1)
    # }
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    # The concat above keeps the row labels of fip1 and fip2, so labels may repeat: the mask and the new values
    # are applied positionally (plain arrays) instead of relying on label alignment.
    dup = fip.duplicated(subset = ['noi', 'ident']).values
    while dup.any():
        log.info("{}".format(int(dup.sum())))
        fip.loc[dup, 'noi'] = fip.loc[dup, 'noi'].values - 1
        dup = fip.duplicated(subset = ['noi', 'ident']).values

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
Exemple #49
0
def build_homogeneisation_caracteristiques_sociales(temporary_store=None,
                                                    year=None):
    u"""Homogénéisation des caractéristiques sociales des ménages """

    assert temporary_store is not None
    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection='budget_des_familles',
        config_files_directory=config_files_directory)
    survey = bdf_survey_collection.get_survey(
        'budget_des_familles_{}'.format(year))
    # ******************************************************************************************************************
    # * Etape n° 0-3 : HOMOGENEISATION DES CARACTERISTIQUES SOCIALES DES MENAGES
    # ******************************************************************************************************************
    # ******************************************************************************************************************

    if year == 1995:
        kept_variables = [
            'exdep', 'exrev', 'mena', 'v', 'ponderrd', 'nbpers', 'nbenf',
            'typmen1', 'cohabpr', 'sexepr', 'agepr', 'agecj', 'matripr',
            'occuppr', 'occupcj', 'nbact', 'sitlog', 'stalog', 'mena', 'nm14a',
            'typmen1'
        ]
        menage = survey.get_values(
            table="socioscm",
            variables=kept_variables,
        )
        # cette étape permet de ne garder que les données dont on est sûr de la qualité et de la véracité
        # exdep = 1 si les données sont bien remplies pour les dépenses du ménage
        # exrev = 1 si les données sont bien remplies pour les revenus du ménage
        menage = menage[(menage.exdep == 1) & (menage.exrev == 1)]
        menage.rename(
            columns={
                'v': 'vag',
                'mena': 'ident_men',
                'ponderrd': 'pondmen',
                'nbpers': 'npers',
                'nm14a': 'nenfants',
                'nbenf': 'nenfhors',
                'nbact': 'nactifs',
                'cohabpr': 'couplepr',
                'matripr': 'etamatri',
                'typmen1': 'typmen'
            },
            inplace=True,
        )
        # la variable vag est utilisée dans les modèles QAIDS et AIDS comme variable temporelle afin d'attibuer
        # le bon prix mensuel
        menage.agecj = menage.agecj.fillna(0)
        menage.nenfhors = menage.nenfhors.fillna(0)
        menage.vag = menage.vag.astype('int')

        menage['nadultes'] = menage['npers'] - menage['nenfants']
        menage['ocde10'] = 1 + 0.5 * numpy.maximum(
            0, menage['nadultes'] - 1) + 0.3 * menage['nenfants']

        # harmonisation des types de ménage sur la nomenclature 2010
        menage['typmen_'] = menage['typmen']
        menage.typmen[menage.typmen_ == 1] = 1
        menage.typmen[menage.typmen_ == 2] = 3
        menage.typmen[menage.typmen_ == 3] = 4
        menage.typmen[menage.typmen_ == 4] = 4
        menage.typmen[menage.typmen_ == 5] = 4
        menage.typmen[menage.typmen_ == 6] = 2
        menage.typmen[menage.typmen_ == 7] = 5
        del menage['typmen_']

        var_to_ints = ['couplepr', 'etamatri']
        for var_to_int in var_to_ints:
            menage[var_to_int] = menage[var_to_int].astype(int)

        #  Methode :
        #  1. on nettoite les variables (i.e. changement de nom de format)
        #  2. Reformatage des variables (réattribution des catégories pour quelles soient identiques
        #     pour les différentes années)

        menage["situacj"] = 0
        menage.situacj[menage.occupcj == 1] = 1
        menage.situacj[menage.occupcj == 3] = 3
        menage.situacj[menage.occupcj == 2] = 4
        menage.situacj[menage.occupcj == 5] = 5
        menage.situacj[menage.occupcj == 6] = 5
        menage.situacj[menage.occupcj == 7] = 6
        menage.situacj[menage.occupcj == 8] = 7
        menage.situacj[menage.occupcj == 4] = 8

        menage["situapr"] = 0
        menage.situapr[menage.occuppr == 1] = 1
        menage.situapr[menage.occuppr == 3] = 3
        menage.situapr[menage.occuppr == 2] = 4
        menage.situapr[menage.occuppr == 5] = 5
        menage.situapr[menage.occuppr == 6] = 5
        menage.situapr[menage.occuppr == 7] = 6
        menage.situapr[menage.occuppr == 8] = 7
        menage.situapr[menage.occuppr == 4] = 8

        menage["typlog"] = 0
        menage.typlog[menage.sitlog == 1] = 1
        menage.typlog[menage.sitlog != 1] = 2

        menage['stalog'] = menage['stalog'].astype(int)

        individus = survey.get_values(table="individu", )
        variables = ['mena', 'v']
        individus.rename(
            columns={'mena': 'identmen'},
            inplace=True,
        )
        menage.set_index('ident_men', inplace=True)

    if year == 2000:
        menage = survey.get_values(
            table="menage",
            variables=[
                'ident', 'pondmen', 'nbact', 'nbenf1', 'nbpers', 'ocde10',
                'sitlog', 'stalog', 'strate', 'typmen1', 'zeat', 'stalog',
                'vag', 'sexepr', 'sexecj', 'agecj', 'napr', 'nacj', 'cs2pr',
                'cs2cj', 'diegpr', 'dieppr', 'diespr', 'diegcj', 'diepcj',
                'diescj', 'hod_nb', 'cohabpr', 'occupapr', 'occupacj',
                'occupbpr', 'occupbcj', 'occupcpr', 'occupccj', 'typmen1'
            ])
        menage.rename(
            columns={
                'cohabpr': 'couplepr',
                'hod_nb': 'nenfhors',
                'ident': 'ident_men',
                'nbact': 'nactifs',
                'nbenf1': 'nenfants',
                'nbpers': 'npers',
                'rev81': 'poste_coicop_421',
                'typmen1': 'typmen'
            },
            inplace=True,
        )
        menage.ocde10 = menage.ocde10 / 10
        # on met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles
        # pour le modèle de demande
        menage.agecj = menage.agecj.fillna(0)

        assert menage.notnull().all().all(
        ), 'The following variables contains NaN values: {}'.format(
            list(menage.isnull().any()[menage.isnull().any()].index))

        menage['vag_'] = menage['vag']
        menage.vag.loc[menage.vag_ == 1] = 9
        menage.vag.loc[menage.vag_ == 2] = 10
        menage.vag.loc[menage.vag_ == 3] = 11
        menage.vag.loc[menage.vag_ == 4] = 12
        menage.vag.loc[menage.vag_ == 5] = 13
        menage.vag.loc[menage.vag_ == 6] = 14
        menage.vag.loc[menage.vag_ == 7] = 15
        menage.vag.loc[menage.vag_ == 8] = 16
        del menage['vag_']
        # harmonisation des types de ménage sur la nomenclature 2010
        menage['typmen_'] = menage['typmen']
        menage.typmen.loc[menage.typmen_ == 1] = 1
        menage.typmen.loc[menage.typmen_ == 2] = 3
        menage.typmen.loc[menage.typmen_ == 3] = 4
        menage.typmen.loc[menage.typmen_ == 4] = 4
        menage.typmen.loc[menage.typmen_ == 5] = 4
        menage.typmen.loc[menage.typmen_ == 6] = 2
        menage.typmen.loc[menage.typmen_ == 7] = 5
        del menage['typmen_']

        menage.couplepr = menage.couplepr.astype('int')
        menage["nadultes"] = menage['npers'] - menage['nenfants']

        menage.typmen = menage.typmen.astype('int')

        # occupa : 1 si la personne travaille, 2 sinon. occupb : 1 si elle travaille effectivement, 2 si congé de
        # longue durée (négligé ici). occupc : de 2 à 8 selon le statut si ne travaille pas (étudiant, retraité, etc.)
        menage["situacj"] = 0
        menage.situacj.loc[menage.occupacj == 1] = 1
        menage.situacj.loc[menage.occupccj == 3] = 3
        menage.situacj.loc[menage.occupccj == 2] = 4
        menage.situacj.loc[menage.occupccj == 5] = 5
        menage.situacj.loc[menage.occupccj == 6] = 5
        menage.situacj.loc[menage.occupccj == 7] = 6
        menage.situacj.loc[menage.occupccj == 8] = 7
        menage.situacj.loc[menage.occupccj == 4] = 8

        menage["situapr"] = 0
        menage.situapr.loc[menage.occupapr == 1] = 1
        menage.situapr.loc[menage.occupcpr == 3] = 3
        menage.situapr.loc[menage.occupcpr == 2] = 4
        menage.situapr.loc[menage.occupcpr == 5] = 5
        menage.situapr.loc[menage.occupcpr == 6] = 5
        menage.situapr.loc[menage.occupcpr == 7] = 6
        menage.situapr.loc[menage.occupcpr == 8] = 7
        menage.situapr.loc[menage.occupcpr == 4] = 8

        menage["natiocj"] = 0
        menage["natiopr"] = 0
        menage.natiocj.loc[menage.nacj == 1] = 1
        menage.natiocj.loc[menage.nacj == 2] = 1
        menage.natiocj.loc[menage.nacj == 3] = 2
        menage.natiopr.loc[menage.napr == 1] = 1
        menage.natiopr.loc[menage.napr == 2] = 1
        menage.natiopr.loc[menage.napr == 3] = 2

        menage["typlog"] = 0
        menage.typlog.loc[menage.sitlog == 1] = 1
        menage.typlog.loc[menage.sitlog != 1] = 2

        # Homogénéisation des diplômes, choix d'équivalence entre les diplômes
        menage["dip14pr"] = 999999
        menage.dip14pr.loc[menage.diegpr == 0] = 71
        menage.dip14pr.loc[menage.diegpr == 2] = 70
        menage.dip14pr.loc[menage.diegpr == 15] = 60
        menage.dip14pr.loc[menage.diegpr == 18] = 60
        menage.dip14pr.loc[menage.diegpr == 16] = 41
        menage.dip14pr.loc[menage.diegpr == 17] = 41
        menage.dip14pr.loc[menage.diegpr == 19] = 41

        menage.dip14pr.loc[menage.dieppr == 23] = 50
        menage.dip14pr.loc[menage.dieppr == 25] = 50
        menage.dip14pr.loc[menage.dieppr == 27] = 50
        menage.dip14pr.loc[menage.dieppr == 29] = 50
        menage.dip14pr.loc[menage.dieppr == 34] = 43
        menage.dip14pr.loc[menage.dieppr == 32] = 42
        menage.dip14pr.loc[menage.dieppr == 36] = 42

        menage.dip14pr.loc[menage.diespr == 41] = 30
        menage.dip14pr.loc[menage.diespr == 42] = 31
        menage.dip14pr.loc[menage.diespr == 43] = 31
        menage.dip14pr.loc[menage.diespr == 44] = 33
        menage.dip14pr.loc[menage.diespr == 46] = 20
        menage.dip14pr.loc[menage.diespr == 48] = 12
        menage.dip14pr.loc[menage.diespr == 47] = 10

        menage.set_index('ident_men', inplace=True)

        # Recodage des catégories zeat
        menage.zeat.loc[menage.zeat == 7] = 6
        menage.zeat.loc[menage.zeat == 8] = 7
        menage.zeat.loc[menage.zeat == 9] = 8

        assert menage.zeat.isin(range(1, 9)).all()

        individus = survey.get_values(
            table="individus", variables=['ident', 'matri', 'lien', 'anais'])

        individus = individus.loc[individus.lien == 1].copy()
        individus.rename(
            columns={
                'ident': 'ident_men',
                'matri': 'etamatri'
            },
            inplace=True,
        )
        variables_to_destring = ['anais']
        for variable_to_destring in variables_to_destring:
            individus[variable_to_destring] = individus[
                variable_to_destring].astype('int').copy()
        individus['agepr'] = year - individus.anais
        individus.set_index('ident_men', inplace=True)

        assert menage.notnull().all().all(
        ), 'The following variables contains NaN values: {}'.format(
            list(menage.isnull().any()[menage.isnull().any()].index))

        menage = menage.merge(individus, left_index=True, right_index=True)

    if year == 2005:
        menage = survey.get_values(table="menage")
        # données socio-démographiques
        socio_demo_variables = [
            'agpr', 'agcj', 'couplepr', 'decuc', 'ident_men', 'nactifs',
            'nenfants', 'nenfhors', 'npers', 'ocde10', 'pondmen', 'sexecj',
            'sexepr', 'typmen5', 'vag', 'zeat', 'cs24pr'
        ]
        socio_demo_variables += [
            column for column in menage.columns if column.startswith('dip14')
        ]
        socio_demo_variables += [
            column for column in menage.columns if column.startswith('natio7')
        ]
        # activité professionnelle
        activite_prof_variables = ['situacj', 'situapr']
        activite_prof_variables += [
            column for column in menage.columns if column.startswith('cs42')
        ]
        # logement
        logement_variables = ['htl', 'strate']
        menage = menage[socio_demo_variables + activite_prof_variables +
                        logement_variables]
        menage.rename(
            columns={
                # "agpr": "agepr",
                "agcj": "agecj",
                "typmen5": "typmen",
                "cs24pr": "cs_pr"
            },
            inplace=True,
        )
        del menage['agpr']
        menage['nadultes'] = menage.npers - menage.nenfants
        for person in ['pr', 'cj']:
            menage['natio' + person] = (menage['natio7' + person] > 2
                                        )  # TODO: changer de convention ?
            del menage['natio7' + person]

        menage.agecj = menage.agecj.fillna(0)
        menage.nenfhors = menage.nenfhors.fillna(0)
        var_to_ints = [
            'ocde10', 'decuc', 'nactifs', 'nenfants', 'npers', 'pondmen',
            'nadultes'
        ]
        assert menage.notnull().all().all(
        ), 'The following variables contains NaN values: {}'.format(
            list(menage.isnull().any()[menage.isnull().any()].index))

        menage.couplepr = menage.couplepr > 2  # TODO: changer de convention ?
        menage.ocde10 = menage.ocde10 / 10
        menage.set_index('ident_men', inplace=True)
        # on met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles
        # pour le modèle de demande
        menage['vag_'] = menage['vag']
        menage.vag.loc[menage.vag_ == 1] = 17
        menage.vag.loc[menage.vag_ == 2] = 18
        menage.vag.loc[menage.vag_ == 3] = 19
        menage.vag.loc[menage.vag_ == 4] = 20
        menage.vag.loc[menage.vag_ == 5] = 21
        menage.vag.loc[menage.vag_ == 6] = 22
        del menage['vag_']

        # Recodage des catégories zeat
        menage.zeat.loc[menage.zeat == 7] = 6
        menage.zeat.loc[menage.zeat == 8] = 7
        menage.zeat.loc[menage.zeat == 9] = 8

        assert menage.zeat.isin(range(1, 9)).all()

        stalog = survey.get_values(table="depmen",
                                   variables=['ident_men', 'stalog'])
        stalog['stalog'] = stalog.stalog.astype('int').copy()
        stalog['new_stalog'] = 0
        stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1
        stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2
        stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3
        stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4
        stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5
        stalog.stalog = stalog.new_stalog.copy()
        del stalog['new_stalog']

        assert stalog.stalog.isin(range(1, 6)).all()
        stalog.set_index('ident_men', inplace=True)
        menage = menage.merge(stalog, left_index=True, right_index=True)
        menage['typlog'] = 2
        menage.loc[menage.htl.isin(['1', '5']), 'typlog'] = 1
        assert menage.typlog.isin([1, 2]).all()
        del menage['htl']

        individus = survey.get_values(table='individu')
        # Il y a un problème sur l'année de naissance,
        # donc on le recalcule avec l'année de naissance et la vague d'enquête
        individus['agepr'] = year - individus.anais
        individus.loc[individus.vag == 6,
                      ['agepr']] = year + 1 - individus.anais
        individus = individus[individus.lienpref == 00].copy()
        kept_variables = ['ident_men', 'etamatri', 'agepr']
        individus = individus[kept_variables].copy()
        individus.etamatri.loc[individus.etamatri == 0] = 1
        individus['etamatri'] = individus['etamatri'].astype(
            'int')  # MBJ TODO: define as a catagory ?
        individus.set_index('ident_men', inplace=True)
        menage = menage.merge(individus, left_index=True, right_index=True)

        individus = survey.get_values(
            table='individu',
            variables=[
                'ident_men', 'ident_ind', 'age', 'anais', 'vag', 'lienpref'
            ],
        )
        # Il y a un problème sur l'année de naissance,
        # donc on le recalcule avec l'année de naissance et la vague d'enquête
        individus['age'] = year - individus.anais
        individus.loc[individus.vag == 6, ['age']] = year + 1 - individus.anais
        # Garder toutes les personnes du ménage qui ne sont pas la personne de référence et le conjoint
        individus = individus[(individus.lienpref != 00)
                              & (individus.lienpref != 01)].copy()
        individus.sort_values(by=['ident_men', 'ident_ind'], inplace=True)

        # Inspired by http://stackoverflow.com/questions/17228215/enumerate-each-row-for-each-group-in-a-dataframe
        def add_col_numero(data_frame):
            data_frame['numero'] = numpy.arange(len(data_frame)) + 3
            return data_frame

        individus = individus.groupby(by='ident_men').apply(add_col_numero)
        pivoted = individus.pivot(index='ident_men',
                                  columns="numero",
                                  values='age')
        pivoted.columns = [
            "age{}".format(column) for column in pivoted.columns
        ]
        menage = menage.merge(pivoted,
                              left_index=True,
                              right_index=True,
                              how='outer')

        individus = survey.get_values(
            table='individu',
            variables=['ident_men', 'ident_ind', 'agfinetu', 'lienpref'],
        )
        individus.set_index('ident_men', inplace=True)
        pr = individus.loc[individus.lienpref == 00, 'agfinetu'].copy()
        conjoint = individus.loc[individus.lienpref == 01, 'agfinetu'].copy()
        conjoint.name = 'agfinetu_cj'
        agfinetu_merged = pandas.concat([pr, conjoint], axis=1)
        menage = menage.merge(agfinetu_merged,
                              left_index=True,
                              right_index=True)
        temporary_store['donnes_socio_demog_{}'.format(year)] = menage

        # label var agepr "Age de la personne de référence au 31/12/${yearrawdata}"
        # label var agecj "Age du conjoint de la PR au 31/12/${yearrawdata}"
        # label var sexepr "Sexe de la personne de référence"
        # label var sexecj "Sexe du conjoint de la PR"
        # label var cs42pr "Catégorie socio-professionnelle de la PR"
        # label var cs42cj "Catégorie socio-professionnelle du conjoint de la PR"
        # label var ocde10 "Nombre d'unités de consommation (échelle OCDE)"
        # label var ident_men "Identifiant du ménage"
        # label var pondmen "Ponderation du ménage"
        # label var npers "Nombre total de personnes dans le ménage"
        # label var nadultes "Nombre d'adultes dans le ménage"
        # label var nenfants "Nombre d'enfants dans le ménage"
        # label var nenfhors "Nombre d'enfants vivant hors domicile"
        # label var nactifs  "Nombre d'actifs dans le ménage"
        # label var couplepr "Vie en couple de la personne de référence"
        # label define typmen5 1 "Personne seule" 2 "Famille monoparentale" 3 "Couple sans enfant"
        #                      4 "Couple avec enfants" 5 "Autre type de ménage (complexe)"
        # label values typmen5 typmen5
        # label var typmen5 "Type de ménage (5 modalités)"
        # label var etamatri "Situation matrimoniale de la personne de référence"
        # label define matripr 1 "Célibataire" 2 "Marié(e)" 3 "Veuf(ve)" 4 "Divorcé(e)"
        # label values etamatri matripr
        # label define occupation 1 "Occupe un emploi" ///
        # 2 "Apprenti" ///
        # 3 "Etudiant, élève, en formation"  ///
        # 4 "Chômeur (inscrit ou non à l'ANPE)" ///
        # 5 "Retraité, préretraité ou retiré des affaires" ///
        # 6 "Au foyer"  ///
        # 7 "Autre situation (handicapé)"  ///
        # 8 "Militaire du contingent"
        # label values situapr occupation
        # label values situacj occupation
        # label var situapr "Situation d'activité de la personne de référence"
        # label var situacj "Situation d'activité du conjoint de la PR"
        # label define diplome 10 "Diplôme de 3ème cycle universitaire, doctorat" ///
        # 12 "Diplôme d'ingénieur, grande école" ///
        # 20 "Diplôme de 2nd cycle universitaire" ///
        # 30 "Diplôme de 1er cycle universitaire" ///
        # 31 "BTS, DUT ou équivalent" ///
        # 33 "Diplôme des professions sociales et de la santé niveau Bac +2" ///
        # 41 "Baccalauréat général, brevet supérieur, capacité en droit" ///
        # 42 "Baccalauréat technologique" ///
        # 43 "Baccalauréat professionnel" ///
        # 44 "Brevet professionnel ou de technicien" ///
        # 50 "CAP, BEP ou diplôme de même niveau" ///
        # 60 "Brevet des collèges, BEPC" ///
        # 70 "Certificat d'études primaires" ///
        # 71 "Aucun diplôme"
        # label values dip14pr diplome
        # label values dip14cj diplome
        # label var dip14pr "Diplôme le plus élevé de la PR"
        # label var dip14cj "Diplôme le plus élevé du conjoint de la PR"
        # label define nationalite 1 "Français, par naissance ou naturalisation" 2 "Etranger"
        # label values natiopr nationalite
        # label values natiocj nationalite
        # label var natiopr "Nationalité de la personne de référence"
        # label var natiocj "Nationalité du conjoint de la PR"
        # label define logement 1 "Maison" 2 "Appartement"
        # label values typlog logement
        # label var typlog "Type de logement"
        # label define statutlogement 1 "Propriétaire ou copropriétaire" ///
        # 2 "Accédant à la propriété (rembourse un prêt)" ///
        # 3 "Locataire" ///
        # 4 "Sous-locataire" ///
        # 5 "Logé gratuitement"
        # label values stalog statutlogement
        # label var stalog "Statut d'occupation du logement"
        # label define viecouple 1 "Vit en couple" 2 "Ne vit pas en couple"
        # label values couplepr viecouple
        #
        # /* Recodage des CSP en 12 et 8 postes à partir de classification de l'INSEE (2003, PCS niveaux 1 et 2) */
        # gen cs24pr=00
        # replace cs24pr=10 if cs42pr=="11"
        # replace cs24pr=10 if cs42pr=="12"
        # replace cs24pr=10 if cs42pr=="13"
        # replace cs24pr=21 if cs42pr=="21"
        # replace cs24pr=22 if cs42pr=="22"
        # replace cs24pr=23 if cs42pr=="23"
        # replace cs24pr=31 if cs42pr=="31"
        # replace cs24pr=32 if cs42pr=="33"
        # replace cs24pr=32 if cs42pr=="34"
        # replace cs24pr=32 if cs42pr=="35"
        # replace cs24pr=36 if cs42pr=="37"
        # replace cs24pr=36 if cs42pr=="38"
        # replace cs24pr=41 if cs42pr=="42"
        # replace cs24pr=41 if cs42pr=="43"
        # replace cs24pr=41 if cs42pr=="44"
        # replace cs24pr=41 if cs42pr=="45"
        # replace cs24pr=46 if cs42pr=="46"
        # replace cs24pr=47 if cs42pr=="47"
        # replace cs24pr=48 if cs42pr=="48"
        # replace cs24pr=51 if cs42pr=="52"
        # replace cs24pr=51 if cs42pr=="53"
        # replace cs24pr=54 if cs42pr=="54"
        # replace cs24pr=55 if cs42pr=="55"
        # replace cs24pr=56 if cs42pr=="56"
        # replace cs24pr=61 if cs42pr=="62"
        # replace cs24pr=61 if cs42pr=="63"
        # replace cs24pr=61 if cs42pr=="64"
        # replace cs24pr=61 if cs42pr=="65"
        # replace cs24pr=66 if cs42pr=="67"
        # replace cs24pr=66 if cs42pr=="68"
        # replace cs24pr=69 if cs42pr=="69"
        # replace cs24pr=71 if cs42pr=="71"
        # replace cs24pr=72 if cs42pr=="72"
        # replace cs24pr=73 if cs42pr=="74"
        # replace cs24pr=73 if cs42pr=="75"
        # replace cs24pr=76 if cs42pr=="77"
        # replace cs24pr=76 if cs42pr=="78"
        # replace cs24pr=81 if cs42pr=="81"
        # replace cs24pr=82 if cs42pr=="83"
        # replace cs24pr=82 if cs42pr=="84"
        # replace cs24pr=82 if cs42pr=="85"
        # replace cs24pr=82 if cs42pr=="86"
        # replace cs24pr=82 if cs42pr=="**"
        # replace cs24pr=82 if cs42pr=="00"
        #

        menage['cs24pr'] = 0
        csp42s_by_csp24 = {
            10: ["11", "12", "13"],
            21: ["21"],
            22: ["22"],
            23: ["23"],
            31: ["31"],
            32: ["32", "33", "34", "35"],
            36: ["37", "38"],
            41: ["42", "43", "44", "45"],
            46: ["46"],
            47: ["47"],
            48: ["48"],
            51: ["52", "53"],
            54: ["54"],
            55: ["55"],
            56: ["56"],
            61: ["62", "63", "64", "65"],
            66: ["67", "68"],
            69: ["69"],
            71: ["71"],
            72: ["72"],
            73: ["74", "75"],
            76: ["77", "78"],
            81: ["81"],
            82: ["83", "84", "85", "86", "**", "00"],
        }
        for csp24, csp42s in csp42s_by_csp24.items():
            menage.loc[menage.cs42pr.isin(csp42s), 'cs24pr'] = csp24
        assert menage.cs24pr.isin(csp42s_by_csp24.keys()).all()

        menage['cs8pr'] = numpy.floor(menage.cs24pr / 10)
        assert menage.cs8pr.isin(range(1, 9)).all()

        variables = [
            'pondmen', 'npers', 'nenfants', 'nenfhors', 'nadultes', 'nactifs',
            'ocde10', 'typmen', 'sexepr', 'agepr', 'etamatri', 'couplepr',
            'situapr', 'dip14pr', 'cs42pr', 'cs24pr', 'cs8pr', 'natiopr',
            'sexecj', 'agecj', 'situacj', 'dip14cj', 'cs42cj', 'natiocj',
            'typlog', 'stalog'
        ] + ["age{}".format(age) for age in range(3, 14)]

        for variable in variables:
            assert variable in menage.columns, "{} is not a column of menage data frame".format(
                variable)

    if year == 2011:
        variables = [
            'agecj',
            'agepr',
            'coeffuc',
            'decuc1',
            'ident_me',
            'pondmen',
            'npers',
            'nenfants',
            'nactifs',
            'sexepr',
            'sexecj',
            'dip14cj',
            'dip14pr',
            'typmen5',
            'cataeu',
            'situapr',
            'situacj',
            'zeat',
        ]

        try:
            menage = survey.get_values(table="MENAGE", variables=variables)
        except:
            menage = survey.get_values(table="menage", variables=variables)

        menage.rename(
            columns={
                'ident_me': 'ident_men',
                'coeffuc': 'ocde10',
                'typmen5': 'typmen',
                'decuc1': 'decuc',
                'cataeu': 'strate'
            },
            inplace=True,
        )
        del variables
        menage.agecj = menage.agecj.fillna(0)
        # Ajout de la variable vag
        try:
            depmen = survey.get_values(table="DEPMEN")
        except:
            depmen = survey.get_values(table="depmen")
        depmen.rename(columns={'ident_me': 'ident_men'}, inplace=True)
        vague = depmen[['vag', 'ident_men']].copy()
        stalog = depmen[['stalog', 'ident_men']].copy()
        del depmen

        menage.set_index('ident_men', inplace=True)
        vague.set_index('ident_men', inplace=True)
        menage = menage.merge(vague, left_index=True, right_index=True)
        # On met un numéro à chaque vague pour pouvoir faire un meilleur suivi des évolutions temporelles pour
        # le modèle de demande
        menage['vag_'] = menage['vag'].copy()
        menage.vag.loc[menage.vag_ == 1] = 23
        menage.vag.loc[menage.vag_ == 2] = 24
        menage.vag.loc[menage.vag_ == 3] = 25
        menage.vag.loc[menage.vag_ == 4] = 26
        menage.vag.loc[menage.vag_ == 5] = 27
        menage.vag.loc[menage.vag_ == 6] = 28
        del menage['vag_']

        # Homogénéisation de la variable statut du logement qui prend des valeurs différentes pour 2011
        stalog['stalog'] = stalog.stalog.astype('int').copy()
        stalog['new_stalog'] = 0
        stalog.loc[stalog.stalog == 2, 'new_stalog'] = 1
        stalog.loc[stalog.stalog == 1, 'new_stalog'] = 2
        stalog.loc[stalog.stalog == 4, 'new_stalog'] = 3
        stalog.loc[stalog.stalog == 5, 'new_stalog'] = 4
        stalog.loc[stalog.stalog.isin([3, 6]), 'new_stalog'] = 5
        stalog.stalog = stalog.new_stalog.copy()
        del stalog['new_stalog']
        assert stalog.stalog.isin(range(1, 6)).all()
        stalog.set_index('ident_men', inplace=True)
        menage = menage.merge(stalog, left_index=True, right_index=True)

        # Recodage des catégories zeat
        menage.loc[menage.zeat == 7, 'zeat'] = 6
        menage.zeat.loc[menage.zeat == 8] = 7
        menage.zeat.loc[menage.zeat == 9] = 8
        assert menage.zeat.isin(range(0, 9)).all()
        menage.index.name = 'ident_men'

    #
    assert menage.index.name == 'ident_men'
    menage['role_menage'] = 0
    temporary_store['donnes_socio_demog_{}'.format(year)] = menage
def create_comparable_logement_data_frame(temporary_store = None, year = None):
    """Build the housing-survey table with recoded, comparable variables.

    The household table of the ``logement`` survey collection is loaded,
    enriched with a weighted income decile (``deci``), merged with the
    dwelling table (2006 survey only) and with the address table on ``ident``,
    and restricted to households whose ``sec1`` is 21, 22, 23, 24 or 30.
    Several categorical variables are then collapsed into coarser codes.

    Parameters
    ----------
    temporary_store :
        Must not be None; it is not otherwise used by this function.
    year : int
        2003 selects the ``logement_2003`` survey; 2006 to 2009 select
        ``logement_2006``.

    Returns
    -------
    A copy of the merged data frame restricted to the columns ``deci``,
    ``hnph2``, ``iaat_bis``, ``lmlm``, ``statut_occupation``, ``magtr``,
    ``mcs8``, ``mdiplo``, ``mtybd``, ``qex`` and ``tu99_recoded``.

    Raises
    ------
    ValueError
        If ``year`` is neither 2003 nor within 2006-2009.
    AssertionError
        If one of the sanity checks on the recoded variables fails.
    """
    assert temporary_store is not None
    assert year is not None

    # Resolve the survey vintage first so that an unsupported year fails with
    # a clear message instead of an unbound local variable later on.
    if year == 2003:
        year_lgt = 2003
    elif 2005 < year < 2010:
        year_lgt = 2006
    else:
        raise ValueError(
            "No housing survey is mapped to year {}: expected 2003 or 2006 to 2009".format(year)
            )

    logement_adresse_variables = ["gzc2"]
    logement_menage_variables = [
        "maa1at",
        "magtr",
        "mcs8",
        "mdiplo",
        "mrcho",
        "mrret",
        "mrsal",
        "mrtns",
        "mtybd",
        "muc1",
        "qex",
        "sec1",
        ]

    if year_lgt == 2003:
        logement_menage_variables.extend(["hnph2", "ident", "lmlm", "mnatior", "typse"])
        logement_adresse_variables.extend(["iaat", "ident", "tu99"])
    else:
        # No typse in the 2006 survey; the dwelling variables come from a
        # separate table.
        logement_menage_variables.extend(["idlog", "mnatio"])
        logement_adresse_variables.extend(["idlog"])
        logement_logement_variables = ["hnph2", "iaat", "idlog", "lmlm", "tu99"]

    logement_survey_collection = SurveyCollection.load(
        collection = 'logement',
        config_files_directory = config_files_directory,
        )
    logement_survey = logement_survey_collection.get_survey('logement_{}'.format(year_lgt))

    # Household table
    log.info("Preparing logement menage table")

    # The table name differs between stores: try one, fall back on the other.
    try:
        logement_menage = logement_survey.get_values(
            table = "lgt_menage", variables = logement_menage_variables)
    except Exception:
        logement_menage = logement_survey.get_values(
            table = "menage1", variables = logement_menage_variables)

    logement_menage.rename(columns = {'idlog': 'ident'}, inplace = True)

    # Missing income components count as zero in the total.
    for income_column in ['mrcho', 'mrret', 'mrsal', 'mrtns']:
        logement_menage[income_column] = logement_menage[income_column].fillna(0)
    # TODO: drop negative incomes? mrtns: 118 negative incomes out of 42845 in 2006
    logement_menage['revtot'] = (
        logement_menage['mrcho'] +
        logement_menage['mrret'] +
        logement_menage['mrsal'] +
        logement_menage['mrtns']
        )
    assert logement_menage.revtot.notnull().all()
    logement_menage['nvpr'] = 10.0 * logement_menage['revtot'] / logement_menage['muc1']

    assert logement_menage.qex.notnull().all()
    assert (logement_menage.qex > 0).all()

    dec, values = mark_weighted_percentiles(
        logement_menage['nvpr'].values,
        numpy.arange(1, 11),
        logement_menage['qex'].values,
        2,
        return_quantiles = True,
        )
    values.sort()
    # The decile is one plus the number of thresholds values[1..9] that nvpr
    # strictly exceeds.
    deci = 1
    for threshold_index in range(1, 10):
        deci = deci + (logement_menage.nvpr > values[threshold_index])
    logement_menage['deci'] = deci

    del dec, values
    assert logement_menage['deci'].isin(range(1, 11)).all(), "Logement decile are out of range'"
    gc.collect()

    if year_lgt == 2006:
        log.info('Preparing logement logement table')
        try:
            lgtlgt = logement_survey.get_values(
                table = "lgt_logt", variables = logement_logement_variables)
        except Exception:
            lgtlgt = logement_survey.get_values(
                table = "logement", variables = logement_logement_variables)

        lgtlgt.rename(columns = {'idlog': 'ident'}, inplace = True)
        logement_menage = logement_menage.merge(lgtlgt, left_on = 'ident', right_on = 'ident', how = 'inner')
        del lgtlgt

    data = logement_menage[logement_menage['sec1'].isin([21, 22, 23, 24, 30])]
    del logement_menage
    gc.collect()

    if year_lgt == 2006:
        data = data.rename(columns = {'mnatio': 'mnatior'})

    data = data[data['mnatior'].notnull()]
    # Work on an explicit copy so that the column assignments below write to
    # this frame and not to a temporary slice.
    data = data[data['sec1'].notnull()].copy()
    data['statut_occupation'] = data['sec1'].astype("int")
    data.loc[data['sec1'].isin([21, 22, 23]), 'statut_occupation'] = 3
    data.loc[data['sec1'] == 24, 'statut_occupation'] = 4
    data.loc[data['sec1'] == 30, 'statut_occupation'] = 5
    count_NA('statut_occupation', data)
    data = data[data['statut_occupation'].notnull()]
    logement_menage = data

    # Address table
    log.info(u"Préparation de la table adresse de l'enquête logement")

    logement_adresse = logement_survey.get_values(table = "adresse", variables = logement_adresse_variables)
    logement_adresse.rename(columns = {'idlog': 'ident'}, inplace = True)

    log.info(u"Fusion des tables logement et ménage de l'enquête logement")
    merged = logement_menage.merge(logement_adresse, on = 'ident', how = 'inner')

    # hnph2 is clipped to the 1..6 range.
    merged.loc[merged.hnph2 >= 6, 'hnph2'] = 6
    merged.loc[merged.hnph2 < 1, 'hnph2'] = 1
    count_NA('hnph2', merged)
    assert merged.hnph2.notnull().all(), "Some hnph2 are null"

    # Same "within" step as in the R code, here and below.
    # TODO: the original codes 07 etc. are handled as plain integers (7)
    # because a leading zero denotes an octal literal in Python; this is
    # hopefully harmless.
    merged.loc[merged['mnatior'].isin([0, 1]), 'mnatior'] = 1
    merged.loc[merged['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), 'mnatior'] = 2
    count_NA('mnatior', merged)
    assert_variable_in_range('mnatior', [1, 3], merged)

    merged['iaat_bis'] = 0
    merged.loc[merged.iaat.isin([1, 2, 3, 4, 5]), 'iaat_bis'] = 1  # before 1967
    merged.loc[merged.iaat == 6, 'iaat_bis'] = 2  # 1968 - 1974
    merged.loc[merged.iaat == 7, 'iaat_bis'] = 3  # 1975 - 1981
    merged.loc[merged.iaat == 8, 'iaat_bis'] = 4  # 1982 - 1989
    merged.loc[merged.iaat == 9, 'iaat_bis'] = 5  # 1990 - 1998
    merged.loc[merged.iaat == 10, 'iaat_bis'] = 6  # after 1999
    assert merged.iaat_bis.isin(range(1, 7)).all()

    # mdiplo: 1 stays 1, missing values become 0.
    merged.loc[merged.mdiplo.isin([2, 3, 4]), 'mdiplo'] = 2
    merged.loc[merged.mdiplo.isin([5, 6, 7, 8]), 'mdiplo'] = 3
    merged.loc[merged.mdiplo == 9, 'mdiplo'] = 4
    merged.loc[merged.mdiplo.isnull(), 'mdiplo'] = 0
    # TODO: assert merged.mdiplo.isin(range(1, 5)).all()
    assert merged.mdiplo.isin(range(0, 5)).all()
    merged['mdiplo'] = merged.mdiplo.astype('int')

    merged.loc[merged['mtybd'] == 110, 'mtybd'] = 1
    merged.loc[merged['mtybd'] == 120, 'mtybd'] = 2
    merged.loc[merged['mtybd'] == 200, 'mtybd'] = 3
    merged.loc[merged['mtybd'].isin([311, 321, 401]), 'mtybd'] = 4
    merged.loc[merged['mtybd'].isin([312, 322, 402]), 'mtybd'] = 5
    merged.loc[merged['mtybd'].isin([313, 323, 403]), 'mtybd'] = 6
    merged.loc[merged['mtybd'] == 400, 'mtybd'] = 7
    assert merged.mtybd.isin(range(1, 8)).all()
    merged['mtybd'] = merged.mtybd.astype('int')

    # tu99 is recoded into a separate column, so every test below reads the
    # untouched original codes.
    merged['tu99_recoded'] = merged['tu99'].copy()
    count_NA('tu99', merged)
    merged.loc[merged['tu99'] == 0, 'tu99_recoded'] = 1
    merged.loc[merged['tu99'].isin([1, 2, 3]), 'tu99_recoded'] = 2
    merged.loc[merged['tu99'].isin([4, 5, 6]), 'tu99_recoded'] = 3
    merged.loc[merged['tu99'] == 7, 'tu99_recoded'] = 4
    merged.loc[merged['tu99'] == 8, 'tu99_recoded'] = 5
    count_NA('tu99_recoded', merged)
    assert_variable_in_range('tu99_recoded', [1, 6], merged)

    # gzc2: 1 stays 1.
    merged.loc[merged['gzc2'].isin([2, 3, 4, 5, 6]), 'gzc2'] = 2
    merged.loc[merged['gzc2'] == 7, 'gzc2'] = 3
    count_NA('gzc2', merged)
    # TODO: assert_variable_in_range('gzc2', [1, 4], merged)

    merged.loc[merged['magtr'].isin([1, 2]), 'magtr'] = 1
    merged.loc[merged['magtr'].isin([3, 4]), 'magtr'] = 2
    merged.loc[merged['magtr'] == 5, 'magtr'] = 3
    assert merged.magtr.isin(range(1, 4)).all()

    # mcs8: 1, 2 and 3 are unchanged.
    merged.loc[merged['mcs8'].isin([4, 8]), 'mcs8'] = 4
    merged.loc[merged['mcs8'].isin([5, 6, 7]), 'mcs8'] = 5
    assert merged.mcs8.isin(range(1, 6)).all()

    # NOTE(review): logloy is computed but is not among the returned columns.
    merged['logloy'] = numpy.log(merged['lmlm'].values)
    kept_variables = [
        'deci',
        'hnph2',
        'iaat_bis',
        'lmlm',
        'statut_occupation',
        'magtr',
        'mcs8',
        'mdiplo',
        'mtybd',
        'qex',
        'tu99_recoded',
        ]

    logement = merged[kept_variables].copy()
    return logement
# Example #51
# 0
def create_totals(year = None):
    """Assemble the individual-level "totals" tables for the ERFS survey.

    Starting from the ``indivim_{year}`` table of the ``erfs`` temporary store,
    the function assigns a tax-unit identifier (``idfoy``) and a role
    (``quifoy``) to every individual, attaches children to the declaration of
    one of their parents, appends the FIP individuals aged 19 to 24, recodes
    descriptive variables and merges the result with the ``famc_{year}`` table.

    Two tables are written back to the temporary store: ``tot2_{year}`` (the
    merge of individuals and families) and ``tot3_{year}`` (``tot2``
    de-duplicated and stripped of the columns listed in
    ``ind_vars_to_remove_{year}``). Nothing is returned.

    Parameters
    ----------
    year : int
        Survey year; used to build the table names and to compute ages.

    Raises
    ------
    AssertionError
        If one of the consistency checks on the intermediate tables fails.
    Exception
        If the identifiers of the individuals with imputed incomes are not a
        strict subset of the identifiers of all individuals.
    """
    assert year is not None
    temporary_store = TemporaryStore.create(file_name = "erfs")
    replace = create_replace(year)

    # Start from the ERFS individual table and rename its variables.
    log.info(u"Creating Totals")
    log.info(u"Etape 1 : Chargement des données")

    erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
    data = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    indivim = temporary_store['indivim_{}'.format(year)]

    assert not indivim.duplicated(['noindiv']).any(), "Présence de doublons"

    # Individuals with imputed income: some are in the ERF individual table
    # but not in the 'foyer' table, so a foyer has to be created for them.
    # They are detected as rows where any z...i variable differs from its
    # z...o counterpart.
    selection = None
    for var in ["zsali", "zchoi", "zrsti", "zalri", "zrtoi", "zragi", "zrici", "zrnci"]:
        varo = var[:-1] + "o"
        test = indivim[var] != indivim[varo]
        selection = test if selection is None else (test) | (selection)

    # The same renaming is applied to the imputed individuals, to all the
    # individuals and, further down, to the FIP table.
    renamed_columns = dict(
        ident = "idmen",
        persfip = "quifoy",
        zsali = "sali2",  # includes the non-taxable wages of insurance agents
        zchoi = "choi2",
        zrsti = "rsti2",
        zalri = "alr2",
        )

    indivi_i = indivim[selection].copy()
    indivi_i.rename(columns = renamed_columns, inplace = True)

    indivi_i["quifoy"] = where(indivi_i.quifoy.isnull(), "vous", indivi_i.quifoy)
    indivi_i["quelfic"] = "FIP_IMP"

    # Merge them back with the other individuals.
    indivim.rename(columns = renamed_columns, inplace = True)

    # NOTE(review): the comparison is a strict superset, so it also fails when
    # every individual has an imputed income - confirm this is intended.
    if not (set(list(indivim.noindiv)) > set(list(indivi_i.noindiv))):
        raise Exception(
            "Individuals with imputed income are not a strict subset of all individuals (noindiv)"
            )
    indivim.set_index("noindiv", inplace = True)
    indivi_i.set_index("noindiv", inplace = True)
    indivi = indivim
    del indivim
    indivi.update(indivi_i)

    indivi.reset_index(inplace = True)

    log.info("Etape 2 : isolation des FIP")
    fip_imp = indivi.quelfic == "FIP_IMP"
    indivi["idfoy"] = (
        indivi.idmen.astype("int64") * 100 +
        (indivi.declar1.str[0:2]).convert_objects(convert_numeric=True)
        )

    indivi.loc[fip_imp, "idfoy"] = np.nan
    # Some FIP (or at least individuals with imputed incomes) do have a tax
    # declaration number (why?).
    fip_has_declar = (fip_imp) & (indivi.declar1.notnull())

    indivi["idfoy"] = where(
        fip_has_declar,
        indivi.idmen * 100 + indivi.declar1.str[0:2].convert_objects(convert_numeric = True),
        indivi.idfoy)
    del fip_has_declar

    fip_no_declar = (fip_imp) & (indivi.declar1.isnull())
    del fip_imp
    indivi["idfoy"] = where(fip_no_declar, indivi["idmen"] * 100 + 50, indivi["idfoy"])

    indivi_fnd = indivi[["idfoy", "noindiv"]][fip_no_declar].copy()

    # Bump duplicated idfoy by one until they are all distinct.
    while True:
        duplicated_idfoy = indivi_fnd.duplicated(subset = ["idfoy"])
        if not duplicated_idfoy.any():
            break
        indivi_fnd["idfoy"] = where(
            duplicated_idfoy,
            indivi_fnd["idfoy"] + 1,
            indivi_fnd["idfoy"]
            )
    del duplicated_idfoy

    assert len(indivi[indivi.duplicated(['noindiv'])]) == 0, "Doublons"

    # The assignment is aligned on the index shared by indivi and indivi_fnd.
    indivi.loc[fip_no_declar, "idfoy"] = indivi_fnd.idfoy
    del indivi_fnd, fip_no_declar

    log.info(u"Etape 3 : Récupération des EE_NRT")

    nrt = indivi.quelfic == "EE_NRT"
    indivi["idfoy"] = where(nrt, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[nrt, "quifoy"] = "vous"
    del nrt

    pref_or_cref = indivi.lpr.isin([1, 2])
    adults = (indivi.quelfic.isin(["EE", "EE_CAF"])) & (pref_or_cref)
    indivi["idfoy"] = where(adults, indivi.idmen * 100 + indivi.noi, indivi.idfoy)
    indivi.loc[adults, "quifoy"] = "vous"
    del adults
    # TODO: hack to avoid assert error
    log.info("{}".format(indivi.loc[indivi['lpr'].isin([1, 2]), "idfoy"].notnull().value_counts()))
    assert indivi.idfoy[indivi.lpr.dropna().isin([1, 2])].all()

    log.info(u"Etape 4 : Rattachement des enfants aux déclarations")

    assert not(indivi.noindiv.duplicated().any()), "Some noindiv appear twice"
    lpr3_or_lpr4 = indivi['lpr'].isin([3, 4])
    enf_ee = (lpr3_or_lpr4) & (indivi.quelfic.isin(["EE", "EE_CAF"]))
    assert indivi.noindiv[enf_ee].notnull().all(), " Some noindiv are not set, which will ruin next stage"
    assert not(indivi.noindiv[enf_ee].duplicated().any()), "Some noindiv appear twice"

    # For every child, build the noindiv of the father and of the mother from
    # the household identifier and the parent's number.
    pere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noiper.loc[enf_ee]
        })
    mere = DataFrame({
        "noindiv_enf": indivi.noindiv.loc[enf_ee],
        "noindiv": 100 * indivi.idmen.loc[enf_ee] + indivi.noimer.loc[enf_ee]
        })

    foyer = data.get_values(variables = ["noindiv", "zimpof"], table = replace["foyer"])
    pere = pere.merge(foyer, how = "inner", on = "noindiv")
    mere = mere.merge(foyer, how = "inner", on = "noindiv")
    df = pere.merge(mere, how = "outer", on = "noindiv_enf", suffixes=('_p', '_m'))

    log.info(u"    4.1 : gestion des personnes dans 2 foyers")
    # groupby drops the groups having NA in their keys, so missing keys are
    # replaced by 0. fillna is used in its returning form: with inplace = True
    # it returns None, which would blank the key columns.
    for col in ["noindiv_p", "noindiv_m", "noindiv_enf"]:
        df[col] = df[col].fillna(0)
    df = df.groupby(by = ["noindiv_p", "noindiv_m", "noindiv_enf"]).sum()
    df.reset_index(inplace = True)

    # Choose the declaration the child is attached to: the only parent with a
    # zimpof value, or, when both have one, the father if his zimpof is
    # strictly larger and the mother otherwise.
    # NOTE(review): this relies on the group-by sum leaving a missing value
    # for a parent without zimpof - confirm with the pandas version in use.
    has_pere = df.zimpof_p.notnull()
    has_mere = df.zimpof_m.notnull()
    both = has_pere & has_mere
    df["which"] = ""
    df.loc[has_mere & ~has_pere, "which"] = "mere"
    df.loc[has_pere & ~has_mere, "which"] = "pere"
    df.loc[both & (df.zimpof_p > df.zimpof_m), "which"] = "pere"
    df.loc[both & (df.zimpof_m >= df.zimpof_p), "which"] = "mere"
    del has_pere, has_mere, both

    assert df.which.notnull().all(), "Some enf_ee individuals are not matched with any pere or mere"
    del lpr3_or_lpr4, pere, mere

    df.rename(columns = {"noindiv_enf": "noindiv"}, inplace = True)
    df['idfoy'] = where(df.which == "mere", df.noindiv_m, df.noindiv_p)

    assert df["idfoy"].notnull().all()

    dropped = [col for col in df.columns if col not in ["idfoy", "noindiv"]]
    df.drop(dropped, axis = 1, inplace = True)

    assert not(df.duplicated().any())

    df.set_index("noindiv", inplace = True, verify_integrity = True)
    indivi.set_index("noindiv", inplace = True, verify_integrity = True)

    # combine_first only fills the idfoy that are still missing in indivi;
    # the number of rows must not change.
    ind_notnull = indivi["idfoy"].notnull().sum()
    ind_isnull = indivi["idfoy"].isnull().sum()
    indivi = indivi.combine_first(df)
    assert ind_notnull + ind_isnull == (
        indivi["idfoy"].notnull().sum() +
        indivi["idfoy"].isnull().sum()
        )
    indivi.reset_index(inplace = True)
    assert not(indivi.duplicated().any())

    # MBJ: issue dealt with when moving from R code to python
    # TODO: the enfants_fip must be added and a household created for the
    # adult ones.
    # Following the ERF 2003 methodological guide, page 135:
    # - FIP spouses and FIP aged 25 and over are removed;
    # - FIP children aged 19 to 24 are kept;
    # - FIP aged 18 and under are removed, except the FIP born in 2002 in a
    #   household in its 6th interview, since they are children born after
    #   the EEC survey date who will not be found in the following EEC.
    log.info(u"    4.2 : On enlève les individus pour lesquels il manque le déclarant")
    fip = temporary_store['fipDat_{}'.format(year)]
    fip["declar"] = np.nan
    fip["agepf"] = np.nan

    fip.drop(["actrec", "year", "noidec"], axis = 1, inplace = True)
    fip.naia = fip.naia.astype("int32")
    fip.rename(columns = renamed_columns, inplace = True)

    fip_age = year - fip.naia - 1
    is_fip_19_25 = (fip_age >= 19) & (fip_age < 25)
    del fip_age

    # TODO: for the time being they are kept in the household of their "vous",
    # so the following R lines are not ported.
    # The idmen are of the form 60XXXX; 61XXXX, 62XXXX would be used for the
    # idmen of the kids over 18 and less than 25:
    # fip[is_fip_19_25 ,"idmen"] <- (99-fip[is_fip_19_25,"noi"]+1)*100000 + fip[is_fip_19_25,"idmen"]
    # fip[is_fip_19_25 ,"lpr"]  <- 1
    #
    # indivi <- rbind.fill(indivi,fip[is_fip_19_25,])

    indivi = concat([indivi, fip.loc[is_fip_19_25]])
    del is_fip_19_25
    indivi['age'] = year - indivi.naia - 1
    indivi['age_en_mois'] = 12 * indivi.age + 12 - indivi.naim

    # quimen defaults to 0, which is also the value for lpr == 1.
    indivi["quimen"] = 0
    indivi.loc[indivi.lpr == 2, "quimen"] = 1
    indivi.loc[indivi.lpr == 3, "quimen"] = 2
    indivi.loc[indivi.lpr == 4, "quimen"] = 3
    indivi['not_pr_cpr'] = None  # create a new column
    indivi.loc[indivi.lpr <= 2, 'not_pr_cpr'] = False
    indivi.loc[indivi.lpr > 2, 'not_pr_cpr'] = True

    assert indivi.not_pr_cpr.isin([True, False]).all()

    log.info(u"    4.3 : Creating non pr=0 and cpr=1 idmen's")
    indivi.reset_index(inplace = True)

    # Give a distinct quimen (2, 3, ...) to every member of a household who
    # is neither the reference person nor the spouse.
    test1 = indivi[['quimen', 'idmen']][indivi.not_pr_cpr].copy()
    test1['quimen'] = 2

    j = 2
    while any(test1.duplicated(['quimen', 'idmen'])):
        test1.loc[test1.duplicated(['quimen', 'idmen']), 'quimen'] = j + 1
        j += 1
    print_id(indivi)
    indivi.update(test1)

    print_id(indivi)

    # indivi.set_index(['quimen']) #TODO: check relevance
    # TODO: some idfoy have no "vous"
    log.info(u"Etape 5 : Gestion des idfoy qui n'ont pas de vous")
    all_ind = indivi.drop_duplicates('idfoy')
    with_ = indivi.loc[indivi.quifoy == 'vous', 'idfoy']
    without = all_ind[~(all_ind.idfoy.isin(with_.values))]

    log.info(u"On cherche si le déclarant donné par la deuxième déclaration est bien un vous")

    # TODO: the following should be dealt with at the import of the tables
    indivi.replace(
        to_replace = {
            'declar2': {'NA': np.nan, '': np.nan}
            },
        inplace = True
        )

    has_declar2 = (indivi.idfoy.isin(without.idfoy.values)) & (indivi.declar2.notnull())

    # idfoy built from the second declaration; it is kept only when it
    # designates a foyer that does have a "vous".
    decl2_idfoy = (
        indivi.loc[has_declar2, "idmen"].astype('int') * 100 +
        indivi.loc[has_declar2, "declar2"].str[0:2].astype('int')
        )
    indivi.loc[has_declar2, 'idfoy'] = where(decl2_idfoy.isin(with_.values), decl2_idfoy, None)
    del all_ind, with_, without, has_declar2

    log.info(u"    5.1 : Elimination idfoy restant")
    idfoyList = indivi.loc[indivi.quifoy == "vous", 'idfoy'].drop_duplicates()
    indivi = indivi[indivi.idfoy.isin(idfoyList.values)]
    del idfoyList
    print_id(indivi)

    # Variables kept for the following steps.
    myvars = [
        "actrec",
        "age",
        "age_en_mois",
        "chpub",
        "encadr",
        "idfoy",
        "idmen",
        "nbsala",
        "noi",
        "noindiv",
        "prosa",
        "quelfic",
        "quifoy",
        "quimen",
        "statut",
        "titc",
        "txtppb",
        "wprm",
        "rc1rev",
        "maahe",
        ]

    assert len(set(myvars).difference(set(indivi.columns))) == 0, \
        "Manquent les colonnes suivantes : {}".format(set(myvars).difference(set(indivi.columns)))

    indivi = indivi[myvars].copy()
    # TODO: the actrec of the FIP are not coded (to be done at the end, once
    # the information coming from the declarations has been gathered).
    log.info(u"Etape 6 : Création des variables descriptives")
    log.info(u"    6.1 : variable activité")
    indivi['activite'] = None
    indivi.loc[indivi.actrec <= 3, 'activite'] = 0
    indivi.loc[indivi.actrec == 4, 'activite'] = 1
    indivi.loc[indivi.actrec == 5, 'activite'] = 2
    indivi.loc[indivi.actrec == 7, 'activite'] = 3
    indivi.loc[indivi.actrec == 8, 'activite'] = 4
    indivi.loc[indivi.age <= 13, 'activite'] = 2  # these are in fact the actrec == 9
    log.info("{}".format(indivi['activite'].value_counts(dropna = False)))
    # TODO: MBJ problem with the actrec
    # TODO: FIX AND REMOVE
    indivi.loc[indivi.actrec.isnull(), 'activite'] = 5
    indivi.loc[indivi.titc.isnull(), 'titc'] = 0
    # There are 420 NaN for the variables statut, titc, etc.
    assert indivi.titc.notnull().all(), u"Problème avec les titc"

    log.info(u"    6.2 : variable statut")
    indivi.loc[indivi.statut.isnull(), 'statut'] = 0
    indivi['statut'] = indivi.statut.astype('int')
    # The recoding is applied sequentially, in this order: 45 -> 11 comes
    # after 11 -> 1, so the new 11 are not recoded again.
    statut_recoding = [
        (11, 1), (12, 2), (13, 3),
        (21, 4), (22, 5),
        (33, 6), (34, 7), (35, 8),
        (43, 9), (44, 10), (45, 11),
        ]
    for old_statut, new_statut in statut_recoding:
        indivi.loc[indivi.statut == old_statut, 'statut'] = new_statut
    assert indivi.statut.isin(range(12)).all(), u"statut value over range"

    log.info(u"    6.3 : variable txtppb")
    indivi['txtppb'] = indivi.txtppb.fillna(0)
    assert indivi.txtppb.notnull().all()

    indivi['nbsala'] = indivi.nbsala.fillna(0)
    indivi['nbsala'] = indivi.nbsala.astype('int')
    indivi.loc[indivi.nbsala == 99, 'nbsala'] = 10
    assert indivi.nbsala.isin(range(11)).all()

    log.info(u"    6.4 : variable chpub et CSP")
    indivi['chpub'] = indivi.chpub.fillna(0)
    indivi['chpub'] = indivi.chpub.astype('int')
    assert indivi.chpub.isin(range(11)).all()

    indivi['cadre'] = 0
    indivi['prosa'] = indivi.prosa.fillna(0)
    assert indivi['prosa'].notnull().all()
    log.info("{}".format(indivi['encadr'].value_counts(dropna = False)))

    # encadr: 1 = yes, 2 = no; missing values and 0 are treated as "no".
    indivi['encadr'] = indivi.encadr.fillna(2)
    indivi.loc[indivi.encadr == 0, 'encadr'] = 2

    assert indivi.encadr.notnull().all()
    assert indivi.encadr.isin([1, 2]).all()

    indivi.loc[indivi.prosa.isin([7, 8]), 'cadre'] = 1
    indivi.loc[(indivi.prosa == 9) & (indivi.encadr == 1), 'cadre'] = 1

    assert indivi['cadre'].isin(range(2)).all()

    log.info(
        u"Etape 7: on vérifie qu'il ne manque pas d'info sur les liens avec la personne de référence")
    log.info(
        u"nb de doublons idfam/quifam {}".format(len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])))

    log.info(u"On crée les n° de personnes à charge")
    assert indivi['idfoy'].notnull().all()
    print_id(indivi)
    # Numeric role in the foyer: 0 for "vous", 1 for "conj", 2 for everybody
    # else ("pac" included).
    indivi['quifoy2'] = 2
    indivi.loc[indivi.quifoy == 'vous', 'quifoy2'] = 0
    indivi.loc[indivi.quifoy == 'conj', 'quifoy2'] = 1

    del indivi['quifoy']
    indivi['quifoy'] = indivi.quifoy2
    del indivi['quifoy2']

    print_id(indivi)
    test2 = indivi[['quifoy', 'idfoy', 'noindiv']][indivi['quifoy'] == 2].copy()
    print_id(test2)

    # Give a distinct quifoy (2, 3, ...) to every dependent of a foyer.
    j = 2
    while test2.duplicated(['quifoy', 'idfoy']).any():
        test2.loc[test2.duplicated(['quifoy', 'idfoy']), 'quifoy'] = j
        j += 1

    print_id(test2)
    indivi = indivi.merge(test2, on = ['noindiv', 'idfoy'], how = "left")
    indivi['quifoy'] = where(indivi['quifoy_x'] == 2, indivi['quifoy_y'], indivi['quifoy_x'])
    del indivi['quifoy_x'], indivi['quifoy_y']
    print_id(indivi)

    del test2, fip
    log.info(
        u"nb de doublons idfam/quifam' {}".format(
            len(indivi[indivi.duplicated(subset = ['idfoy', 'quifoy'])])
            )
        )
    print_id(indivi)

    log.info(u"Etape 8 : création des fichiers totaux")
    famille = temporary_store['famc_{}'.format(year)]

    log.info(u"    8.1 : création de tot2 & tot3")
    # TODO: MBJ increase in number of menage/foyer when merging with family ...
    tot2 = indivi.merge(famille, on = 'noindiv', how = 'inner')
    del famille

    control(tot2, debug = True, verbose = True)
    assert tot2.quifam.notnull().all()

    temporary_store['tot2_{}'.format(year)] = tot2
    del indivi
    log.info(u"    tot2 saved")

    # NOTE(review): a left merge of tot2 with foyer used to be computed here
    # and its result discarded; it had no effect and was removed. Confirm
    # whether zimpof was meant to be attached to tot2.

    tot2 = tot2[tot2.idmen.notnull()].copy()

    print_id(tot2)
    tot3 = tot2
    # TODO: check where they come from
    tot3 = tot3.drop_duplicates(subset = 'noindiv')
    log.info("{}".format(len(tot3)))

    # Block to remove any unwanted duplicated pair
    control(tot3, debug = True, verbose = True)
    tot3 = tot3.drop_duplicates(subset = ['idfoy', 'quifoy'])
    tot3 = tot3.drop_duplicates(subset = ['idfam', 'quifam'])
    tot3 = tot3.drop_duplicates(subset = ['idmen', 'quimen'])
    tot3 = tot3.drop_duplicates(subset = ['noindiv'])
    control(tot3)

    log.info(u"    8.2 : On ajoute les variables individualisables")

    # Keep only the columns that are not listed in ind_vars_to_remove.
    allvars = temporary_store['ind_vars_to_remove_{}'.format(year)]
    vars2 = set(tot3.columns).difference(set(allvars))
    tot3 = tot3[list(vars2)]
    log.info("{}".format(len(tot3)))

    assert not(tot3.duplicated(subset = ['noindiv']).any()), "doublon dans tot3['noindiv']"
    lg_dup = len(tot3[tot3.duplicated(['idfoy', 'quifoy'])])
    assert lg_dup == 0, "{} pairs of idfoy/quifoy in tot3 are duplicated".format(lg_dup)

    temporary_store['tot3_{}'.format(year)] = tot3
    control(tot3)

    del tot2, allvars, tot3, vars2
    log.info(u"tot3 sauvegardé")
    gc.collect()
# Example #52
# 0
def _split_local_taxes(revenus):
    """Add ``imphab`` and ``impfon`` as 65 % / 35 % of the summed local taxes.

    The 0.65 / 0.35 weights come from the BdF 1995 survey, which separates
    housing tax from property tax; they are applied to the later surveys.
    """
    local_taxes = revenus.impot_res_ppal + revenus.impot_autres_res
    revenus['imphab'] = 0.65 * local_taxes
    revenus['impfon'] = 0.35 * local_taxes


def _load_imputed_rents(temporary_store, year):
    """Extract ``poste_coicop_421`` from the stored expenses as ``loyer_impute``.

    The one-column frame is also written to ``temporary_store`` under
    ``loyers_imputes_<year>`` and returned.
    """
    depenses = temporary_store["depenses_bdf_{}".format(year)]
    # rename() on the slice returns a new frame, avoiding an in-place rename
    # of a column selection (SettingWithCopy warning).
    loyers_imputes = depenses[["poste_coicop_421"]].rename(
        columns={"poste_coicop_421": "loyer_impute"})
    temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
    return loyers_imputes


def _add_disposable_income(revenus):
    """Add ``rev_disponible`` (floored at 0) and ``rev_disp_loyerimput``."""
    disponible = revenus.revtot - revenus.impot_revenu - revenus.imphab
    # Multiplying by the boolean mask sets negative disposable incomes to 0.
    revenus['rev_disponible'] = disponible * (disponible >= 0)
    revenus['rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute


def build_homogeneisation_revenus_menages(temporary_store=None, year=None):
    """Build the harmonised household income table for one survey year.

    Loads the ``budget_des_familles_<year>`` survey, renames the income and
    tax variables to a common set of names, computes a proxy of household
    disposable income (with and without imputed rent) and writes the result
    to ``temporary_store`` under ``revenus_<year>``.

    Only the years 1995, 2000, 2005 and 2011 are handled; for any other year
    nothing is written.

    Parameters:
        temporary_store: mapping-like store; ``depenses_bdf_<year>`` is read
            from it and ``revenus_<year>`` (plus ``loyers_imputes_<year>``
            for 2000, 2005 and 2011) is written to it.
        year: survey year.

    Raises:
        AssertionError: if ``temporary_store`` or ``year`` is None, or if an
            index consistency check fails (2000 and 2005 branches).
    """
    assert temporary_store is not None
    assert year is not None

    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection='budget_des_familles',
        config_files_directory=config_files_directory)
    survey = bdf_survey_collection.get_survey(
        'budget_des_familles_{}'.format(year))

    # ------------------------------------------------------------------
    # Harmonisation of household income data and computation of a proxy
    # of household disposable income.
    #
    # The 1995 base distinguishes housing tax from property tax; their
    # relative amounts are computed there in order to apply them to the
    # later surveys.
    # ------------------------------------------------------------------

    if year == 1995:
        menrev = survey.get_values(
            table="menrev",
            variables=[
                'revtot', 'ir', 'irbis', 'imphab', 'impfon', 'revaid',
                'revsal', 'revind', 'revsec', 'revret', 'revcho', 'revfam',
                'revlog', 'revinv', 'revrmi', 'revpat', 'mena', 'ponderr'
            ],
        )
        menage = survey.get_values(table="socioscm",
                                   variables=['exdep', 'exrev', 'mena'])

        # Index both tables on the household identifier so that the merge
        # aligns households (previously set_index's result was discarded and
        # the merge aligned on positional row numbers).
        menrev.set_index('mena', inplace=True)
        menage.set_index('mena', inplace=True)
        menrev = menrev.merge(menage, left_index=True, right_index=True)
        # Keep only households whose data quality is trusted:
        # exdep == 1 when household expenses are properly filled in,
        # exrev == 1 when household incomes are properly filled in.
        menrev = menrev[(menrev.exdep == 1) & (menrev.exrev == 1)]

        # Relative shares of housing tax and property tax.
        # menrev is not stored by this function; it is only computed here.
        menrev['foncier_hab'] = menrev.imphab + menrev.impfon
        menrev['part_IMPHAB'] = menrev.imphab / menrev.foncier_hab
        menrev['part_IMPFON'] = menrev.impfon / menrev.foncier_hab

        menrev['revsoc'] = (menrev.revret + menrev.revcho + menrev.revfam +
                            menrev.revlog + menrev.revinv + menrev.revrmi)
        for variable in [
                'revcho', 'revfam', 'revinv', 'revlog', 'revret', 'revrmi'
        ]:
            del menrev[variable]

        menrev['revact'] = menrev['revsal'] + menrev['revind'] + menrev['revsec']
        menrev.rename(columns={'revaid': 'somme_obl_recue'}, inplace=True)
        menrev['impot_revenu'] = menrev['ir'] + menrev['irbis']

        rev_disp = survey.get_values(
            table="menrev",
            variables=[
                'revtot', 'revret', 'revcho', 'revfam', 'revlog', 'revinv',
                'revrmi', 'imphab', 'impfon', 'revaid', 'revsal', 'revind',
                'revsec', 'revpat', 'mena', 'ponderr', 'ir', 'irbis'
            ],
        )
        rev_disp.set_index('mena', inplace=True)

        menage2 = survey.get_values(table="socioscm",
                                    variables=['exdep', 'exrev', 'mena'])
        menage2.set_index('mena', inplace=True)
        rev_disp = menage2.merge(rev_disp, left_index=True, right_index=True)

        rev_disp = rev_disp[(rev_disp.exrev == 1) & (rev_disp.exdep == 1)]

        rev_disp['revsoc'] = (rev_disp['revret'] + rev_disp['revcho'] +
                              rev_disp['revfam'] + rev_disp['revlog'] +
                              rev_disp['revinv'] + rev_disp['revrmi'])
        rev_disp['impot_revenu'] = rev_disp['ir'] + rev_disp['irbis']

        rev_disp.rename(columns=dict(revaid='somme_obl_recue'), inplace=True)
        rev_disp.somme_obl_recue = rev_disp.somme_obl_recue.fillna(0)

        rev_disp['revact'] = (rev_disp['revsal'] + rev_disp['revind'] +
                              rev_disp['revsec'])
        # revtot is rebuilt as the sum of activity, capital, social and
        # received-support incomes.
        rev_disp['revtot'] = (rev_disp['revact'] + rev_disp['revpat'] +
                              rev_disp['revsoc'] + rev_disp['somme_obl_recue'])

        rev_disp.rename(columns=dict(
            ponderr="pondmen",
            mena="ident_men",
            revind="act_indpt",
            revsal="salaires",
            revsec="autres_rev",
        ),
                        inplace=True)

        # NOTE(review): these are the *strings* '0' whereas the other years
        # carry numeric columns under the same names - confirm before changing.
        rev_disp['autoverses'] = '0'
        rev_disp['somme_libre_recue'] = '0'
        rev_disp['autres_ress'] = '0'

        # Disposable income is revtot minus housing tax and income tax
        # (plus possibly CSG and CRDS, not handled here).
        rev_disp['rev_disponible'] = (
            rev_disp.revtot - rev_disp.impot_revenu - rev_disp.imphab)
        loyers_imputes = temporary_store['depenses_bdf_{}'.format(year)]
        loyers_imputes.rename(
            columns={"0411": "loyer_impute"},
            inplace=True,
        )

        rev_dispbis = loyers_imputes.merge(rev_disp,
                                           left_index=True,
                                           right_index=True)
        # NOTE(review): imputed rent is *subtracted* here but *added* in the
        # 2000, 2005 and 2011 branches - confirm which sign is intended.
        rev_disp['rev_disp_loyerimput'] = (
            rev_disp['rev_disponible'] - rev_dispbis['loyer_impute'])

        # Conversion from French francs to euros.
        for var in [
                'somme_obl_recue', 'act_indpt', 'revpat', 'salaires',
                'autres_rev', 'rev_disponible', 'impfon', 'imphab', 'revsoc',
                'revact', 'impot_revenu', 'revtot', 'rev_disp_loyerimput'
        ]:
            rev_disp[var] = rev_disp[var] / 6.55957

        temporary_store["revenus_{}".format(year)] = rev_disp

    elif year == 2000:
        # TODO: rather fetch the variables coming from the expenses table
        # (available in temporary_store).
        rev_disp = survey.get_values(
            table="consomen",
            variables=[
                'c13141', 'c13111', 'c13121', 'c13131', 'pondmen', 'ident'
            ],
        )
        menage = survey.get_values(
            table="menage",
            variables=[
                'ident', 'revtot', 'revact', 'revsoc', 'revpat', 'rev70',
                'rev71', 'revt_d', 'pondmen', 'rev10', 'rev11', 'rev20',
                'rev21'
            ],
        ).sort_values(by=['ident'])
        menage.index = menage.index.astype(ident_men_dtype)
        rev_disp.index = rev_disp.index.astype(ident_men_dtype)
        # Columns present in both tables get the 'rev_disp' suffix on the
        # right-hand side (hence 'pondmenrev_disp' below).
        revenus = menage.join(rev_disp, how="outer", rsuffix="rev_disp")
        revenus.fillna(0, inplace=True)
        revenus.rename(columns=dict(
            c13111="impot_res_ppal",
            c13141="impot_revenu",
            c13121="impot_autres_res",
            rev70="somme_obl_recue",
            rev71="somme_libre_recue",
            revt_d="autres_ress",
            ident="ident_men",
            rev10="act_indpt",
            rev11="autoverses",
            rev20="salaires",
            rev21="autres_rev",
        ),
                       inplace=True)

        # Missing values were already replaced by 0 above, so the columns
        # can be cast directly.
        for var_to_int in [
                'pondmen', 'impot_autres_res', 'impot_res_ppal',
                'pondmenrev_disp', 'c13131'
        ]:
            revenus[var_to_int] = revenus[var_to_int].astype(int)

        _split_local_taxes(revenus)

        loyers_imputes = _load_imputed_rents(temporary_store, year)
        loyers_imputes.index = loyers_imputes.index.astype(ident_men_dtype)

        revenus.set_index('ident_men', inplace=True)
        revenus.index = revenus.index.astype(ident_men_dtype)
        assert set(revenus.index) == set(
            loyers_imputes.index
        ), 'revenus and loyers_imputes indexes are not equal'
        revenus = revenus.merge(loyers_imputes,
                                left_index=True,
                                right_index=True)
        _add_disposable_income(revenus)

        # Cast after the disposable income has been computed from the
        # un-cast values.
        revenus['loyer_impute'] = revenus['loyer_impute'].astype(int)

        temporary_store["revenus_{}".format(year)] = revenus

    elif year == 2005:
        rev_disp = survey.get_values(
            table="c05d",
            variables=['c13111', 'c13121', 'c13141', 'pondmen', 'ident_men'],
        ).sort_values(by=['ident_men'])
        menage = survey.get_values(
            table="menage",
            variables=[
                'ident_men', 'revtot', 'revact', 'revsoc', 'revpat',
                'rev700_d', 'rev701_d', 'rev999_d', 'rev100_d', 'rev101_d',
                'rev200_d', 'rev201_d'
            ],
        ).sort_values(by=['ident_men'])
        rev_disp.set_index('ident_men', inplace=True)
        menage.set_index('ident_men', inplace=True)
        menage.index = menage.index.astype('str')
        rev_disp.index = rev_disp.index.astype('str')
        assert menage.index.dtype == rev_disp.index.dtype, 'menage ({}) and revdisp ({}) dtypes differs'.format(
            menage.index.dtype, rev_disp.index.dtype)
        revenus = pandas.concat([menage, rev_disp], axis=1)
        # The column-wise concat must not introduce households absent from menage.
        assert len(menage.index) == len(revenus.index)
        revenus.rename(columns=dict(
            rev100_d="act_indpt",
            rev101_d="autoverses",
            rev200_d="salaires",
            rev201_d="autres_rev",
            rev700_d="somme_obl_recue",
            rev701_d="somme_libre_recue",
            rev999_d="autres_ress",
            c13111="impot_res_ppal",
            c13141="impot_revenu",
            c13121="impot_autres_res",
        ),
                       inplace=True)

        _split_local_taxes(revenus)
        del revenus['impot_autres_res']
        del revenus['impot_res_ppal']

        # Compute disposable income with and without imputed rent.
        loyers_imputes = _load_imputed_rents(temporary_store, year)
        loyers_imputes.index = loyers_imputes.index.astype('str')
        assert revenus.index.dtype == loyers_imputes.index.dtype
        assert set(revenus.index) == set(
            loyers_imputes.index
        ), '''revenus and loyers_imputes indexes are not equal.
In revenus and not in loyers_imputes:
{}
In loyers_imputes and not in revenus:
{}
'''.format(
            set(revenus.index) - set(loyers_imputes.index),
            set(loyers_imputes.index) - set(revenus.index))
        revenus = revenus.merge(loyers_imputes,
                                left_index=True,
                                right_index=True)
        _add_disposable_income(revenus)

        temporary_store["revenus_{}".format(year)] = revenus

    elif year == 2011:
        c05_variables = ['c13111', 'c13121', 'c13141', 'pondmen', 'ident_me']
        menage_variables = [
            'ident_me', 'revtot', 'revact', 'revsoc', 'revpat',
            'rev700', 'rev701', 'rev999', 'revindep', 'salaires'
        ]
        # Each table is looked up under its upper-case name first, then under
        # its lower-case name. `except Exception` (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit propagate.
        try:
            rev_disp = survey.get_values(
                table="C05", variables=c05_variables,
            ).sort_values(by=['ident_me'])
        except Exception:
            rev_disp = survey.get_values(
                table="c05", variables=c05_variables,
            ).sort_values(by=['ident_me'])
        try:
            menage = survey.get_values(
                table="MENAGE", variables=menage_variables,
            ).sort_values(by=['ident_me'])
        except Exception:
            menage = survey.get_values(
                table="menage", variables=menage_variables,
            ).sort_values(by=['ident_me'])

        # NOTE(review): these casts apply to the index that set_index()
        # replaces right below, so they do not affect the final index dtype.
        # Casting after set_index() may have been intended - confirm against
        # the dtype of the loyers_imputes index before reordering.
        rev_disp.index = rev_disp.index.astype(ident_men_dtype)
        menage.index = menage.index.astype(ident_men_dtype)
        rev_disp.set_index('ident_me', inplace=True)
        menage.set_index('ident_me', inplace=True)
        revenus = pandas.concat([menage, rev_disp], axis=1)
        menage.index.name = 'ident_men'
        revenus.index.name = 'ident_men'
        revenus.rename(
            columns=dict(
                revindep="act_indpt",
                # TODO: find the BdF 2011 counterparts of
                # rev101_d ("autoverses") and rev201_d ("autres_rev").
                rev700="somme_obl_recue",
                rev701="somme_libre_recue",
                rev999="autres_ress",
                c13111="impot_res_ppal",
                c13141="impot_revenu",
                c13121="impot_autres_res",
            ),
            inplace=True)

        _split_local_taxes(revenus)
        del revenus['impot_autres_res']
        del revenus['impot_res_ppal']

        loyers_imputes = _load_imputed_rents(temporary_store, year)
        revenus = revenus.merge(loyers_imputes,
                                left_index=True,
                                right_index=True)
        _add_disposable_income(revenus)
        temporary_store["revenus_{}".format(year)] = revenus
@author: malkaguillot
"""
import pandas

from openfisca_france_data import default_config_files_directory as config_files_directory
from openfisca_france_data.input_data_builders.build_openfisca_survey_data.base \
    import year_specific_by_generic_data_frame_name
from openfisca_france_data.temporary import get_store
from openfisca_survey_manager.survey_collections import SurveyCollection

# Inputs: the ERFS individual, foyer and sif tables (tested on 2009 data)
year = 2009
year_specific_by_generic = year_specific_by_generic_data_frame_name(year)

# Load the raw foyer and individual tables of the ERFS survey for that year.
erfs_survey_collection = SurveyCollection.load(collection = 'erfs', config_files_directory = config_files_directory)
survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
foyer_all = survey.get_values(table = year_specific_by_generic["foyer"])
erfind = survey.get_values(table = year_specific_by_generic["erf_indivi"])

# The sif table is read from the 'erfs' temporary store rather than from the survey.
temporary_store = get_store(file_name = 'erfs')
sif = temporary_store['sif']

# Column subsets of each table.
ind = erfind[['ident', 'noindiv', 'declar1', 'declar2', 'zsali', 'persfip', 'persfipd']]
small_sif = sif[['noindiv', 'declar', 'causeXYZ']]
foyer = foyer_all[['ident', 'noindiv', 'declar', 'sif', '_1aj', '_1bj', '_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy'
                   ]]
# Only ident, noindiv, declar, sif, _1aj and _1bj remain after this drop.
foyer = foyer.drop(['_1cj', '_1dj', '_1aq', '_1bq', '_8by', '_8cy'], axis=1)
# Join foyer and sif rows sharing the same (declar, noindiv) pair (pandas.merge default join).
foyer_sif = pandas.merge(foyer, small_sif, on = ['declar', 'noindiv'])

def create_indivim_menagem(temporary_store=None, year=None):
    """Build and store the merged household and individual tables.

    Loads the ERFS (tax source) and EEC (labour-force survey) household and
    individual tables for ``year``, merges them, derives the recoded activity
    variable ``actrec`` and the ``locataire`` flag, and writes the results to
    ``temporary_store`` under ``menagem_<year>`` and ``indivim_<year>``.

    Parameters:
        temporary_store: mapping-like store receiving the two merged tables.
        year: survey year.

    Raises:
        AssertionError: if an argument is None, if one of the checked
            individual variables is not of integer dtype, or if ``actrec``
            takes a value outside 1..9.
    """
    assert temporary_store is not None
    assert year is not None

    # Load data
    erfs_survey_collection = SurveyCollection.load(
        collection='erfs', config_files_directory=config_files_directory)

    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    erfmen = survey.get_values(table=year_specific_by_generic["erf_menage"])
    eecmen = survey.get_values(table=year_specific_by_generic["eec_menage"])
    erfind = survey.get_values(table=year_specific_by_generic["erf_indivi"])
    eecind = survey.get_values(table=year_specific_by_generic["eec_indivi"])

    # Consistency check between the two sources: identifiers present in the
    # EEC tables but absent from the ERFS ones.
    noappar_m = eecmen[~(eecmen.ident.isin(erfmen.ident.values))].copy()

    # The mask is built from eecind, so it must filter eecind (it was
    # previously applied to eecmen, a different table).
    noappar_i = eecind[~(eecind.ident.isin(erfind.ident.values))].copy()
    # `keep='last'` replaces the removed `take_last=True` keyword.
    noappar_i = noappar_i.drop_duplicates(subset='ident', keep='last')
    # TODO: check that there should theoretically be no duplicates

    difference = set(noappar_i.ident).symmetric_difference(noappar_m.ident)
    intersection = set(noappar_i.ident) & set(noappar_m.ident)
    log.info("There are {} differences and {} intersections".format(
        len(difference), len(intersection)))
    del noappar_i, noappar_m, difference, intersection
    gc.collect()

    # Merge the labour-force survey with the tax source
    menagem = erfmen.merge(eecmen)
    indivim = eecind.merge(erfind, on=['noindiv', 'ident', 'noi'], how="inner")

    # Dtype check, which also verifies that the variables exist.
    # TODO: minimal dtype
    # TODO: this should be done somewhere else
    var_list = [
        'acteu',
        'agepr',
        'cohab',
        'contra',
        'encadr',
        'forter',
        'lien',
        'mrec',
        'naia',
        'noicon',
        'noimer',
        'noiper',
        'prosa',
        'retrai',
        'rstg',
        'statut',
        'stc',
        'titc',
        'txtppb',
    ]
    for var in var_list:
        assert numpy.issubdtype(
            indivim[var].dtype, numpy.integer
        ), "Variable {} dtype is {} and should be an integer".format(
            var, indivim[var].dtype)

    ########################
    # Variable creation    #
    ########################

    # actrec: activity recoded as recommended by INSEE (user guide, p. 84).
    # There is no code 6: the same nomenclature as INSEE's recoded variable
    # is used.
    indivim["actrec"] = numpy.nan
    acteu = indivim.acteu
    # Applied in order: a later rule overrides an earlier one on the rows it
    # matches. Assignments go through indivim.loc[mask, column] so that they
    # always write into the frame itself (no chained indexing).
    recodings = [
        # 3: fixed-term contract (default for acteu == 1)
        (acteu == 1, 3),
        # 8: homemaker, other inactive (default for acteu == 3)
        (acteu == 3, 8),
        # 1: employed, self-employed or working for a family member
        ((acteu == 1) & (indivim.stc.isin([1, 3])), 1),
        # 2: employee with an open-ended contract
        ((acteu == 1) & (((indivim.stc == 2) & (indivim.contra == 1)) |
                         (indivim.titc == 2)), 2),
        # 4: unemployed
        ((acteu == 2) | ((acteu == 3) & (indivim.mrec == 1)), 4),
        # 5: pupil, student, unpaid trainee
        ((acteu == 3) & ((indivim.forter == 2) | (indivim.rstg == 1)), 5),
        # 7: retired, early-retired, withdrawn from business (unchecked)
        ((acteu == 3) & ((indivim.retrai == 1) | (indivim.retrai == 2)), 7),
        # 9: probably children under 16
        # TODO: check that fact in database and questionnaire
        (acteu == 0, 9),
    ]
    for mask, code in recodings:
        indivim.loc[mask, "actrec"] = code

    # The cast fails if some rows were left without a code (still NaN).
    indivim["actrec"] = indivim.actrec.astype("int8")
    assert_dtype(indivim.actrec, "int8")
    assert indivim.actrec.isin(range(
        1, 10)).all(), 'actrec values are outside the interval [1, 9]'

    #   TODO : compare the result with results provided by Insee
    #   tu99
    if year == 2009:
        # NOTE(review): erfind has already been merged into indivim above,
        # so this assignment does not reach the stored tables.
        erfind['tu99'] = None  # TODO: why ?

    # Tenant flag
    menagem["locataire"] = menagem.so.isin([3, 4, 5])
    assert_dtype(menagem.locataire, "bool")

    # Bring ddipl of the rows with lpr == 1 onto the household table.
    transfert = indivim.loc[indivim.lpr == 1, ['ident', 'ddipl']].copy()
    menagem = menagem.merge(transfert)

    # Correction
    def _manually_remove_errors():
        '''
        This method is here because some oddities can make it through the controls throughout the procedure
        It is here to remove all these individual errors that compromise the process.
        '''
        if year == 2006:
            faulty = indivim.noindiv == 603018905
            indivim.loc[faulty, 'lien'] = 2
            indivim.loc[faulty, 'noimer'] = 1
            log.info("{}".format(indivim[faulty].to_string()))

    _manually_remove_errors()

    temporary_store['menagem_{}'.format(year)] = menagem
    del eecmen, erfmen, menagem, transfert
    gc.collect()
    temporary_store['indivim_{}'.format(year)] = indivim
    del erfind, eecind
def build_homogeneisation_revenus_menages(year = None):
    """Build menage consumption by categorie fiscale dataframe """

    assert year is not None
    # Load data
    bdf_survey_collection = SurveyCollection.load(
        collection = 'budget_des_familles', config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

# **********************************************************************************************************************
# ********************************* HOMOGENEISATION DES DONNEES SUR LES REVENUS DES MENAGES ****************************
# ************************************ CALCUL D'UN PROXI DU REVENU DISPONIBLE DES MENAGES ******************************
# **********************************************************************************************************************
#
# ********************HOMOGENEISATION DES BASES DE RESSOURCES***************************

# /* La base 95 permet de distinguer taxe d'habitation et impôts fonciers. On calcule leur montant relatif pour l'appliquer à 00 et 05 */


    if year == 1995:
        menrev = survey.get_values(
            table = "menrev",
            variables = [
                'revtot', 'ir', 'irbis', 'imphab', 'impfon', 'revaid', 'revsal', 'revind', 'revsec', 'revret',
                'revcho', 'revfam', 'revlog', 'revinv', 'revrmi', 'revpat', 'mena', 'ponderr'
                ],
            )
        menage = survey.get_values(
            table = "socioscm",
            variables = ['exdep', 'exrev', 'mena']
            )

        menage.set_index('mena')
        menrev = menrev.merge(menage, left_index = True, right_index = True)
        # cette étape de ne garder que les données dont on est sûr de la qualité et de la véracité
        # exdep = 1 si les données sont bien remplies pour les dépenses du ménage
        # exrev = 1 si les données sont bien remplies pour les revenus du ménage

        menrev = menrev[(menrev.exdep == 1) & (menrev.exrev == 1)]


        menrev['foncier_hab'] = menrev.imphab + menrev.impfon
        menrev['part_IMPHAB'] = menrev.imphab / menrev.foncier_hab
        menrev['part_IMPFON'] = menrev.impfon / menrev.foncier_hab

        menrev['revsoc'] = (
            menrev.revret + menrev.revcho + menrev.revfam + menrev.revlog + menrev.revinv + menrev.revrmi
            )
        for variable in ['revcho', 'revfam', 'revinv', 'revlog', 'revret', 'revrmi']:
            del menrev[variable]

        menrev['revact'] = menrev['revsal'] + menrev['revind'] + menrev['revsec']
        menrev.rename(
            columns = dict(
                revpat = "revpat",
                impfon = "impfon",
                imphab = "imphab",
                revaid = "somme_obl_recue",
                ),
            inplace = True
            )
        menrev['impot_revenu'] = menrev['ir'] + menrev['irbis']


        rev_disp = survey.get_values(
            table = "menrev",
            variables = ['revtot', 'revret', 'revcho', 'revfam', 'revlog', 'revinv', 'revrmi', 'imphab', 'impfon', 'revaid', 'revsal', 'revind', 'revsec', 'revpat', 'mena', 'ponderr', 'ir','irbis' ],
            )
        rev_disp.set_index('mena', inplace=True)

        menage2 = survey.get_values(
            table = "socioscm",
            variables = ['exdep', 'exrev', 'mena']
            )

        menage2.set_index('mena', inplace = True)
        rev_disp = menage2.merge(rev_disp, left_index = True, right_index = True)

        rev_disp = rev_disp[(rev_disp.exrev == 1) & (rev_disp.exdep == 1)]

        rev_disp['revsoc'] = rev_disp['revret'] + rev_disp['revcho'] + rev_disp['revfam'] + rev_disp['revlog'] + rev_disp['revinv'] + rev_disp['revrmi']
        rev_disp['impot_revenu'] = rev_disp['ir'] + rev_disp['irbis']

        rev_disp.rename(
            columns = dict(
                revaid = 'somme_obl_recue',
                ),
            inplace = True
            )
        rev_disp.somme_obl_recue = rev_disp.somme_obl_recue.fillna(0)

        rev_disp['revact'] = rev_disp['revsal'] + rev_disp['revind'] + rev_disp['revsec']

        rev_disp['revtot'] = rev_disp['revact'] + rev_disp['revpat'] + rev_disp['revsoc'] + rev_disp['somme_obl_recue']

        rev_disp['revact'] = rev_disp['revsal'] + rev_disp['revind'] + rev_disp['revsec']

        rev_disp.rename(
            columns = dict(
                ponderr = "pondmen",
                mena = "ident_men",
                revind = "act_indpt",
                revsal = "salaires",
                revsec = "autres_rev",
                ),
            inplace = True
            )

        rev_disp['autoverses'] = '0'
        rev_disp['somme_libre_recue'] = '0'
        rev_disp['autres_ress'] = '0'


#
# /* Le revenu disponible se calcule à partir de revtot à laquelle on retrancher la taxe d'habitation
# et l'impôt sur le revenu, plus éventuellement les CSG et CRDS.
# La variable revtot est la somme des revenus d'activité, sociaux, du patrimoine et d'aide. */
#
        rev_disp['rev_disponible'] = rev_disp.revtot - rev_disp.impot_revenu - rev_disp.imphab
        loyers_imputes = temporary_store['depenses_bdf_{}'.format(year)]
        loyers_imputes.rename(
            columns = {"0411": "loyer_impute"},
            inplace = True,
            )

        rev_dispbis = loyers_imputes.merge(rev_disp, left_index = True, right_index = True)
        rev_disp['rev_disp_loyerimput'] = rev_disp['rev_disponible'] - rev_dispbis['loyer_impute']

        for var in ['somme_obl_recue', 'act_indpt', 'revpat', 'salaires', 'autres_rev', 'rev_disponible', 'impfon', 'imphab', 'revsoc', 'revact', 'impot_revenu', 'revtot', 'rev_disp_loyerimput'] :
            rev_disp[var] = rev_disp[var] / 6.55957
# * CONVERSION EN EUROS

        temporary_store["revenus_{}".format(year)] = rev_disp

    elif year == 2000:
    # TODO: récupérer plutôt les variables qui viennent de la table dépenses (dans temporary_store)
        consomen = survey.get_values(
            table = "consomen",
            variables = ['c13141', 'c13111', 'c13121', 'c13131', 'pondmen', 'ident'],
            )
        rev_disp = consomen.sort(columns = ['ident'])
        del consomen


        menage = survey.get_values(
            table = "menage",
            variables = ['ident', 'revtot', 'revact', 'revsoc', 'revpat', 'rev70', 'rev71', 'revt_d', 'pondmen', 'rev10', 'rev11', 'rev20', 'rev21'],
            ).sort(columns = ['ident'])


        revenus = menage.join(rev_disp, how = "outer", rsuffix = "rev_disp")
        revenus.rename(
            columns = dict(
                c13111 = "impot_res_ppal",
                c13141 = "impot_revenu",
                c13121 = "impot_autres_res",
                rev70 = "somme_obl_recue",
                rev71 = "somme_libre_recue",
                revt_d= "autres_ress",
                ident = "ident_men",
                rev10 = "act_indpt",
                rev11 = "autoverses",
                rev20 = "salaires",
                rev21 = "autres_rev",
                ),
            inplace = True
            )

        var_to_ints = ['pondmen','impot_autres_res','impot_res_ppal','pondmenrev_disp','c13131']
        for var_to_int in var_to_ints:
            revenus[var_to_int] = revenus[var_to_int].astype(int)

        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal + revenus.impot_autres_res)


        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["0421"]
        loyers_imputes = loyers_imputes[variables]

        loyers_imputes.rename(
            columns = {"0421": "loyer_impute"},
            inplace = True,
            )

        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes

        loyers_imputes.index = loyers_imputes.index.astype('int')
        revenus = revenus.set_index('ident_men')
        revenus.index = revenus.index.astype('int')

        revenus = revenus.merge(loyers_imputes, left_index = True, right_index = True)

        revenus['rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        revenus['rev_disponible'] = revenus['rev_disponible'] * (revenus['rev_disponible'] >= 0)
        revenus['rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute

        var_to_ints = ['loyer_impute']
        for var_to_int in var_to_ints:
            revenus[var_to_int] = revenus[var_to_int].astype(int)


        temporary_store["revenus_{}".format(year)] = revenus



    elif year == 2005:
        c05d = survey.get_values(
            table = "c05d",
            variables = ['c13111', 'c13121', 'c13141', 'pondmen', 'ident_men'],
            )
        rev_disp = c05d.sort(columns = ['ident_men'])
        del c05d
        menage = survey.get_values(
            table = "menage",
            variables = ['ident_men', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700_d', 'rev701_d',
                'rev999_d', 'rev100_d', 'rev101_d', 'rev200_d', 'rev201_d'],
            ).sort(columns = ['ident_men'])
        rev_disp.set_index('ident_men', inplace = True)
        menage.set_index('ident_men', inplace = True)
        revenus = pandas.concat([menage, rev_disp], axis = 1)
        revenus.rename(
            columns = dict(
                rev100_d = "act_indpt",
                rev101_d = "autoverses",
                rev200_d = "salaires",
                rev201_d = "autres_rev",
                rev700_d = "somme_obl_recue",
                rev701_d = "somme_libre_recue",
                rev999_d = "autres_ress",
                c13111 = "impot_res_ppal",
                c13141 = "impot_revenu",
                c13121 = "impot_autres_res",
                ),
            inplace = True
            )

        # * Ces pondérations (0.65 0.35) viennent de l'enquête BdF 1995 qui distingue taxe d'habitation et impôts fonciers. A partir de BdF 1995,
        # * on a calculé que la taxe d'habitation représente en moyenne 65% des impôts locaux, et que les impôts fonciers en représentenr 35%.
        # * On applique ces taux aux enquêtes 2000 et 2005.
        # gen imphab= 0.65*(impot_res_ppal + impot_autres_res)
        # gen impfon= 0.35*(impot_res_ppal + impot_autres_res)
        # drop impot_autres_res impot_res_ppal

        revenus['imphab'] = 0.65 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        revenus['impfon'] = 0.35 * (revenus.impot_res_ppal + revenus.impot_autres_res)
        del revenus['impot_autres_res']
        del revenus['impot_res_ppal']

        #    * Calculer le revenu disponible avec et sans le loyer imputé

        loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
        variables = ["0421"]
        loyers_imputes = loyers_imputes[variables]
        loyers_imputes.rename(
            columns = {"0421": "loyer_impute"},
            inplace = True,
            )
        temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
        revenus = revenus.merge(loyers_imputes, left_index = True, right_index = True)
        revenus['rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
        revenus['rev_disponible'] = revenus['rev_disponible'] * (revenus['rev_disponible'] >= 0)
        revenus['rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
        temporary_store["revenus_{}".format(year)] = revenus

    elif year == 2011:
       try:
          c05 = survey.get_values(
            table = "C05",
            variables = ['c13111', 'c13121', 'c13141', 'pondmen', 'ident_me'],
            )
       except:
          c05 = survey.get_values(
            table = "c05",
            variables = ['c13111', 'c13121', 'c13141', 'pondmen', 'ident_me'],
            )
       rev_disp = c05.sort(columns = ['ident_me'])
       del c05
       try:
          menage = survey.get_values(
            table = "MENAGE",
            variables = ['ident_me', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700', 'rev701', 'rev999', 'revindep', 'salaires'],
            ).sort(columns = ['ident_me'])
       except:
          menage = survey.get_values(
            table = "menage",
            variables = ['ident_me', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700', 'rev701', 'rev999', 'revindep', 'salaires'],
            ).sort(columns = ['ident_me'])

#      variables = ['ident_me', 'revtot', 'revact', 'revsoc', 'revpat', 'rev700', 'rev701', 'rev999', 'revindep', 'rev101_d', 'salaires', 'rev201'],

       rev_disp.set_index('ident_me', inplace = True)
       menage.set_index('ident_me', inplace = True)
       revenus = pandas.concat([menage, rev_disp], axis = 1)
       revenus.rename(
            columns = dict(
                revindep = "act_indpt",
#TODO: trouver ces revenus commentés dans bdf 2011
#                rev101_d = "autoverses",
                salaires = "salaires",
#                rev201_d = "autres_rev",
                rev700 = "somme_obl_recue",
                rev701 = "somme_libre_recue",
                rev999 = "autres_ress",
                c13111 = "impot_res_ppal",
                c13141 = "impot_revenu",
                c13121 = "impot_autres_res",
                ),
            inplace = True
            )
       revenus['imphab'] = 0.65 * (revenus.impot_res_ppal + revenus.impot_autres_res)
       revenus['impfon'] = 0.35 * (revenus.impot_res_ppal + revenus.impot_autres_res)
       del revenus['impot_autres_res']
       del revenus['impot_res_ppal']

       loyers_imputes = temporary_store["depenses_bdf_{}".format(year)]
       variables = ["0421"]
       loyers_imputes = loyers_imputes[variables]
       loyers_imputes.rename(
            columns = {"0421": "loyer_impute"},
            inplace = True,
            )
       temporary_store["loyers_imputes_{}".format(year)] = loyers_imputes
       revenus = revenus.merge(loyers_imputes, left_index = True, right_index = True)
       revenus['rev_disponible'] = revenus.revtot - revenus.impot_revenu - revenus.imphab
       revenus['rev_disponible'] = revenus['rev_disponible'] * (revenus['rev_disponible'] >= 0)
       revenus['rev_disp_loyerimput'] = revenus.rev_disponible + revenus.loyer_impute
       temporary_store["revenus_{}".format(year)] = revenus
Exemple #56
0
def _load_hotdeck_result(survey):
    """Return the hot-deck imputation results used for the 1995 survey.

    The Stata file ``hotdeck_result.dta`` is looked up in the ``assets`` directory declared in the
    ``openfisca_france_indirect_taxation`` section of ``config.ini`` / ``config_local.ini``.
    If that lookup or the read fails for any reason, the ``hotdeck_result`` table of `survey`
    is returned instead.
    """
    try:
        parser = SafeConfigParser()
        config_local_ini = os.path.join(config_files_directory, 'config_local.ini')
        config_ini = os.path.join(config_files_directory, 'config.ini')
        parser.read([config_ini, config_local_ini])
        directory_path = os.path.normpath(
            parser.get("openfisca_france_indirect_taxation", "assets")
            )
        return pandas.read_stata(os.path.join(directory_path, 'hotdeck_result.dta'))
    except Exception:
        # Deliberate best-effort fallback: any configuration or file problem sends us to the
        # survey table. `Exception` (not a bare except) so that KeyboardInterrupt / SystemExit
        # still propagate.
        return survey.get_values(table = 'hotdeck_result')


def _build_loyers_imputes_1995(survey, temporary_store, year):
    """Build the 1995 imputed-rent table from the ``socioscm`` table and the hot-deck results.

    Returns a dataframe with the columns ``ident_men`` and ``'0411'`` (the imputed rent, set to 0
    for households whose actual rent is observed).
    """
    imput00 = survey.get_values(table = "socioscm")
    # Keep only the households whose data are considered reliable:
    # exdep == 1 when the household's expenses are properly filled in,
    # exrev == 1 when the household's income is properly filled in.
    imput00 = imput00[(imput00.exdep == 1) & (imput00.exrev == 1)]
    kept_variables = ['mena', 'stalog', 'surfhab', 'confort1', 'confort2', 'confort3', 'confort4',
                    'ancons', 'sitlog', 'nbphab', 'rg', 'cc']
    # Non-inplace rename: yields a fresh frame instead of mutating a slice of the survey table.
    imput00 = imput00[kept_variables].rename(columns = {'mena': 'ident_men'})

    # TODO: continue variable cleaning
    for variable in ['surfhab']:
        imput00[variable] = imput00[variable].fillna(0)

    for variable in ['sitlog', 'confort1', 'stalog', 'surfhab', 'ident_men', 'ancons', 'nbphab']:
        imput00[variable] = imput00[variable].astype(int)

    # reset_index() returns a new frame, so the object held by temporary_store is left untouched
    # (the caller reloads it later and relies on its index).
    depenses = temporary_store['depenses_{}'.format(year)].reset_index()
    depenses_small = depenses[['ident_men', '04110', 'pondmen']].copy()
    depenses_small['ident_men'] = depenses_small['ident_men'].astype('int')
    imput00 = depenses_small.merge(imput00, on = 'ident_men').set_index('ident_men')
    imput00.rename(columns = {'04110': 'loyer_reel'}, inplace = True)

    # Flag the households whose rent is known and whose occupant is a tenant
    imput00['observe'] = (imput00.loyer_reel > 0) & (imput00.stalog.isin([3, 4]))
    imput00['maison_appart'] = imput00.sitlog == 1

    # Surface category, from 1 (<= 15) to 8 (> 150): one is added per threshold exceeded
    imput00['catsurf'] = (
        1 +
        (imput00.surfhab > 15) +
        (imput00.surfhab > 30) +
        (imput00.surfhab > 40) +
        (imput00.surfhab > 60) +
        (imput00.surfhab > 80) +
        (imput00.surfhab > 100) +
        (imput00.surfhab > 150)
        )
    assert imput00.catsurf.isin(range(1, 9)).all()
    # TODO: check this step against the Stata code (notably catsurf == 2, omitted there).
    # The Stata code defines a `maison` indicator equal to 0 when maison_appart == 1 and
    # (cc == 5 and catsurf in {1, 3, 8}) or (cc == 4 and catsurf == 1). The previous Python port
    # wrote it with four successive `imput00.maison = ...` statements; as `maison` is not a column
    # these only set a Python attribute (each overwriting the last) that nothing read, so they
    # have been dropped. Port the indicator properly if it turns out to be needed.

    hotdeck = _load_hotdeck_result(survey)

    imput00.reset_index(inplace = True)
    hotdeck['ident_men'] = hotdeck['ident_men'].astype('int')
    imput00 = imput00.merge(hotdeck, on = 'ident_men')
    # No imputed rent when the actual rent is observed. Series.mask on the column replaces the
    # former chained assignment (`imput00.loyer_impute[imput00.observe] = 0`), which may write to
    # a temporary copy and leave the frame unchanged.
    imput00['loyer_impute'] = imput00['loyer_impute'].mask(imput00['observe'], 0)
    loyers_imputes = imput00[['ident_men', 'loyer_impute']].copy()
    assert loyers_imputes.loyer_impute.notnull().all()
    # NOTE(review): the other years name this column 'poste_coicop_421'; '0411' is kept here
    # unchanged because downstream consumers are not visible from this function -- confirm.
    loyers_imputes.rename(columns = dict(loyer_impute = '0411'), inplace = True)
    return loyers_imputes


def build_imputation_loyers_proprietaires(temporary_store = None, year = None):
    """Add the imputed rents to the household expenses table of a Budget des Familles survey.

    For 1995 the imputed rents come from the hot-deck results; for 2000, 2005 and 2011 they are
    read from the survey's household (``menage``) table.

    The imputed rents, indexed by ``ident_men``, are saved in `temporary_store` under
    ``loyers_imputes_<year>``; they are then merged on the index with ``depenses_<year>`` and the
    result is saved under ``depenses_bdf_<year>``. Nothing is returned.

    :param temporary_store: mapping-like store holding ``depenses_<year>`` and receiving the outputs
    :param year: survey year, one of 1995, 2000, 2005 or 2011
    :raises AssertionError: if an argument is None, if the expenses and imputed-rent tables do not
        cover the same households, or if they share a column name
    :raises ValueError: if `year` is not a supported survey year
    """
    assert temporary_store is not None
    assert year is not None

    # Load data
    bdf_survey_collection = SurveyCollection.load(collection = 'budget_des_familles',
        config_files_directory = config_files_directory)
    survey = bdf_survey_collection.get_survey('budget_des_familles_{}'.format(year))

    if year == 1995:
        loyers_imputes = _build_loyers_imputes_1995(survey, temporary_store, year)

    # From the 2000 survey onwards, the imputed rents of the household table are used
    # (original note: computed by INSEE).
    elif year == 2000:
        loyers_imputes = survey.get_values(table = "menage", variables = ['ident', 'rev81'])
        loyers_imputes = loyers_imputes.rename(
            columns = {
                'ident': 'ident_men',
                'rev81': 'poste_coicop_421',
                },
            )

    elif year == 2005:
        loyers_imputes = survey.get_values(table = "menage")
        loyers_imputes = loyers_imputes[['ident_men', 'rev801_d']].rename(
            columns = {'rev801_d': 'poste_coicop_421'},
            )

    elif year == 2011:
        # The table name is tried in upper case first, then in lower case.
        try:
            loyers_imputes = survey.get_values(table = "MENAGE")
        except Exception:
            loyers_imputes = survey.get_values(table = "menage")
        loyers_imputes = loyers_imputes[['ident_me', 'rev801']].rename(
            columns = {'rev801': 'poste_coicop_421', 'ident_me': 'ident_men'},
            )

    else:
        # Previously an unsupported year surfaced later as an UnboundLocalError.
        raise ValueError(
            "Unsupported survey year {}: expected 1995, 2000, 2005 or 2011".format(year)
            )

    # Join to the table of expenses by COICOP item
    loyers_imputes = loyers_imputes.set_index('ident_men')
    temporary_store['loyers_imputes_{}'.format(year)] = loyers_imputes
    depenses = temporary_store['depenses_{}'.format(year)]
    # Both indexes are cast to the same integer type so that the merge keys compare equal
    depenses.index = depenses.index.astype('int64')
    loyers_imputes.index = loyers_imputes.index.astype('int64')
    assert set(depenses.index) == set(loyers_imputes.index)
    assert len(set(depenses.columns).intersection(set(loyers_imputes.columns))) == 0
    depenses = depenses.merge(loyers_imputes, left_index = True, right_index = True)

    # ****************************************************************************************************************
    #  Step 0-1-3: SAVE THE HOMOGENISED EXPENSES TABLES IN THE RIGHT PLACE
    # ****************************************************************************************************************

    # Save in temporary store
    temporary_store['depenses_bdf_{}'.format(year)] = depenses