Ejemplo n.º 1
0
def build_survey_collection(name=None,
                            erase_collection_json=False,
                            overwrite_surveys=False,
                            data_directory_path_by_year=None,
                            source_format='sas'):
    """Build or update the survey collection ``name`` from per-year data directories.

    For every ``(year, directory)`` pair a survey called ``"<name>_<year>"`` is
    added to the collection, the collection is dumped to
    ``<collections_directory>/<name>.json`` and the HDF store of the surveys
    whose name ends with the year is filled.

    :param name: name of the collection (required).
    :param erase_collection_json: when true, start from a fresh
        ``SurveyCollection`` instead of loading the existing one.
    :param overwrite_surveys: forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_year: mapping from year to a data directory,
        either an existing path or a path relative to the ``input_directory``
        option of the ``data`` config section (required).
    :param source_format: key used to pick the files out of the result of
        ``create_data_file_by_format`` and forwarded to ``fill_hdf``.
    :returns: the updated ``SurveyCollection``.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    if not data_directory_path_by_year:
        # ``dict.keys()`` is never None, so test for an empty mapping instead:
        # that is the case in which there is nothing to process.
        log.error("A list of years to process is needed")

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name=name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # No usable stored collection: start a new one.
            survey_collection = SurveyCollection(
                name=name, config_files_directory=config_files_directory)

    # ``items()`` works on both Python 2 and Python 3 (``iteritems`` does not).
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            # Not a directory as given: interpret it relative to the
            # configured input directory.
            input_data_directory = survey_collection.config.get(
                'data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory,
                                               data_directory_path)
            # Check the resolved path (not the input directory a second time).
            assert os.path.isdir(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        print(data_file_by_format)
        survey_name = '{}_{}'.format(name, year)
        # NOTE(review): the files of ``source_format`` are always passed as
        # ``sas_files``, even when ``source_format`` is not 'sas' -- confirm
        # this is intended.
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format[source_format],
        )
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory,
                                            "{}.json".format(name))
        survey_collection.dump(json_file_path=collection_json_path)
        # Only (re)fill the surveys belonging to the year being processed.
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        survey_collection.fill_hdf(source_format=source_format,
                                   surveys=surveys,
                                   overwrite=overwrite_surveys)
    return survey_collection
Ejemplo n.º 2
0
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name, survey_label = None,
        table_label = None, table_name = None):
    """Store ``input_dataframe`` as a table of survey ``survey_name`` in ``collection``.

    The collection is loaded (or created when it cannot be loaded), the survey
    is fetched from it (or created), the dataframe is inserted as a table and
    the collection is dumped to ``<collections_directory>/<collection>.json``.

    :param input_dataframe: dataframe passed to ``survey.insert_table``.
    :param entity: entity name, used in the default table name and label.
    :param period: period, normalised with ``periods.period``.
    :param collection: name of the survey collection.
    :param survey_name: name of the survey receiving the table.
    :param survey_label: label used when the survey has to be created.
    :param table_label: table label; defaults to a sentence built from
        ``entity`` and ``period``.
    :param table_name: table name; defaults to ``"<entity>_<period>"``.
    :raises AssertionError: if the configured collections directory does not exist.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        # Fall back on the config files shipped with the package test data.
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )

    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # The survey could not be fetched from the collection: create it.
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )

    if survey.hdf5_file_path is None:
        # Default HDF5 location: <output_directory>/<survey name>.h5
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # ``Logger.warn`` is a deprecated alias; use ``warning``.
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')

    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any survey of the same name so the collection holds it only once.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def build_bdf_survey_collection(years=None, erase=False, overwrite=False):
    """Build or update the ``budget_des_familles`` survey collection.

    For each year, the Stata files found under
    ``<input root>/budget_des_familles/<year>`` are registered as survey
    ``budget_des_familles_<year>``, the collection is dumped to
    ``<collections_directory>/budget_des_familles.json`` and the HDF store of
    the surveys whose name ends with the year is filled.

    :param years: iterable of years to process (required).
    :param erase: when true, start from a fresh ``SurveyCollection`` instead of
        loading the existing one.
    :param overwrite: forwarded to ``fill_hdf``.
    :returns: the updated ``SurveyCollection``.
    :raises ValueError: if ``years`` is None.
    """
    if years is None:
        # Fail early with an explicit message instead of a TypeError raised
        # later when iterating over None.
        raise ValueError("A list of years to process is needed")

    collection_name = "budget_des_familles"
    if erase:
        bdf_survey_collection = SurveyCollection(
            name=collection_name,
            config_files_directory=config_files_directory)
    else:
        try:
            bdf_survey_collection = SurveyCollection.load(
                collection=collection_name,
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # No usable stored collection: start a new one.
            bdf_survey_collection = SurveyCollection(
                name=collection_name,
                config_files_directory=config_files_directory)

    input_data_directory = bdf_survey_collection.config.get(
        'data', 'input_directory')
    # NOTE(review): user-specific layout hard-coded on the login name; the data
    # root is the parent of the configured input directory (plus 'INSEE' for
    # that user).
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(
            os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)

    for year in years:
        data_directory_path = os.path.join(
            input_data_directory, 'budget_des_familles/{}'.format(year))
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, year)

        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=bdf_survey_collection,
            stata_files=data_file_by_format['stata'],
        )

        collections_directory = bdf_survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory,
                                            collection_name + ".json")
        bdf_survey_collection.dump(json_file_path=collection_json_path)
        # Only (re)fill the surveys belonging to the year being processed.
        surveys = [
            survey for survey in bdf_survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        bdf_survey_collection.fill_hdf(source_format='stata',
                                       surveys=surveys,
                                       overwrite=overwrite)
    return bdf_survey_collection
Ejemplo n.º 4
0
def test_survey():
    """Smoke test: build a survey from the packaged SAS file and fill its HDF store.

    Uses the ``tests/data_files`` directory of the installed
    ``openfisca-survey-manager`` distribution for both the config files and
    the data, then prints the tables of the resulting survey.
    """
    name = 'fake'
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
    )

    survey_collection = SurveyCollection(name=name,
                                         config_files_directory=data_dir,
                                         json_file_path=os.path.join(
                                             data_dir, 'fake.json'))

    saved_fake_survey_hdf5_file_path = os.path.join(data_dir, 'fake.hdf5')
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    survey = Survey(
        hdf5_file_path=saved_fake_survey_hdf5_file_path,
        name='fake_survey',
        sas_files=[saved_fake_survey_file_path],
        survey_collection=survey_collection,
    )
    survey.insert_table(name='help')
    survey.fill_hdf(source_format='sas')
    # Parenthesised form: same output on Python 2, valid syntax on Python 3.
    print(survey.tables)
Ejemplo n.º 5
0
def build(year=None):
    """Run the ``erfs_fpr`` input-data build steps for ``year`` and store the result.

    The steps are run in order: merged dataframes, merge of the
    ``imputation_loyer`` Stata file, individual variables, ``famille`` and the
    final input data frame. The frame is then read back from the ``erfs_fpr``
    temporary store and saved in the ``openfisca_erfs_fpr`` collection.

    :param year: year to build (required).
    """
    assert year is not None

    preprocessing.build_merged_dataframes(year=year)

    # NOTE: the imputation_loyer.imputation_loyer(year) step is disabled; only
    # the merge of the Stata file found in the configured ``stata_directory``
    # is performed.
    stata_directory = SurveyCollection(name='openfisca').config.get(
        'data', 'stata_directory')
    imputation_loyer.merge_imputation_loyer(
        stata_file=os.path.join(stata_directory, 'log_men_ERFS.dta'),
        year=year,
    )

    variables_individuelles.build_variables_individuelles(year=year)
    famille.build_famille(year=year)
    final.create_input_data_frame(year=year)

    # Fetch the frame produced above and persist it in its collection.
    input_data_frame = get_store(file_name='erfs_fpr')['input_{}'.format(year)]
    store_input_data_frame(
        data_frame=input_data_frame,
        collection="openfisca_erfs_fpr",
        survey="openfisca_erfs_fpr_data_{}".format(year),
    )
def run_all(year = None, filename = "test", check = False):
    """Run every input-data build step for ``year`` and save the final frame.

    The frame returned by ``final.final`` is inserted as table ``input`` of a
    survey named ``openfisca_data_<year>``, which is appended to the
    ``openfisca`` collection; the collection is then dumped to
    ``<collections_directory>/openfisca.json``.

    :param year: year to build (required).
    :param filename: not used by this function.
    :param check: forwarded to ``final.final``.
    """
    assert year is not None

    pre_processing.create_indivim_menage_en_mois(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    # NOTE: the imputation_loyer.imputation_loyer(year) step is disabled.
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    input_data_frame = final.final(year = year, check = check)

    # Register the frame as the "input" table of this year's survey.
    collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    survey = Survey(
        name = survey_name,
        # The HDF5 file lives next to (not inside) the output directory.
        hdf5_file_path = os.path.join(os.path.dirname(output_directory), "{}.h5".format(survey_name)),
        )
    survey.insert_table(name = "input", data_frame = input_data_frame)
    collection.surveys.append(survey)

    # Persist the collection metadata.
    collections_directory = collection.config.get('collections', 'collections_directory')
    collection.dump(json_file_path = os.path.join(collections_directory, 'openfisca.json'))
Ejemplo n.º 7
0
def test_add_survey_to_collection():
    """Check that a survey added to a collection is listed in its JSON form.

    Skipped (returns immediately) when ``is_travis`` or ``is_circleci`` is set.
    """
    if is_travis or is_circleci:
        return

    name = 'fake'
    survey_name = 'fake_survey'
    # Test data shipped with the installed openfisca-survey-manager package.
    package_location = pkg_resources.get_distribution(
        'openfisca-survey-manager').location
    data_dir = os.path.join(
        package_location, 'openfisca_survey_manager', 'tests', 'data_files')
    survey_collection = SurveyCollection(name=name)
    sas_file_path = os.path.join(data_dir, 'help.sas7bdat')

    add_survey_to_collection(
        survey_name=survey_name,
        survey_collection=survey_collection,
        sas_files=[sas_file_path],
        stata_files=[],
    )

    listed_survey_names = list(survey_collection.to_json()['surveys'].keys())
    assert survey_name in listed_survey_names
def test_add_survey_to_collection():
    """Check that adding one survey to a fresh collection lists exactly that survey.

    The SAS file comes from the ``tests/data_files`` directory of the
    installed ``openfisca-survey-manager`` distribution.
    """
    name = 'fake'
    survey_name = 'fake_survey'
    survey_collection = SurveyCollection(name = name)

    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    add_survey_to_collection(survey_name = survey_name,
                             survey_collection = survey_collection,
                             sas_files = [saved_fake_survey_file_path],
                             stata_files = [])
    ordered_dict = survey_collection.to_json()
    # On Python 3 ``keys()`` returns a view that never compares equal to a
    # list, so materialise it before comparing.
    assert list(ordered_dict['surveys'].keys()) == [survey_name]
Ejemplo n.º 9
0
def build(year=None, check=False):
    """Run every input-data build step for ``year`` and save the resulting frame.

    After the steps have run, the frame stored under ``input_<year>`` in the
    ``erfs`` temporary store is inserted as table ``input`` of a survey named
    ``openfisca_data_<year>``. That survey is appended to the ``openfisca``
    collection, which is dumped to ``<collections_directory>/openfisca.json``.

    :param year: year to build (required).
    :param check: forwarded to ``final.final``.
    """
    assert year is not None

    pre_processing.create_indivim_menagem(year=year)
    pre_processing.create_enfants_a_naitre(year=year)
    # NOTE: the imputation_loyer.imputation_loyer(year) step is disabled.
    fip.create_fip(year=year)
    famille.famille(year=year)
    foyer.sif(year=year)
    foyer.foyer_all(year=year)
    rebuild.create_totals_first_pass(year=year)
    rebuild.create_totals_second_pass(year=year)
    rebuild.create_final(year=year)
    invalides.invalide(year=year)
    final.final(year=year, check=check)

    # The final step leaves its result in the temporary store.
    input_data_frame = get_store(file_name='erfs')['input_{}'.format(year)]

    # Register the frame as the "input" table of this year's survey.
    collection = SurveyCollection(
        name="openfisca", config_files_directory=config_files_directory)
    output_directory = collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    survey = Survey(
        name=survey_name,
        # The HDF5 file lives next to (not inside) the output directory.
        hdf5_file_path=os.path.join(
            os.path.dirname(output_directory), "{}.h5".format(survey_name)),
    )
    survey.insert_table(name="input", data_frame=input_data_frame)
    collection.surveys.append(survey)

    # Persist the collection metadata.
    collections_directory = collection.config.get(
        'collections', 'collections_directory')
    collection.dump(
        json_file_path=os.path.join(collections_directory, 'openfisca.json'))
Ejemplo n.º 10
0
def store_input_data_frame(data_frame=None,
                           collection=None,
                           survey=None,
                           table=None):
    """Save ``data_frame`` as a table of ``survey`` in the collection ``collection``.

    The collection is loaded, or created when loading fails. The survey is
    reused when the collection already lists one with that name, otherwise it
    is created with an HDF5 file named ``<survey>.h5`` in the parent of the
    configured output directory. The collection is finally dumped to
    ``<collections_directory>/<collection>.json``.

    :param data_frame: data frame to store (required).
    :param collection: name of the survey collection (required).
    :param survey: name of the survey (required).
    :param table: table name; defaults to ``"input"``.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection=collection)
    except Exception as error:
        # Best effort: fall back on a new collection, but keep a trace of why
        # the stored one could not be loaded.
        log.debug("Could not load collection {}: {}".format(collection, error))
        openfisca_survey_collection = SurveyCollection(name=collection)

    log.debug("In collection {} the following survey are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    if table is None:
        table = "input"
    #
    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                  "{}.h5".format(survey_name))
    available_survey_names = [
        survey_.name for survey_ in openfisca_survey_collection.surveys
    ]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
    survey.insert_table(name=table, data_frame=data_frame)
    # Drop any survey of the same name before appending, so that an already
    # registered survey is not listed twice in the collection.
    openfisca_survey_collection.surveys[:] = [
        kept_survey for kept_survey in openfisca_survey_collection.surveys
        if kept_survey.name != survey_name
    ]
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory,
                                  '{}.json'.format(collection))
    log.debug("In collection {} the following surveyx are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path=json_file_path)
        'deci',
        'hnph2',
        'iaat_bis',
        'lmlm',
        'magtr',
        'mcs8',
        'mdiplo',
        'mtybd',
        'qex',
        'statut_occupation',
        'tu99_recoded',
        # 'ident',
    ]

    logement = Logement[kept_variables].copy()
    # logement.rename(columns = {'qex': 'wprm'}, inplace = True)
    return logement


# Script entry point: runs the preprocessing step and the merge of the
# imputation_loyer Stata file for a single, hard-coded year.
if __name__ == '__main__':
    import sys
    # Send INFO-level (and above) log records to standard output.
    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
    year = 2012
    from openfisca_france_data.erfs_fpr.input_data_builder import step_01_preprocessing
    step_01_preprocessing.build_merged_dataframes(year=year)
    # The Stata file is looked up in the output directory configured for the
    # 'openfisca' collection.
    openfisca_survey_collection = SurveyCollection(name='openfisca')
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    stata_file = os.path.join(output_data_directory, 'log_men_ERFS.dta')
    # The result is bound to ``menages`` but not used further in this block.
    menages = merge_imputation_loyer(stata_file=stata_file, year=year)
        data_frame.reset_index(inplace = True)
    except ValueError, e:
        log.info('ignoring reset_index because {}'.format(e))

    # Remove duplicated colums causing bug with HDFStore
    # according to https://github.com/pydata/pandas/issues/6240
    # using solution form stackoverflow
    # http://stackoverflow.com/questions/16938441/how-to-remove-duplicate-columns-from-a-dataframe-using-python-pandas
    data_frame = data_frame.T.groupby(level = 0).first().T

    log.info('Saving the openfisca indirect taxation input dataframe')
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)
    except:
        openfisca_survey_collection = SurveyCollection(
            name = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)

    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage)
    table = "input"
    hdf5_file_path = os.path.join(output_data_directory, "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    openfisca_survey_collection.dump()


def run(years_calage):
def build_survey_collection(collection_name=None,
                            replace_metadata=False,
                            replace_data=False,
                            data_directory_path_by_survey_suffix=None,
                            source_format='sas'):
    """Build or update ``collection_name`` from a directory per survey suffix.

    For every ``(suffix, directory)`` pair a survey called
    ``"<collection_name>_<suffix>"`` is added to the collection with the SAS
    and Stata files found in the directory, the collection is dumped to
    ``<collections_directory>/<collection_name>.json`` and the HDF store of the
    surveys whose name ends with the suffix is filled.

    :param collection_name: name of the collection (required).
    :param replace_metadata: when true, start from a fresh ``SurveyCollection``
        instead of loading the existing one.
    :param replace_data: forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_survey_suffix: mapping from survey suffix to
        an existing data directory (required).
    :param source_format: preferred source format; it is used when data files
        of that format are present, otherwise the first format that has files
        is used.
    :returns: the updated ``SurveyCollection``.
    :raises AssertionError: if a data directory or the collections directory
        does not exist.
    :raises ValueError: if a data directory holds no data file of any format.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None, \
        "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name=collection_name,
            config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=collection_name,
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # No usable stored collection: start a new one.
            survey_collection = SurveyCollection(
                name=collection_name,
                config_files_directory=config_files_directory)

    # ``items()`` works on both Python 2 and Python 3 (``iteritems`` does not).
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(
            data_directory_path), '{} is not a valid directory path'.format(
                data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format.get('sas'),
            stata_files=data_file_by_format.get('stata'),
        )

        # Formats for which at least one data file was found.
        valid_source_formats = [
            _format for _format, data_files in data_file_by_format.items()
            if data_files
        ]
        log.info("Valid source formats are: {}".format(valid_source_formats))
        if not valid_source_formats:
            raise ValueError(
                "No data file of a known format found in {}".format(
                    data_directory_path))
        # Honour the caller's ``source_format`` when it is available, without
        # overwriting the argument for the following surveys.
        if source_format in valid_source_formats:
            survey_source_format = source_format
        else:
            survey_source_format = valid_source_formats[0]
        log.info("Using the following format: {}".format(survey_source_format))
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        assert os.path.isdir(
            collections_directory
        ), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(
            collections_directory)
        collection_json_path = os.path.join(collections_directory,
                                            "{}.json".format(collection_name))
        survey_collection.dump(json_file_path=collection_json_path)
        # Only (re)fill the surveys belonging to the suffix being processed.
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(survey_suffix))
        ]
        survey_collection.fill_hdf(source_format=survey_source_format,
                                   surveys=surveys,
                                   overwrite=replace_data)
    return survey_collection