def store_input_data_frame(data_frame = None, collection = None, survey = None, table = None):
    """Store ``data_frame`` as a table of survey ``survey`` in collection ``collection``.

    Loads the collection when it already exists, otherwise creates a fresh one,
    inserts the data frame under ``table`` (default ``"input"``), and dumps the
    updated collection JSON to its collections directory.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(collection = collection)
    except Exception:
        # Collection missing or unreadable: start a brand new one.
        # (The bound exception variable was unused in the original.)
        openfisca_survey_collection = SurveyCollection(name = collection)

    log.debug("In collection {} the following surveys are present: {}".format(collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    if table is None:
        table = "input"
    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    available_survey_names = [survey_.name for survey_ in openfisca_survey_collection.surveys]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
    survey.insert_table(name = table, data_frame = data_frame)
    # NOTE(review): an already-registered survey gets appended a second time
    # here — confirm whether duplicates in `surveys` are intended.
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, '{}.json'.format(collection))
    log.debug("In collection {} the following surveys are present: {}".format(collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def test_survey():
    """Build a fake survey from a bundled SAS file and fill its HDF5 store."""
    name = 'fake'
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )

    survey_collection = SurveyCollection(
        name = name,
        config_files_directory = data_dir,
        json_file_path = os.path.join(data_dir, 'fake.json')
        )

    saved_fake_survey_hdf5_file_path = os.path.join(data_dir, 'fake.hdf5')
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    survey = Survey(
        hdf5_file_path = saved_fake_survey_hdf5_file_path,
        name = 'fake_survey',
        sas_files = [saved_fake_survey_file_path],
        survey_collection = survey_collection,
        )
    survey.insert_table(name = 'help')
    survey.fill_hdf(source_format = 'sas')
    # Portability fix: `print survey.tables` is Python-2-only syntax; the call
    # form behaves identically for a single argument on Python 2 and 3.
    print(survey.tables)
def add_survey_to_collection(survey_name=None,
                             survey_collection=None,
                             sas_files=None,
                             stata_files=None):
    """Register (or replace) the survey named ``survey_name`` in ``survey_collection``.

    BUG FIX: ``sas_files`` / ``stata_files`` previously used mutable list
    defaults, which are shared across calls in Python. ``None`` sentinels keep
    the call-site interface unchanged while avoiding that hazard.
    """
    assert survey_collection is not None
    sas_files = [] if sas_files is None else sas_files
    stata_files = [] if stata_files is None else stata_files
    overwrite = True
    label = survey_name

    # NOTE(review): `overwrite` is never set to False in this variant, so the
    # lookup below is discarded and the `else` branch is effectively dead code;
    # kept as-is to mirror the interactive variant of this helper.
    for test_survey in survey_collection.surveys:
        if test_survey.name == survey_name:
            survey = survey_collection.get_survey(survey_name)
    if overwrite:
        survey = Survey(
            name=survey_name,
            label=label,
            sas_files=sas_files,
            stata_files=stata_files,
            survey_collection=survey_collection,
        )
    else:
        survey = survey_collection.get(survey_name)
        survey.label = label
        survey.informations.update({
            "sas_files": sas_files,
            "stata_files": stata_files,
        })
    # Drop any previous survey of the same name before appending the new one.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys
        if kept_survey.name != survey_name
    ]
    survey_collection.surveys.append(survey)
def add_survey_to_collection(survey_name = None, survey_collection = None, sas_files = None, stata_files = None):
    """Add a survey named ``survey_name`` to ``survey_collection``, replacing any previous one.

    BUG FIX: the original declared ``sas_files = []`` and ``stata_files = []``
    as defaults; mutable defaults are evaluated once and shared between calls.
    """
    assert survey_collection is not None
    sas_files = [] if sas_files is None else sas_files
    stata_files = [] if stata_files is None else stata_files
    overwrite = True
    label = survey_name

    # NOTE(review): `overwrite` is always True here, so this lookup's result is
    # overwritten below and the `else` branch never runs; preserved unchanged.
    for test_survey in survey_collection.surveys:
        if test_survey.name == survey_name:
            survey = survey_collection.get_survey(survey_name)
    if overwrite:
        survey = Survey(
            name = survey_name,
            label = label,
            sas_files = sas_files,
            stata_files = stata_files,
            survey_collection = survey_collection,
            )
    else:
        survey = survey_collection.get(survey_name)
        survey.label = label
        survey.informations.update({
            "sas_files": sas_files,
            "stata_files": stata_files,
            })
    # Remove any same-named survey, then append the fresh one.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    def load(cls, json_file_path = None, collection = None, config_files_directory = default_config_files_directory):
        """Build a collection instance from a JSON dump on disk.

        Either ``json_file_path`` points directly at the dump, or ``collection``
        is used to look the path up in the configuration found under
        ``config_files_directory``.
        """
        assert os.path.exists(config_files_directory)
        # BUG FIX: `config` was assigned only inside the `if` branch but read
        # unconditionally below, raising NameError whenever `json_file_path`
        # was provided by the caller.
        config = None
        if json_file_path is None:
            assert collection is not None
            config = Config(config_files_directory = config_files_directory)
            try:
                json_file_path = config.get("collections", collection)
            except Exception as error:
                log.debug("Looking for config file in {}".format(config_files_directory))
                log.error(error)
                raise

        # Read and parse the dump once (the original opened the file twice).
        with open(json_file_path, 'r') as _file:
            self_json = json.load(_file)

        self = cls(name = self_json.get('name'))
        self.config = config
        self.json_file_path = json_file_path
        self.label = self_json.get('label')
        self.name = self_json.get('name')

        surveys = self_json.get('surveys')
        for survey_name, survey_json in surveys.items():
            survey = Survey(name = survey_name)
            self.surveys.append(survey.create_from_json(survey_json))
        return self
def run_all(year = None, filename = "test", check = False):
    """Run the full ERFS data-building pipeline for ``year`` and store the result.

    Chains the pre-processing, matching and consolidation steps in order, then
    saves the resulting data frame as the "input" table of survey
    "openfisca_data_<year>" in the "openfisca" collection.

    NOTE(review): ``filename`` is accepted but never used in this body —
    confirm whether it is vestigial.
    """

    assert year is not None
    # Sequential build steps; each one reads/writes intermediate stores by year.
    pre_processing.create_indivim_menage_en_mois(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    # imputation_loyer.imputation_loyer(year = year)
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    data_frame = final.final(year = year, check = check)

    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    table = "input"
    # HDF5 file lives next to (one level above) the configured output directory.
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path = json_file_path)
    def load(cls,
             json_file_path=None,
             collection=None,
             config_files_directory=default_config_files_directory):
        """Return a collection instance rebuilt from its JSON dump.

        When ``json_file_path`` is omitted, it is resolved from ``collection``
        via the configuration located in ``config_files_directory``.
        """
        assert os.path.exists(config_files_directory)
        # BUG FIX: `config` used to be defined only when `json_file_path` was
        # None, so the `self.config = config` line below raised NameError for
        # callers that passed an explicit path.
        config = None
        if json_file_path is None:
            assert collection is not None
            config = Config(config_files_directory=config_files_directory)
            try:
                json_file_path = config.get("collections", collection)
            except Exception as error:
                log.debug("Looking for config file in {}".format(
                    config_files_directory))
                log.error(error)
                raise

        # Single read of the dump (the original parsed the same file twice).
        with open(json_file_path, 'r') as _file:
            self_json = json.load(_file)

        self = cls(name=self_json.get('name'))
        self.config = config
        self.json_file_path = json_file_path
        self.label = self_json.get('label')
        self.name = self_json.get('name')

        surveys = self_json.get('surveys')
        for survey_name, survey_json in surveys.items():
            survey = Survey(name=survey_name)
            self.surveys.append(survey.create_from_json(survey_json))
        return self
# Example #8 (score: 0)
def build_empty_logement_survey_collection(years=None):
    """Create a "logement" SurveyCollection with one empty survey per year.

    Each survey only records where the R / SAS source files of the housing
    survey tables live; no data is loaded.
    """
    if years is None:
        log.error("A list of years to process is needed")

    logement_survey_collection = SurveyCollection(name="logement")
    logement_survey_collection.set_config_files_directory(
        config_files_directory)
    input_data_directory = logement_survey_collection.config.get(
        'data', 'input_directory')
    output_data_directory = logement_survey_collection.config.get(
        'data', 'output_directory')

    for year in years:
        surveys = logement_survey_collection.surveys

        survey_name = 'logement_{}'.format(year)
        hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                      "{}{}".format(survey_name, ".h5"))
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        surveys[survey_name] = survey

        yr = str(year)[2:]

        # BUG FIX: the 2003 branch assigned `lgt_logt` while the table mapping
        # below read `lgt_lgt`, raising NameError for 2003 (and for any year
        # outside the handled ranges). One pre-initialised variable is now
        # shared by both branches. Unused locals (`yr1`, `renameidlgt`) were
        # dropped.
        lgt_lgt = None
        if yr == "03":
            lgt_men = "menage"
        elif yr in ["06", "07", "08", "09"]:  # TODO: clean this
            lgt_men = "menage1"
            lgt_lgt = "logement"

        logement_tables = {
            "adresse": "adresse",
            "lgt_menage": lgt_men,
            "lgt_logt": lgt_lgt,
        }

        RData_directory = os.path.join(os.path.dirname(input_data_directory),
                                       'R', 'logement', str(year))
        SasData_directory = os.path.join(os.path.dirname(input_data_directory),
                                         'enqlog2006/enq_06')

        # `.items()` replaces the Python-2-only `.iteritems()`.
        for name, Rdata_table in logement_tables.items():
            Rdata_file = os.path.join(RData_directory,
                                      "{}.Rdata".format(Rdata_table))
            sas_file = os.path.join(SasData_directory,
                                    "{}.sas7bdat".format(Rdata_table))
            survey.insert_table(name=name,
                                year=year,
                                Rdata_file=Rdata_file,
                                Rdata_table=Rdata_table,
                                sas_file=sas_file)

    return logement_survey_collection
def build_empty_logement_survey_collection(years = None):
    """Build the "logement" SurveyCollection, registering source file locations per year.

    No table data is loaded; only R/SAS file paths are recorded.
    """
    if years is None:
        log.error("A list of years to process is needed")

    logement_survey_collection = SurveyCollection(name = "logement")
    logement_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = logement_survey_collection.config.get('data', 'input_directory')
    output_data_directory = logement_survey_collection.config.get('data', 'output_directory')

    for year in years:
        surveys = logement_survey_collection.surveys

        survey_name = 'logement_{}'.format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            "{}{}".format(survey_name, ".h5")
            )
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        surveys[survey_name] = survey

        yr = str(year)[2:]

        # BUG FIX: the original set `lgt_logt` for 2003 but read `lgt_lgt`
        # when building `logement_tables` (NameError for 2003 and for years
        # outside both branches). A single variable, pre-initialised to None,
        # now serves both branches; unused `yr1` / `renameidlgt` were removed.
        lgt_lgt = None
        if yr == "03":
            lgt_men = "menage"
        elif yr in ["06", "07", "08", "09"]:  # TODO: clean this
            lgt_men = "menage1"
            lgt_lgt = "logement"

        logement_tables = {
            "adresse": "adresse",
            "lgt_menage": lgt_men,
            "lgt_logt": lgt_lgt,
            }

        RData_directory = os.path.join(os.path.dirname(input_data_directory), 'R', 'logement', str(year))
        SasData_directory = os.path.join(os.path.dirname(input_data_directory), 'enqlog2006/enq_06')

        # `.items()` replaces the Python-2-only `.iteritems()`.
        for name, Rdata_table in logement_tables.items():
            Rdata_file = os.path.join(RData_directory, "{}.Rdata".format(Rdata_table))
            sas_file = os.path.join(SasData_directory, "{}.sas7bdat".format(Rdata_table))
            survey.insert_table(name = name,
                                year = year,
                                Rdata_file = Rdata_file,
                                Rdata_table = Rdata_table,
                                sas_file = sas_file
                                )

    return logement_survey_collection
def run_all(year_calage = 2007, year_data_list = [1995, 2000, 2005, 2011]):
    """Build the calibrated indirect-taxation input data frame and store it.

    Picks the nearest source-data year at or below ``year_calage``, runs the
    four homogenisation steps, concatenates their outputs and saves the result
    as the "input" table of survey
    "openfisca_indirect_taxation_data_<year_calage>".

    NOTE(review): ``year_data_list`` is a mutable default argument; it is only
    read here, but confirm no caller mutates it.
    """

    # Quelle base de données choisir pour le calage ?
    # (Which data base to use for the calibration?)
    year_data = find_nearest_inferior(year_data_list, year_calage)

    # Four parallel homogenisation steps on the source data:
    # consumption expenses first.
    build_depenses_homogenisees(year = year_data)
    build_imputation_loyers_proprietaires(year = year_data)

    build_depenses_calees(year_calage, year_data)
    build_menage_consumption_by_categorie_fiscale(year_calage, year_data)
    categorie_fiscale_data_frame = temporary_store["menage_consumption_by_categorie_fiscale_{}".format(year_calage)]
    depenses_calees_by_grosposte = temporary_store["depenses_calees_by_grosposte_{}".format(year_calage)]

    # Vehicles.
    build_homogeneisation_vehicules(year = year_data)
    vehicule = temporary_store['automobile_{}'.format(year_data)]

    # Socio-demographic variables.
    build_homogeneisation_caracteristiques_sociales(year = year_data)
    menage = temporary_store['donnes_socio_demog_{}'.format(year_data)]

    # Income variables.
    build_homogeneisation_revenus_menages(year = year_data)
    revenus = temporary_store["revenus_{}".format(year_data)]

    # Data frame resulting from the four steps above, joined column-wise.
    data_frame = pandas.concat(
        [revenus, vehicule, categorie_fiscale_data_frame, menage, depenses_calees_by_grosposte], axis = 1)

    data_frame.index.name = "ident_men"
    data_frame.reset_index(inplace = True)
    # Remove duplicated colums causing bug with HDFStore
    # according to https://github.com/pydata/pandas/issues/6240
    # using solution form stackoverflow
    # http://stackoverflow.com/questions/16938441/how-to-remove-duplicate-columns-from-a-dataframe-using-python-pandas
    data_frame = data_frame.T.groupby(level = 0).first().T

    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection.load(
        collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)

    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage)
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    openfisca_survey_collection.dump()
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name, survey_label = None,
        table_label = None, table_name = None):
    """Insert ``input_dataframe`` as a table of ``survey_name`` inside ``collection``.

    The table name defaults to ``"<entity>_<period>"`` and labels are generated
    when not provided. The collection is loaded (or created when its config
    entry is missing), the survey's HDF5 path is ensured, and the updated
    collection JSON is dumped back to disk.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )

    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # Survey not present yet: create an empty one attached to the collection.
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )

    if survey.hdf5_file_path is None:
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # FIX: `Logger.warn` is a deprecated alias of `Logger.warning`.
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')

    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any previous survey bearing the same name before appending.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
# Example #12 (score: 0)
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name, survey_label = None,
        table_label = None, table_name = None):
    """Store ``input_dataframe`` as a table of survey ``survey_name`` in ``collection``.

    Generates default table name/labels from ``entity`` and ``period``, makes
    sure the survey exists and has an HDF5 file path, then dumps the updated
    collection JSON.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )

    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # The survey does not exist yet in this collection.
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )

    if survey.hdf5_file_path is None:
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # FIX: use `Logger.warning`; `Logger.warn` is deprecated.
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')

    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Drop any previous survey with this name, then append the updated one.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def test_survey_dump_load():
    """Dumping a survey and loading it back must round-trip its JSON form."""
    collection = build_empty_erfs_survey_collection(years=[2006])
    original = collection.surveys['erfs_2006']
    dump_path = os.path.join(current_dir, 'saved_fake_survey')
    original.dump(dump_path)
    reloaded = Survey.load(dump_path)
    assert original.to_json() == reloaded.to_json()
def test_survey_dump_load():
    """Check that Survey.dump / Survey.load preserve the JSON representation."""
    surveys = build_empty_erfs_survey_collection(years=[2006]).surveys
    source_survey = surveys['erfs_2006']
    target_path = os.path.join(current_dir, 'saved_fake_survey')
    source_survey.dump(target_path)
    round_tripped = Survey.load(target_path)
    assert source_survey.to_json() == round_tripped.to_json()
def add_survey_to_collection(survey_name = None, survey_collection = None, sas_files = None, stata_files = None,
        question = False):
    """Interactively register (or replace) a survey in ``survey_collection``.

    When ``question`` is True, the user is prompted via click for a label and
    for confirmation before overwriting an already-registered survey.

    BUG FIX: ``sas_files`` / ``stata_files`` previously used shared mutable
    list defaults; ``None`` sentinels keep the interface while avoiding the
    hazard. The over-indented ``if same_survey`` body was also normalised.
    """
    assert survey_collection is not None
    sas_files = [] if sas_files is None else sas_files
    stata_files = [] if stata_files is None else stata_files
    overwrite = True

    if question:
        label = click.prompt('Enter a description for the survey {}'.format(survey_name), default = survey_name)
    else:
        label = survey_name

    for test_survey in survey_collection.surveys:
        if test_survey.name == survey_name:
            if question:
                click.echo('The following information is available for survey {}'.format(survey_name))
            survey = survey_collection.get_survey(survey_name)
            if question:
                click.echo(survey)
                overwrite = click.confirm(
                    'Overwrite previous survey {} informations ?'.format(survey_name), default = True)
            else:
                overwrite = True
    if question:
        same_survey = click.confirm('Are all the files part of the same survey ?', default = True)
    else:
        same_survey = True
    if same_survey:
        if overwrite:
            survey = Survey(
                name = survey_name,
                label = label,
                sas_files = sas_files,
                stata_files = stata_files,
                survey_collection = survey_collection,
                )
        else:
            # Keep the existing survey but refresh its label and file lists.
            survey = survey_collection.get(survey_name)
            survey.label = label
            survey.informations.update({
                "sas_files": sas_files,
                "stata_files": stata_files,
                })
        # Remove any previous survey with the same name before appending.
        survey_collection.surveys = [
            kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
            ]
        survey_collection.surveys.append(survey)
# Example #16 (score: 0)
def store_input_data_frame(data_frame = None, collection = None, survey = None):
    """Persist ``data_frame`` as the "input" table of ``survey`` within ``collection``."""
    assert data_frame is not None
    assert collection is not None
    assert survey is not None

    target_collection = SurveyCollection(name = collection, config_files_directory = config_files_directory)
    output_data_directory = target_collection.config.get('data', 'output_directory')
    survey_name = survey
    hdf5_file_path = os.path.join(
        os.path.dirname(output_data_directory),
        "{}.h5".format(survey_name),
        )
    new_survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
    new_survey.insert_table(name = "input", data_frame = data_frame)
    target_collection.surveys.append(new_survey)
    collections_directory = target_collection.config.get('collections', 'collections_directory')
    target_collection.dump(json_file_path = os.path.join(collections_directory, 'openfisca_erfs_fpr.json'))
def build_empty_recensement_survey_collection(years = None):
    """Create a "recensement" SurveyCollection with one empty survey per year.

    Only the SPSS source file locations of each census table are registered;
    no data is loaded.
    """
    if years is None:
        log.error("A list of years to process is needed")

    recensement_survey_collection = SurveyCollection(name = "recensement")
    recensement_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = recensement_survey_collection.config.get('data', 'input_directory')
    output_data_directory = recensement_survey_collection.config.get('data', 'output_directory')

    # BUG FIX: the original nested a second `for year in years` loop inside
    # this one (shadowing `year`), rebuilding every survey len(years) times
    # with table names from the outer year but file paths from the inner year.
    for year in years:
        tables = [
            # "LOGMENT_ECH_{}".format(year),  # TODO: produces a strange error
            "individu_ech_{}".format(year),
            "MENAG_ECH_{}".format(year),
            ]
        survey_tables = dict()
        for table in tables:
            survey_tables[table] = {
                "spss_file": os.path.join(
                    os.path.dirname(input_data_directory),
                    "recensement",
                    str(year),
                    "{}.sav".format(table),
                    ),
                "year": year,
                }
        survey_name = u"recensement_{}".format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            u"{}{}".format(survey_name, u".h5")
            )
        # `print(...)` works identically on Python 2 and 3 for one argument;
        # the Python-2-only statement form was replaced.
        print(hdf5_file_path)
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        # `.items()` replaces the Python-2-only `.iteritems()`.
        for table, table_kwargs in survey_tables.items():
            survey.insert_table(name = table, **table_kwargs)
        surveys = recensement_survey_collection.surveys
        surveys[survey_name] = survey

    return recensement_survey_collection
def build_empty_budget_consommation_survey_collection(years = None):
    """Create a "budget_consommation" SurveyCollection with one empty survey per year.

    Registers the Stata source file locations of tables budg01..budg20
    (excluding budg03); no data is loaded.
    """
    if years is None:
        log.error("A list of years to process is needed")

    budget_consommation_survey_collection = SurveyCollection(name = "budget_consommation")
    budget_consommation_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = budget_consommation_survey_collection.config.get('data', 'input_directory')
    output_data_directory = budget_consommation_survey_collection.config.get('data', 'output_directory')

    # BUG FIX: a second nested `for year in years` loop (shadowing `year`)
    # used to rebuild every survey len(years) times; one pass per year is
    # sufficient and matches the apparent intent.
    for year in years:
        tables = ["budg0{}".format(i) for i in range(1, 10)] + ["budg{}".format(i) for i in range(10, 21)]
        tables.remove("budg03")
        survey_tables = dict()
        for table in tables:
            survey_tables[table] = {
                "stata_file": os.path.join(
                    os.path.dirname(input_data_directory),
                    "budget_consommation",
                    str(year),
                    "stata",
                    "{}.dta".format(table),
                    ),
                "year": year,
                }

        survey_name = u"budget_consommation_{}".format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            u"{}{}".format(survey_name, u".h5")
            )
        # Call form of print works on Python 2 and 3 for a single argument.
        print(hdf5_file_path)
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        # `.items()` replaces the Python-2-only `.iteritems()`.
        for table, table_kwargs in survey_tables.items():
            survey.insert_table(name = table, **table_kwargs)
        surveys = budget_consommation_survey_collection.surveys
        surveys[survey_name] = survey

    return budget_consommation_survey_collection
# Example #19 (score: 0)
def build(year=None, check=False):
    """Run the ERFS data-building pipeline for ``year`` and store the result.

    Executes the pre-processing, matching and consolidation steps in order,
    fetches the produced "input_<year>" frame from the temporary store, and
    saves it as the "input" table of survey "openfisca_data_<year>" in the
    "openfisca" collection.
    """

    assert year is not None
    # Sequential build steps; each step works off shared per-year stores.
    pre_processing.create_indivim_menagem(year=year)
    pre_processing.create_enfants_a_naitre(year=year)
    #    try:
    #        imputation_loyer.imputation_loyer(year = year)
    #    except Exception, e:
    #        log.info('Do not impute loyer because of the following error: \n {}'.format(e))
    #        pass
    fip.create_fip(year=year)
    famille.famille(year=year)
    foyer.sif(year=year)
    foyer.foyer_all(year=year)
    rebuild.create_totals_first_pass(year=year)
    rebuild.create_totals_second_pass(year=year)
    rebuild.create_final(year=year)
    invalides.invalide(year=year)
    final.final(year=year, check=check)

    temporary_store = get_store(file_name='erfs')
    data_frame = temporary_store['input_{}'.format(year)]
    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection(
        name="openfisca", config_files_directory=config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    table = "input"
    # The HDF5 file is placed one directory above the configured output dir.
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                  "{}.h5".format(survey_name))
    survey = Survey(
        name=survey_name,
        hdf5_file_path=hdf5_file_path,
    )
    survey.insert_table(name=table, data_frame=data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def build_empty_piketty_survey_collection(years=None):
    """Create a "piketty" SurveyCollection describing the source Stata files.

    NOTE(review): only the year 2006 is actually processed below — ``years``
    is merely checked for presence. Confirm whether other years should be
    supported before relying on this parameter.
    """
    if years is None:
        log.error("A list of years to process is needed")

    survey_collection = SurveyCollection(name="piketty")
    survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = survey_collection.config.get(
        'data', 'input_directory')
    output_data_directory = survey_collection.config.get(
        'data', 'output_directory')

    tables = [
        "indiv_conj", "indiv_ded", "indiv_demo", "indiv_logt", "indiv_rev"
    ]
    piketty_tables = dict()
    for year in [2006]:
        for table in tables:
            piketty_tables[table] = {
                "stata_file":
                os.path.join(
                    os.path.dirname(input_data_directory),
                    "revolution_fiscale",
                    "Fichiers",
                    "original",
                    "{}_{}.dta".format(table, year),
                ),
                "year":
                year,
            }

        survey_name = u"piketty_{}".format(year)
        hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                      u"{}{}".format(survey_name, u".h5"))
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        # `.items()` replaces the Python-2-only `.iteritems()`.
        for table, table_kwargs in piketty_tables.items():
            survey.insert_table(name=table, **table_kwargs)
        surveys = survey_collection.surveys
        surveys[survey_name] = survey
    return survey_collection
# Example #21 (score: 0)
def dump_data_frame(data_frame, year):
    """Sanity-check ``data_frame`` and store it as the 'input' table of the eipp collection."""
    from openfisca_france_data.build_openfisca_survey_data import utils

    utils.print_id(data_frame)
    utils.check_structure(data_frame)

    collection = SurveyCollection(name = "eipp")
    collection.set_config_files_directory()
    output_directory = collection.config.get('data', 'output_directory')

    survey_name = "eipp_data_{}".format(year)
    hdf5_path = os.path.join(
        os.path.dirname(output_directory),
        "{}{}".format(survey_name, ".h5"),
        )
    survey = Survey(name = survey_name, hdf5_file_path = hdf5_path)
    survey.insert_table(name = "input")
    survey.fill_hdf("input", data_frame)
    collection.surveys[survey_name] = survey
    collection.dump(collection = "eipp")
# Example #22
# 0
def test_survey():
    """Build the 'fake' collection from tests/data_files and fill its 'help' table from sas."""
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )

    collection = SurveyCollection(
        name='fake',
        config_files_directory=data_dir,
        json_file_path=os.path.join(data_dir, 'fake.json'),
        )

    survey = Survey(
        hdf5_file_path=os.path.join(data_dir, 'fake.hdf5'),
        name='fake_survey',
        sas_files=[os.path.join(data_dir, 'help.sas7bdat')],
        survey_collection=collection,
        )
    survey.insert_table(name='help')
    survey.fill_hdf(source_format='sas')
    # print(x) is identical to the py2 print statement for a single argument.
    print(survey.tables)
def build_empty_piketty_survey_collection(years= None):
    """Return a 'piketty' SurveyCollection whose 2006 survey points at the raw stata files.

    NOTE(review): ``years`` is never used beyond the None check; only 2006
    is processed -- confirm intent upstream.
    """
    if years is None:
        log.error("A list of years to process is needed")

    survey_collection = SurveyCollection(name = "piketty")
    survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = survey_collection.config.get('data', 'input_directory')
    output_data_directory = survey_collection.config.get('data', 'output_directory')

    # All raw .dta files live in the same directory; build it once.
    raw_directory = os.path.join(
        os.path.dirname(input_data_directory),
        "revolution_fiscale",
        "Fichiers",
        "original",
        )
    piketty_tables = dict()
    for year in [2006]:
        for table in ["indiv_conj", "indiv_ded", "indiv_demo", "indiv_logt", "indiv_rev"]:
            piketty_tables[table] = {
                "stata_file": os.path.join(raw_directory, "{}_{}.dta".format(table, year)),
                "year": year,
                }

        survey_name = u"piketty_{}".format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            u"{}{}".format(survey_name, u".h5"),
            )
        survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
        for table, table_kwargs in piketty_tables.iteritems():
            survey.insert_table(name = table, **table_kwargs)
        survey_collection.surveys[survey_name] = survey
    return survey_collection
def dump_simulation_results_data_frame(survey_scenario, collection = None):
    """Store each entity data frame computed from ``survey_scenario`` into the openfisca collection."""
    assert collection is not None
    data_frame_by_entity = get_calculated_data_frame_by_entity(survey_scenario)
    openfisca_survey_collection = SurveyCollection.load(collection = "openfisca")
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(survey_scenario.year)
    # The HDF5 path does not depend on the entity: compute it once.
    hdf5_file_path = os.path.join(
        os.path.dirname(output_data_directory),
        "{}{}".format(survey_name, ".h5"),
        )
    for entity, entity_data_frame in data_frame_by_entity.iteritems():
        print(entity)
        survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
        survey.insert_table(name = entity)
        survey.fill_hdf(entity, entity_data_frame)
        openfisca_survey_collection.surveys[survey_name] = survey
        # Re-dump after every entity, mirroring the original incremental behaviour.
        openfisca_survey_collection.dump(collection = "openfisca")
# Example #25
# 0
def store_input_data_frame(data_frame=None,
                           collection=None,
                           survey=None,
                           table=None):
    """Insert ``data_frame`` as table ``table`` (default ``"input"``) of
    survey ``survey`` in ``collection``, then dump the collection JSON.

    Loads the collection if it already exists (creating a fresh one when
    loading fails), reuses an existing survey with the same name or creates
    a new one backed by an HDF5 file in the configured output directory.
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection=collection)
    except Exception:  # deliberate best-effort: fall back to a new collection
        openfisca_survey_collection = SurveyCollection(name=collection)

    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    if table is None:
        table = "input"
    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                  "{}.h5".format(survey_name))
    available_survey_names = [
        survey_.name for survey_ in openfisca_survey_collection.surveys
    ]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        # Register only when the survey is new: the previous code appended
        # unconditionally, duplicating surveys already present in the collection.
        openfisca_survey_collection.surveys.append(survey)
    survey.insert_table(name=table, data_frame=data_frame)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory,
                                  '{}.json'.format(collection))
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def build_empty_eipp_survey_collection(years= None):
    """Return an 'eipp' SurveyCollection with one survey per year pointing at the raw stata files."""
    if years is None:
        log.error("A list of years to process is needed")

    base_eipp_survey_collection = SurveyCollection(name = "eipp")
    base_eipp_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = base_eipp_survey_collection.config.get('data', 'input_directory')
    output_data_directory = base_eipp_survey_collection.config.get('data', 'output_directory')

    eipp_tables = dict()
    for year in years:
        for table in ["base"]:
            eipp_tables[table] = dict(
                stata_file = os.path.join(
                    os.path.dirname(input_data_directory),
                    "fichiers_eipp",
                    "{}_{}.dta".format(table, year),
                    ),
                year = year,
                )

        survey_name = u"eipp_{}".format(year)
        survey = Survey(
            name = survey_name,
            hdf5_file_path = os.path.join(
                os.path.dirname(output_data_directory),
                u"{}{}".format(survey_name, u".h5"),
                ),
            )
        for table, table_kwargs in eipp_tables.iteritems():
            survey.insert_table(name = table, **table_kwargs)
        base_eipp_survey_collection.surveys[survey_name] = survey
    return base_eipp_survey_collection
# Example #27
# 0
def run_all(year = None, check = False):
    """Run the full ERFS build pipeline for ``year`` and store the result in the openfisca collection."""
    assert year is not None

    # Build steps, in dependency order.
    pre_processing.create_indivim_menagem(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    # imputation_loyer is deliberately skipped (previously a commented-out try/except).
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals_first_pass(year = year)
    rebuild.create_totals_second_pass(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    final.final(year = year, check = check)

    data_frame = get_store(file_name = 'erfs')['input_{}'.format(year)]

    # Save the resulting data frame as the "input" table of the openfisca collection.
    openfisca_survey_collection = SurveyCollection(
        name = "openfisca", config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    hdf5_file_path = os.path.join(
        os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
    survey.insert_table(name = "input", data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def dump_simulation_results_data_frame(survey_scenario, collection = None):
    """Dump per-entity simulation results, then store an indirect-taxation input frame.

    NOTE(review): this block looks like two functions fused together by the
    scrape -- the second half references ``year_calage`` and
    ``config_files_directory`` which are not defined in this scope (NameError
    at runtime). Left byte-identical; confirm against the upstream sources.
    """
    assert collection is not None
    year = survey_scenario.year
    # One data frame per simulation entity.
    data_frame_by_entity = get_calculated_data_frame_by_entity(survey_scenario)
    openfisca_survey_collection = SurveyCollection.load(collection = "openfisca")
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    for entity, data_frame in data_frame_by_entity.iteritems():
        print entity
        table = entity
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            "{}{}".format(survey_name, ".h5"),
            )
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path,
            )
        survey.insert_table(name = table)
        survey.fill_hdf(table, data_frame)
        # The collection is re-dumped after every entity (incremental saves).
        openfisca_survey_collection.surveys[survey_name] = survey
        openfisca_survey_collection.dump(collection = "openfisca")
    # Relies on the loop variable ``data_frame`` leaking out of the for-loop:
    # keeps the first column per duplicated top-level column label.
    data_frame = data_frame.T.groupby(level = 0).first().T

    log.info('Saving the openfisca indirect taxation input dataframe')
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)
    except:
        # Bare except: falls back to a brand-new collection on any load failure.
        openfisca_survey_collection = SurveyCollection(
            name = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)

    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    # NOTE(review): ``year_calage`` is undefined in this function -- TODO confirm upstream.
    survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage)
    table = "input"
    hdf5_file_path = os.path.join(output_data_directory, "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    openfisca_survey_collection.dump()


def run(years_calage):
    """Run the calage pipeline for every year in ``years_calage``, logging elapsed time."""
    import time
    year_data_list = [1995, 2000, 2005, 2011]
    for year_calage in years_calage:
        started_at = time.time()
        # NOTE(review): run_all defined above takes (year, check); passing
        # year_data_list as the second positional argument looks suspicious -- confirm.
        run_all(year_calage, year_data_list)
        log.info("Finished {}".format(time.time() - started_at))

if __name__ == '__main__':