def store_input_data_frame(data_frame = None, collection = None, survey = None, table = None):
    """Store ``data_frame`` as a table of survey ``survey`` in collection ``collection``.

    The data frame is inserted into the survey's HDF5 file (created under the
    collection's configured output directory) and the collection's JSON
    description is dumped back to disk.

    :param data_frame: pandas DataFrame to store (required)
    :param collection: name of the target survey collection (required)
    :param survey: name of the target survey (required)
    :param table: name of the table; defaults to "input"
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(collection = collection)
    except Exception:
        # The collection does not exist yet (or cannot be loaded): start a fresh one.
        openfisca_survey_collection = SurveyCollection(name = collection)
    log.debug("In collection {} the following survey are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    if table is None:
        table = "input"
    # BUG FIX: this assignment was commented out, leaving survey_name undefined
    # (NameError) on every call.
    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    available_survey_names = [survey_.name for survey_ in openfisca_survey_collection.surveys]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name = survey_name, hdf5_file_path = hdf5_file_path)
        # BUG FIX: append only newly created surveys; re-appending an existing
        # survey would duplicate it in the collection.
        openfisca_survey_collection.surveys.append(survey)
    survey.insert_table(name = table, data_frame = data_frame)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, '{}.json'.format(collection))
    # Typo fix in the log message ("surveyx" -> "surveys").
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def test_survey():
    """Build a fake survey from a bundled SAS file and fill its HDF5 store.

    Uses the test data files shipped with openfisca-survey-manager.
    """
    name = 'fake'
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    survey_collection = SurveyCollection(
        name = name,
        config_files_directory = data_dir,
        json_file_path = os.path.join(data_dir, 'fake.json')
        )
    saved_fake_survey_hdf5_file_path = os.path.join(data_dir, 'fake.hdf5')
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    survey = Survey(
        hdf5_file_path = saved_fake_survey_hdf5_file_path,
        name = 'fake_survey',
        sas_files = [saved_fake_survey_file_path],
        survey_collection = survey_collection,
        )
    survey.insert_table(name = 'help')
    # Convert the declared SAS source into the HDF5 store.
    survey.fill_hdf(source_format = 'sas')
    # Python 2 print statement (this module is Python 2 code).
    print survey.tables
def add_survey_to_collection(survey_name=None, survey_collection=None, sas_files=None, stata_files=None):
    """Add (or replace) a survey named ``survey_name`` in ``survey_collection``.

    Registers the given SAS/Stata source files on the survey. Any previous
    survey with the same name is removed from the collection before the new
    one is appended.

    :param survey_name: name of the survey to add
    :param survey_collection: target SurveyCollection (required)
    :param sas_files: list of SAS source file paths (defaults to [])
    :param stata_files: list of Stata source file paths (defaults to [])
    """
    assert survey_collection is not None
    # BUG FIX: mutable default arguments ([]) are shared across calls;
    # use None sentinels instead.
    sas_files = [] if sas_files is None else sas_files
    stata_files = [] if stata_files is None else stata_files
    overwrite = True
    label = survey_name
    for test_survey in survey_collection.surveys:
        if test_survey.name == survey_name:
            survey = survey_collection.get_survey(survey_name)
    if overwrite:
        survey = Survey(
            name=survey_name,
            label=label,
            sas_files=sas_files,
            stata_files=stata_files,
            survey_collection=survey_collection,
            )
    else:
        survey = survey_collection.get(survey_name)
        survey.label = label
        survey.informations.update({
            "sas_files": sas_files,
            "stata_files": stata_files,
            })
    # Drop any previous survey with the same name, then append the new one.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
def add_survey_to_collection(survey_name = None, survey_collection = None, sas_files = None, stata_files = None):
    """Add (or replace) a survey named ``survey_name`` in ``survey_collection``.

    Registers the given SAS/Stata source files on the survey. Any previous
    survey with the same name is removed from the collection before the new
    one is appended.

    :param survey_name: name of the survey to add
    :param survey_collection: target SurveyCollection (required)
    :param sas_files: list of SAS source file paths (defaults to [])
    :param stata_files: list of Stata source file paths (defaults to [])
    """
    assert survey_collection is not None
    # BUG FIX: mutable default arguments ([]) are shared across calls;
    # use None sentinels instead.
    sas_files = [] if sas_files is None else sas_files
    stata_files = [] if stata_files is None else stata_files
    overwrite = True
    label = survey_name
    for test_survey in survey_collection.surveys:
        if test_survey.name == survey_name:
            survey = survey_collection.get_survey(survey_name)
    if overwrite:
        survey = Survey(
            name = survey_name,
            label = label,
            sas_files = sas_files,
            stata_files = stata_files,
            survey_collection = survey_collection,
            )
    else:
        survey = survey_collection.get(survey_name)
        survey.label = label
        survey.informations.update({
            "sas_files": sas_files,
            "stata_files": stata_files,
            })
    # Drop any previous survey with the same name, then append the new one.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
def load(cls, json_file_path = None, collection = None, config_files_directory = default_config_files_directory):
    """Load a SurveyCollection from its JSON description file.

    Either ``json_file_path`` is given directly, or ``collection`` is given
    and the JSON path is looked up in the configuration files found under
    ``config_files_directory``.

    :param json_file_path: path to the collection's JSON file (optional)
    :param collection: collection name to look up in the config (optional)
    :param config_files_directory: where the config files live
    :return: the loaded SurveyCollection instance
    :raises: re-raises any error met while resolving the JSON path
    """
    assert os.path.exists(config_files_directory)
    config = None
    if json_file_path is None:
        assert collection is not None
        config = Config(config_files_directory = config_files_directory)
        try:
            json_file_path = config.get("collections", collection)
        except Exception as error:
            # Typo fix in the log message ("congi" -> "config").
            log.debug("Looking for config file in {}".format(config_files_directory))
            log.error(error)
            raise
    # Read the JSON description once (it used to be read twice).
    with open(json_file_path, 'r') as _file:
        self_json = json.load(_file)
    self = cls(name = self_json.get('name'))
    if config is not None:
        # BUG FIX: `config` was assigned unconditionally and raised a
        # NameError when json_file_path was passed directly.
        self.config = config
    self.json_file_path = json_file_path
    self.label = self_json.get('label')
    self.name = self_json.get('name')
    surveys = self_json.get('surveys')
    for survey_name, survey_json in surveys.items():
        survey = Survey(name = survey_name)
        self.surveys.append(survey.create_from_json(survey_json))
    return self
def run_all(year = None, filename = "test", check = False):
    """Run the full ERFS data-building pipeline for ``year`` and store the result.

    Chains the pre-processing, family/household/tax-unit building, rebuild and
    final steps, then saves the resulting DataFrame as the "input" table of the
    "openfisca" survey collection.

    :param year: survey year to process (required)
    :param filename: not used in this body -- TODO confirm intent
    :param check: forwarded to the final consistency step
    """
    assert year is not None
    # Pre-processing of individual and household tables.
    pre_processing.create_indivim_menage_en_mois(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    # imputation_loyer.imputation_loyer(year = year)
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    data_frame = final.final(year = year, check = check)
    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def load(cls, json_file_path=None, collection=None, config_files_directory=default_config_files_directory):
    """Load a SurveyCollection from its JSON description file.

    Either ``json_file_path`` is given directly, or ``collection`` is given
    and the JSON path is looked up in the configuration files found under
    ``config_files_directory``.

    :param json_file_path: path to the collection's JSON file (optional)
    :param collection: collection name to look up in the config (optional)
    :param config_files_directory: where the config files live
    :return: the loaded SurveyCollection instance
    :raises: re-raises any error met while resolving the JSON path
    """
    assert os.path.exists(config_files_directory)
    config = None
    if json_file_path is None:
        assert collection is not None
        config = Config(config_files_directory=config_files_directory)
        try:
            json_file_path = config.get("collections", collection)
        except Exception as error:
            # Typo fix in the log message ("congi" -> "config").
            log.debug("Looking for config file in {}".format(
                config_files_directory))
            log.error(error)
            raise
    # Read the JSON description once (it used to be read twice).
    with open(json_file_path, 'r') as _file:
        self_json = json.load(_file)
    self = cls(name=self_json.get('name'))
    if config is not None:
        # BUG FIX: `config` was assigned unconditionally and raised a
        # NameError when json_file_path was passed directly.
        self.config = config
    self.json_file_path = json_file_path
    self.label = self_json.get('label')
    self.name = self_json.get('name')
    surveys = self_json.get('surveys')
    for survey_name, survey_json in surveys.items():
        survey = Survey(name=survey_name)
        self.surveys.append(survey.create_from_json(survey_json))
    return self
def build_empty_logement_survey_collection(years=None):
    """Build a SurveyCollection describing the "logement" (housing) surveys.

    For each year in ``years``, registers the R/SAS source files of the
    housing tables and the target HDF5 file path; no data is loaded here.

    :param years: list of survey years to register
    :return: the populated SurveyCollection
    """
    if years is None:
        log.error("A list of years to process is needed")
    logement_survey_collection = SurveyCollection(name="logement")
    logement_survey_collection.set_config_files_directory(
        config_files_directory)
    input_data_directory = logement_survey_collection.config.get(
        'data', 'input_directory')
    output_data_directory = logement_survey_collection.config.get(
        'data', 'output_directory')
    for year in years:
        surveys = logement_survey_collection.surveys
        survey_name = 'logement_{}'.format(year)
        hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                      "{}{}".format(survey_name, ".h5"))
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        surveys[survey_name] = survey
        yr = str(year)[2:]
        if yr == "03":
            lgt_men = "menage"
            # BUG FIX: this variable used to be named lgt_logt, leaving
            # lgt_lgt undefined (NameError) below for the 2003 survey.
            lgt_lgt = None
            renameidlgt = dict(ident='ident')  # NOTE(review): currently unused
        elif yr in ["06", "07", "08", "09"]:  # TODO: clean this
            lgt_men = "menage1"
            lgt_lgt = "logement"
            renameidlgt = dict(idlog='ident')  # NOTE(review): currently unused
        logement_tables = {
            "adresse": "adresse",
            "lgt_menage": lgt_men,
            "lgt_logt": lgt_lgt,
            }
        RData_directory = os.path.join(os.path.dirname(input_data_directory), 'R', 'logement', str(year))
        SasData_directory = os.path.join(os.path.dirname(input_data_directory), 'enqlog2006/enq_06')
        for name, Rdata_table in logement_tables.iteritems():
            Rdata_file = os.path.join(RData_directory, "{}.Rdata".format(Rdata_table))
            sas_file = os.path.join(SasData_directory, "{}.sas7bdat".format(Rdata_table))
            survey.insert_table(name=name, year=year,
                                Rdata_file=Rdata_file,
                                Rdata_table=Rdata_table,
                                sas_file=sas_file)
    return logement_survey_collection
def build_empty_logement_survey_collection(years = None):
    """Build a SurveyCollection describing the "logement" (housing) surveys.

    For each year in ``years``, registers the R/SAS source files of the
    housing tables and the target HDF5 file path; no data is loaded here.

    :param years: list of survey years to register
    :return: the populated SurveyCollection
    """
    if years is None:
        log.error("A list of years to process is needed")
    logement_survey_collection = SurveyCollection(name = "logement")
    logement_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = logement_survey_collection.config.get('data', 'input_directory')
    output_data_directory = logement_survey_collection.config.get('data', 'output_directory')
    for year in years:
        surveys = logement_survey_collection.surveys
        survey_name = 'logement_{}'.format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            "{}{}".format(survey_name, ".h5")
            )
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        surveys[survey_name] = survey
        yr = str(year)[2:]
        if yr == "03":
            lgt_men = "menage"
            # BUG FIX: this variable used to be named lgt_logt, leaving
            # lgt_lgt undefined (NameError) below for the 2003 survey.
            lgt_lgt = None
            renameidlgt = dict(ident='ident')  # NOTE(review): currently unused
        elif yr in ["06", "07", "08", "09"]:  # TODO: clean this
            lgt_men = "menage1"
            lgt_lgt = "logement"
            renameidlgt = dict(idlog='ident')  # NOTE(review): currently unused
        logement_tables = {
            "adresse": "adresse",
            "lgt_menage": lgt_men,
            "lgt_logt": lgt_lgt,
            }
        RData_directory = os.path.join(os.path.dirname(input_data_directory), 'R', 'logement', str(year))
        SasData_directory = os.path.join(os.path.dirname(input_data_directory), 'enqlog2006/enq_06')
        for name, Rdata_table in logement_tables.iteritems():
            Rdata_file = os.path.join(RData_directory, "{}.Rdata".format(Rdata_table))
            sas_file = os.path.join(SasData_directory, "{}.sas7bdat".format(Rdata_table))
            survey.insert_table(name = name, year = year,
                Rdata_file = Rdata_file,
                Rdata_table = Rdata_table,
                sas_file = sas_file
                )
    return logement_survey_collection
def run_all(year_calage = 2007, year_data_list = None):
    """Build the indirect taxation input data for ``year_calage`` and store it.

    Runs four harmonisation steps on the source survey closest below the
    calibration year, concatenates their outputs into one DataFrame and saves
    it as the "input" table of the 'openfisca_indirect_taxation' collection.

    :param year_calage: calibration year
    :param year_data_list: available source survey years; defaults to
        [1995, 2000, 2005, 2011]
    """
    # BUG FIX (anti-pattern): the default year list was a mutable default
    # argument, shared across calls; use a None sentinel instead.
    if year_data_list is None:
        year_data_list = [1995, 2000, 2005, 2011]
    # Which source database should be used for the calibration year?
    year_data = find_nearest_inferior(year_data_list, year_calage)
    # Four parallel harmonisation steps of the source data:
    # 1. Consumption expenses.
    build_depenses_homogenisees(year = year_data)
    build_imputation_loyers_proprietaires(year = year_data)
    build_depenses_calees(year_calage, year_data)
    build_menage_consumption_by_categorie_fiscale(year_calage, year_data)
    categorie_fiscale_data_frame = temporary_store[
        "menage_consumption_by_categorie_fiscale_{}".format(year_calage)]
    depenses_calees_by_grosposte = temporary_store[
        "depenses_calees_by_grosposte_{}".format(year_calage)]
    # 2. Vehicles.
    build_homogeneisation_vehicules(year = year_data)
    vehicule = temporary_store['automobile_{}'.format(year_data)]
    # 3. Socio-demographic variables.
    build_homogeneisation_caracteristiques_sociales(year = year_data)
    menage = temporary_store['donnes_socio_demog_{}'.format(year_data)]
    # 4. Income variables.
    build_homogeneisation_revenus_menages(year = year_data)
    revenus = temporary_store["revenus_{}".format(year_data)]
    # DataFrame resulting from these four steps.
    data_frame = pandas.concat(
        [revenus, vehicule, categorie_fiscale_data_frame, menage, depenses_calees_by_grosposte],
        axis = 1)
    data_frame.index.name = "ident_men"
    data_frame.reset_index(inplace = True)
    # Remove duplicated colums causing bug with HDFStore
    # according to https://github.com/pydata/pandas/issues/6240
    # using solution form stackoverflow
    # http://stackoverflow.com/questions/16938441/how-to-remove-duplicate-columns-from-a-dataframe-using-python-pandas
    data_frame = data_frame.T.groupby(level = 0).first().T
    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection.load(
        collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage)
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    openfisca_survey_collection.dump()
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name,
        survey_label = None, table_label = None, table_name = None):
    """Insert ``input_dataframe`` as a table of ``survey_name`` in ``collection``.

    Loads (or creates) the collection, gets (or creates) the survey, ensures
    the survey has an HDF5 file path, inserts the table and dumps the updated
    collection JSON to disk.

    :param input_dataframe: data to insert
    :param entity: entity name, used to derive a default table name
    :param period: period specification, normalised with periods.period
    :param collection: target collection name
    :param survey_name: target survey name
    :param survey_label: optional human-readable survey label
    :param table_label: optional table label (a default is derived)
    :param table_name: optional table name (defaults to "<entity>_<period>")
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        # Collection missing from the config: start a fresh one.
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:
        # For tests: fall back to the bundled test data files directory.
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )
    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # Survey not present yet: create it.
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )
    if survey.hdf5_file_path is None:
        # Derive the HDF5 path from the configured output directory,
        # creating the directory if needed.
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            log.warn("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')
    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any previous survey with the same name.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def test_survey_dump_load():
    """Round-trip an ERFS survey through dump/load and check JSON equality."""
    collection = build_empty_erfs_survey_collection(years=[2006])
    original_survey = collection.surveys['erfs_2006']
    dump_path = os.path.join(current_dir, 'saved_fake_survey')
    original_survey.dump(dump_path)
    reloaded_survey = Survey.load(dump_path)
    assert original_survey.to_json() == reloaded_survey.to_json()
def add_survey_to_collection(survey_name = None, survey_collection = None, sas_files = None, stata_files = None,
        question = False):
    """Add (or update) a survey in ``survey_collection``, optionally interactively.

    When ``question`` is True the user is prompted (via click) for a label and
    for confirmation before overwriting an existing survey.

    :param survey_name: name of the survey to add
    :param survey_collection: target SurveyCollection (required)
    :param sas_files: list of SAS source file paths (defaults to [])
    :param stata_files: list of Stata source file paths (defaults to [])
    :param question: if True, ask the user interactively
    """
    assert survey_collection is not None
    # BUG FIX: mutable default arguments ([]) are shared across calls;
    # use None sentinels instead.
    sas_files = [] if sas_files is None else sas_files
    stata_files = [] if stata_files is None else stata_files
    overwrite = True
    if question:
        label = click.prompt('Enter a description for the survey {}'.format(survey_name), default = survey_name)
    else:
        label = survey_name
    for test_survey in survey_collection.surveys:
        if test_survey.name == survey_name:
            if question:
                click.echo('The following information is available for survey {}'.format(survey_name))
            survey = survey_collection.get_survey(survey_name)
            if question:
                click.echo(survey)
                overwrite = click.confirm(
                    'Overwrite previous survey {} informations ?'.format(survey_name), default = True)
            else:
                overwrite = True
    if question:
        same_survey = click.confirm('Are all the files part of the same survey ?', default = True)
    else:
        same_survey = True
    if same_survey:
        if overwrite:
            survey = Survey(
                name = survey_name,
                label = label,
                sas_files = sas_files,
                stata_files = stata_files,
                survey_collection = survey_collection,
                )
        else:
            survey = survey_collection.get(survey_name)
            survey.label = label
            survey.informations.update({
                "sas_files": sas_files,
                "stata_files": stata_files,
                })
    # NOTE(review): if same_survey is False and no existing survey matched,
    # `survey` is unbound here -- confirm intended behaviour before changing.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
def store_input_data_frame(data_frame = None, collection = None, survey = None):
    """Store ``data_frame`` as the "input" table of survey ``survey``.

    Creates a fresh collection object named ``collection``, registers a new
    survey whose HDF5 file lives next to the configured output directory,
    inserts the table and dumps the collection JSON (hard-coded to
    'openfisca_erfs_fpr.json').
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    survey_collection = SurveyCollection(
        name = collection, config_files_directory = config_files_directory)
    output_directory = survey_collection.config.get('data', 'output_directory')
    hdf5_file_path = os.path.join(
        os.path.dirname(output_directory), "{}.h5".format(survey))
    target_survey = Survey(
        name = survey,
        hdf5_file_path = hdf5_file_path,
        )
    target_survey.insert_table(name = "input", data_frame = data_frame)
    survey_collection.surveys.append(target_survey)
    collections_directory = survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca_erfs_fpr.json')
    survey_collection.dump(json_file_path = json_file_path)
def build_empty_recensement_survey_collection(years= None):
    """Build a SurveyCollection describing the "recensement" (census) surveys.

    For each year in ``years``, registers the SPSS source files of the census
    tables and the target HDF5 file path; no data is loaded here.
    """
    if years is None:
        log.error("A list of years to process is needed")
    recensement_survey_collection = SurveyCollection(name = "recensement")
    recensement_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = recensement_survey_collection.config.get('data', 'input_directory')
    output_data_directory = recensement_survey_collection.config.get('data', 'output_directory')
    # NOTE(review): the inner `for year in years` loop below shadows this loop
    # variable and re-fills survey_tables on every outer iteration -- probably
    # only one loop was intended; confirm before changing.
    for year in years:
        tables = [
            # "LOGMENT_ECH_{}".format(year),  # TODO: produces a strange error
            "individu_ech_{}".format(year),
            "MENAG_ECH_{}".format(year),
            ]
        survey_tables = dict()
        for year in years:
            for table in tables:
                survey_tables[table] = {
                    "spss_file": os.path.join(
                        os.path.dirname(input_data_directory),
                        "recensement",
                        str(year),
                        "{}.sav".format(table),
                        ),
                    "year": year,
                    }
        survey_name = u"recensement_{}".format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            u"{}{}".format(survey_name, u".h5")
            )
        # Python 2 print statement (this module is Python 2 code).
        print hdf5_file_path
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        for table, table_kwargs in survey_tables.iteritems():
            survey.insert_table(name = table, **table_kwargs)
        surveys = recensement_survey_collection.surveys
        surveys[survey_name] = survey
    return recensement_survey_collection
def build_empty_budget_consommation_survey_collection(years= None):
    """Build a SurveyCollection describing the "budget_consommation" surveys.

    For each year in ``years``, registers the Stata source files of the budget
    tables (budg01..budg20, without budg03) and the target HDF5 file path; no
    data is loaded here.
    """
    if years is None:
        log.error("A list of years to process is needed")
    budget_consommation_survey_collection = SurveyCollection(name = "budget_consommation")
    budget_consommation_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = budget_consommation_survey_collection.config.get('data', 'input_directory')
    output_data_directory = budget_consommation_survey_collection.config.get('data', 'output_directory')
    # NOTE(review): the inner `for year in years` loop below shadows this loop
    # variable and re-fills survey_tables on every outer iteration -- probably
    # only one loop was intended; confirm before changing.
    for year in years:
        # Table names budg01..budg20, excluding budg03.
        tables = ["budg0{}".format(i) for i in range(1, 10)] + ["budg{}".format(i) for i in range(10, 21)]
        tables.remove("budg03")
        survey_tables = dict()
        for year in years:
            for table in tables:
                survey_tables[table] = {
                    "stata_file": os.path.join(
                        os.path.dirname(input_data_directory),
                        "budget_consommation",
                        str(year),
                        "stata",
                        "{}.dta".format(table),
                        ),
                    "year": year,
                    }
        survey_name = u"budget_consommation_{}".format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            u"{}{}".format(survey_name, u".h5")
            )
        # Python 2 print statement (this module is Python 2 code).
        print hdf5_file_path
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        for table, table_kwargs in survey_tables.iteritems():
            survey.insert_table(name = table, **table_kwargs)
        surveys = budget_consommation_survey_collection.surveys
        surveys[survey_name] = survey
    return budget_consommation_survey_collection
def build(year=None, check=False):
    """Run the full ERFS data-building pipeline for ``year`` and store its result.

    Chains the pre-processing, family/household/tax-unit building, rebuild and
    final steps, reads the assembled table back from the temporary store and
    saves it as the "input" table of the "openfisca" survey collection.

    :param year: survey year to process (required)
    :param check: forwarded to the final consistency step
    """
    assert year is not None
    pre_processing.create_indivim_menagem(year=year)
    pre_processing.create_enfants_a_naitre(year=year)
    # try:
    #     imputation_loyer.imputation_loyer(year = year)
    # except Exception, e:
    #     log.info('Do not impute loyer because of the following error: \n {}'.format(e))
    #     pass
    fip.create_fip(year=year)
    famille.famille(year=year)
    foyer.sif(year=year)
    foyer.foyer_all(year=year)
    rebuild.create_totals_first_pass(year=year)
    rebuild.create_totals_second_pass(year=year)
    rebuild.create_final(year=year)
    invalides.invalide(year=year)
    final.final(year=year, check=check)
    # Retrieve the assembled input table from the temporary store.
    temporary_store = get_store(file_name='erfs')
    data_frame = temporary_store['input_{}'.format(year)]
    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection(
        name="openfisca", config_files_directory=config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name=survey_name,
        hdf5_file_path=hdf5_file_path,
        )
    survey.insert_table(name=table, data_frame=data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def build_empty_piketty_survey_collection(years=None):
    """Build a SurveyCollection describing the "piketty" (revolution fiscale) data.

    Registers the Stata source files of the individual-level tables and the
    target HDF5 file path; no data is loaded here.
    """
    if years is None:
        log.error("A list of years to process is needed")
    survey_collection = SurveyCollection(name="piketty")
    survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = survey_collection.config.get(
        'data', 'input_directory')
    output_data_directory = survey_collection.config.get(
        'data', 'output_directory')
    tables = [
        "indiv_conj",
        "indiv_ded",
        "indiv_demo",
        "indiv_logt",
        "indiv_rev"
        ]
    piketty_tables = dict()
    # NOTE(review): the `years` argument is not used below -- the loop is
    # hard-coded to [2006]; confirm whether other years were meant to work.
    for year in [2006]:
        for table in tables:
            piketty_tables[table] = {
                "stata_file": os.path.join(
                    os.path.dirname(input_data_directory),
                    "revolution_fiscale", "Fichiers", "original",
                    "{}_{}.dta".format(table, year),
                    ),
                "year": year,
                }
        survey_name = u"piketty_{}".format(year)
        hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                      u"{}{}".format(survey_name, u".h5"))
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        for table, table_kwargs in piketty_tables.iteritems():
            survey.insert_table(name=table, **table_kwargs)
        surveys = survey_collection.surveys
        surveys[survey_name] = survey
    return survey_collection
def dump_data_frame(data_frame, year):
    """Check ``data_frame`` and store it as the "input" table of the eipp survey."""
    # Local import to avoid a module-level dependency cycle.
    from openfisca_france_data.build_openfisca_survey_data import utils
    utils.print_id(data_frame)
    # utils.control(data_frame, verbose = True)
    utils.check_structure(data_frame)
    collection = SurveyCollection(name = "eipp")
    collection.set_config_files_directory()
    output_directory = collection.config.get('data', 'output_directory')
    target_name = "eipp_data_{}".format(year)
    target_table = "input"
    h5_path = os.path.join(
        os.path.dirname(output_directory),
        "{}{}".format(target_name, ".h5"),
        )
    target_survey = Survey(
        name = target_name,
        hdf5_file_path = h5_path,
        )
    target_survey.insert_table(name = target_table)
    target_survey.fill_hdf(target_table, data_frame)
    collection.surveys[target_name] = target_survey
    collection.dump(collection = "eipp")
def test_survey():
    """Build a fake survey from a bundled SAS file and fill its HDF5 store.

    Uses the test data files shipped with openfisca-survey-manager.
    """
    name = 'fake'
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    survey_collection = SurveyCollection(name=name,
                                         config_files_directory=data_dir,
                                         json_file_path=os.path.join(
                                             data_dir, 'fake.json'))
    saved_fake_survey_hdf5_file_path = os.path.join(data_dir, 'fake.hdf5')
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    survey = Survey(
        hdf5_file_path=saved_fake_survey_hdf5_file_path,
        name='fake_survey',
        sas_files=[saved_fake_survey_file_path],
        survey_collection=survey_collection,
        )
    survey.insert_table(name='help')
    # Convert the declared SAS source into the HDF5 store.
    survey.fill_hdf(source_format='sas')
    # Python 2 print statement (this module is Python 2 code).
    print survey.tables
def build_empty_piketty_survey_collection(years= None):
    """Build a SurveyCollection describing the "piketty" (revolution fiscale) data.

    Registers the Stata source files of the individual-level tables and the
    target HDF5 file path; no data is loaded here.
    """
    if years is None:
        log.error("A list of years to process is needed")
    survey_collection = SurveyCollection(name = "piketty")
    survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = survey_collection.config.get('data', 'input_directory')
    output_data_directory = survey_collection.config.get('data', 'output_directory')
    tables = ["indiv_conj", "indiv_ded", "indiv_demo", "indiv_logt", "indiv_rev"]
    piketty_tables = dict()
    # NOTE(review): the `years` argument is not used below -- the loop is
    # hard-coded to [2006]; confirm whether other years were meant to work.
    for year in [2006]:
        for table in tables:
            piketty_tables[table] = {
                "stata_file": os.path.join(
                    os.path.dirname(input_data_directory),
                    "revolution_fiscale", "Fichiers", "original",
                    "{}_{}.dta".format(table, year),
                    ),
                "year": year,
                }
        survey_name = u"piketty_{}".format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            u"{}{}".format(survey_name, u".h5")
            )
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        for table, table_kwargs in piketty_tables.iteritems():
            survey.insert_table(name = table, **table_kwargs)
        surveys = survey_collection.surveys
        surveys[survey_name] = survey
    return survey_collection
def dump_simulation_results_data_frame(survey_scenario, collection = None):
    """Dump the per-entity simulation results of ``survey_scenario`` to HDF5.

    One table per entity is written into the "openfisca" collection's survey
    for the scenario's year, then the collection is dumped to disk.

    :param survey_scenario: scenario whose calculated data frames are dumped
    :param collection: asserted non-None but not otherwise used -- the
        collection name "openfisca" is hard-coded below; TODO confirm intent
    """
    assert collection is not None
    year = survey_scenario.year
    data_frame_by_entity = get_calculated_data_frame_by_entity(survey_scenario)
    openfisca_survey_collection = SurveyCollection.load(collection = "openfisca")
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    for entity, data_frame in data_frame_by_entity.iteritems():
        # Python 2 print statement (this module is Python 2 code).
        print entity
        table = entity
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            "{}{}".format(survey_name, ".h5"),
            )
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path,
            )
        survey.insert_table(name = table)
        survey.fill_hdf(table, data_frame)
        openfisca_survey_collection.surveys[survey_name] = survey
    openfisca_survey_collection.dump(collection = "openfisca")
def store_input_data_frame(data_frame=None, collection=None, survey=None, table=None):
    """Store ``data_frame`` as a table of survey ``survey`` in collection ``collection``.

    The data frame is inserted into the survey's HDF5 file (created under the
    collection's configured output directory) and the collection's JSON
    description is dumped back to disk.

    :param data_frame: pandas DataFrame to store (required)
    :param collection: name of the target survey collection (required)
    :param survey: name of the target survey (required)
    :param table: name of the table; defaults to "input"
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection=collection)
    except Exception:
        # The collection does not exist yet (or cannot be loaded): start a fresh one.
        openfisca_survey_collection = SurveyCollection(name=collection)
    log.debug("In collection {} the following survey are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    if table is None:
        table = "input"
    # BUG FIX: this assignment was commented out, leaving survey_name undefined
    # (NameError) on every call.
    survey_name = survey
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
                                  "{}.h5".format(survey_name))
    available_survey_names = [
        survey_.name for survey_ in openfisca_survey_collection.surveys
        ]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        # BUG FIX: append only newly created surveys; re-appending an existing
        # survey would duplicate it in the collection.
        openfisca_survey_collection.surveys.append(survey)
    survey.insert_table(name=table, data_frame=data_frame)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, '{}.json'.format(collection))
    # Typo fix in the log message ("surveyx" -> "surveys").
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def build_empty_eipp_survey_collection(years= None):
    """Build a SurveyCollection describing the "eipp" surveys.

    For each year in ``years``, registers the Stata source file of the "base"
    table and the target HDF5 file path; no data is loaded here.
    """
    if years is None:
        log.error("A list of years to process is needed")
    base_eipp_survey_collection = SurveyCollection(name = "eipp")
    base_eipp_survey_collection.set_config_files_directory(config_files_directory)
    input_data_directory = base_eipp_survey_collection.config.get('data', 'input_directory')
    output_data_directory = base_eipp_survey_collection.config.get('data', 'output_directory')
    tables = ["base"]
    eipp_tables = dict()
    for year in years:
        for table in tables:
            eipp_tables[table] = {
                "stata_file": os.path.join(
                    os.path.dirname(input_data_directory),
                    "fichiers_eipp",
                    "{}_{}.dta".format(table, year),
                    ),
                "year": year,
                }
        survey_name = u"eipp_{}".format(year)
        hdf5_file_path = os.path.join(
            os.path.dirname(output_data_directory),
            u"{}{}".format(survey_name, u".h5")
            )
        survey = Survey(
            name = survey_name,
            hdf5_file_path = hdf5_file_path
            )
        for table, table_kwargs in eipp_tables.iteritems():
            survey.insert_table(name = table, **table_kwargs)
        surveys = base_eipp_survey_collection.surveys
        surveys[survey_name] = survey
    return base_eipp_survey_collection
def run_all(year = None, check = False):
    """Run the full ERFS data-building pipeline for ``year`` and store its result.

    Chains the pre-processing, family/household/tax-unit building, rebuild and
    final steps, reads the assembled table back from the temporary store and
    saves it as the "input" table of the "openfisca" survey collection.

    :param year: survey year to process (required)
    :param check: forwarded to the final consistency step
    """
    assert year is not None
    pre_processing.create_indivim_menagem(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    # try:
    #     imputation_loyer.imputation_loyer(year = year)
    # except Exception, e:
    #     log.info('Do not impute loyer because of the following error: \n {}'.format(e))
    #     pass
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals_first_pass(year = year)
    rebuild.create_totals_second_pass(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    final.final(year = year, check = check)
    # Retrieve the assembled input table from the temporary store.
    temporary_store = get_store(file_name = 'erfs')
    data_frame = temporary_store['input_{}'.format(year)]
    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    table = "input"
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path = json_file_path)
data_frame = data_frame.T.groupby(level = 0).first().T log.info('Saving the openfisca indirect taxation input dataframe') try: openfisca_survey_collection = SurveyCollection.load( collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory) except: openfisca_survey_collection = SurveyCollection( name = 'openfisca_indirect_taxation', config_files_directory = config_files_directory) output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory') survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage) table = "input" hdf5_file_path = os.path.join(output_data_directory, "{}.h5".format(survey_name)) survey = Survey( name = survey_name, hdf5_file_path = hdf5_file_path, ) survey.insert_table(name = table, data_frame = data_frame) openfisca_survey_collection.surveys.append(survey) openfisca_survey_collection.dump() def run(years_calage): import time year_data_list = [1995, 2000, 2005, 2011] for year_calage in years_calage: start = time.time() run_all(year_calage, year_data_list) log.info("Finished {}".format(time.time() - start)) if __name__ == '__main__':