def build_survey_collection(name=None, erase_collection_json=False, overwrite_surveys=False,
        data_directory_path_by_year=None, source_format='sas'):
    """Build a survey collection from raw data directories, one survey per year.

    Args:
        name: Name of the collection (mandatory).
        erase_collection_json: When True, start from a fresh collection instead
            of loading the existing collection JSON metadata.
        overwrite_surveys: Forwarded to ``fill_hdf`` to overwrite existing HDF5 files.
        data_directory_path_by_year: Mapping year -> directory holding the raw
            data files for that year (mandatory). Relative paths are resolved
            against the configured ``data/input_directory``.
        source_format: Format of the raw data files (default ``'sas'``).

    Returns:
        The populated SurveyCollection.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    years = data_directory_path_by_year.keys()
    # BUG FIX: dict.keys() is never None, so the original `if years is None`
    # guard could not fire; test emptiness instead.
    if not years:
        log.error("A list of years to process is needed")
    if erase_collection_json:
        survey_collection = SurveyCollection(
            name=name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # Collection not registered yet in the config: start a new one.
            survey_collection = SurveyCollection(
                name=name, config_files_directory=config_files_directory)
    # .items() instead of py2-only .iteritems() (works on Python 2 and 3).
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            # Relative path: resolve against the configured input directory.
            input_data_directory = survey_collection.config.get(
                'data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
        # BUG FIX: the original re-asserted input_data_directory (which is
        # unbound when the branch above is skipped); what must exist here is
        # the resolved data directory itself.
        assert os.path.isdir(data_directory_path), \
            '{} is not a valid directory path'.format(data_directory_path)
        data_file_by_format = create_data_file_by_format(data_directory_path)
        # print() call form works on Python 2 and 3 (py2 print statement did not).
        print(data_file_by_format)
        survey_name = '{}_{}'.format(name, year)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format[source_format],
            )
        # Persist metadata and fill the HDF5 store for this year's survey, so
        # every year is processed (not only the last one of the loop).
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(year))
            ]
        survey_collection.fill_hdf(
            source_format=source_format, surveys=surveys, overwrite=overwrite_surveys)
    return survey_collection
def set_table_in_survey(input_dataframe, entity, period, collection, survey_name,
        survey_label = None, table_label = None, table_name = None):
    """Insert a dataframe as a table of a survey, creating what is missing.

    Loads (or creates) the collection, fetches (or creates) the survey, makes
    sure the survey has an HDF5 file path, inserts the table and dumps the
    collection JSON metadata back to disk.

    Args:
        input_dataframe: Dataframe to store.
        entity: Entity name, used to build the default table name.
        period: Period (parsed with ``periods.period``).
        collection: Name of the target collection.
        survey_name: Name of the target survey inside the collection.
        survey_label: Optional label for a newly created survey.
        table_label: Optional table label; defaults to a generated description.
        table_name: Optional table name; defaults to ``<entity>_<period>``.
    """
    period = periods.period(period)
    if table_name is None:
        table_name = entity + '_' + str(period)
    if table_label is None:
        table_label = "Input data for entity {} at period {}".format(entity, period)
    try:
        survey_collection = SurveyCollection.load(collection = collection)
    except configparser.NoOptionError:
        # Collection not registered in the config: start a new one.
        survey_collection = SurveyCollection(name = collection)
    except configparser.NoSectionError:  # For tests
        data_dir = os.path.join(
            pkg_resources.get_distribution('openfisca-survey-manager').location,
            'openfisca_survey_manager',
            'tests',
            'data_files',
            )
        survey_collection = SurveyCollection(
            name = collection,
            config_files_directory = data_dir,
            )
    try:
        survey = survey_collection.get_survey(survey_name)
    except AssertionError:
        # get_survey asserts existence; missing survey means we create it.
        survey = Survey(
            name = survey_name,
            label = survey_label or None,
            survey_collection = survey_collection,
            )
    if survey.hdf5_file_path is None:
        config = survey.survey_collection.config
        directory_path = config.get("data", "output_directory")
        if not os.path.isdir(directory_path):
            # FIX: Logger.warn is a deprecated alias of Logger.warning.
            log.warning("{} who should be the HDF5 data directory does not exist: we create the directory".format(
                directory_path))
            os.makedirs(directory_path)
        survey.hdf5_file_path = os.path.join(directory_path, survey.name + '.h5')
    assert survey.hdf5_file_path is not None
    survey.insert_table(label = table_label, name = table_name, dataframe = input_dataframe)
    # Replace any previous survey with the same name before appending.
    survey_collection.surveys = [
        kept_survey for kept_survey in survey_collection.surveys if kept_survey.name != survey_name
        ]
    survey_collection.surveys.append(survey)
    collections_directory = survey_collection.config.get('collections', 'collections_directory')
    assert os.path.isdir(collections_directory), """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(collections_directory)
    collection_json_path = os.path.join(collections_directory, "{}.json".format(collection))
    survey_collection.dump(json_file_path = collection_json_path)
def build_bdf_survey_collection(years=None, erase=False, overwrite=False):
    """Build the 'budget_des_familles' survey collection for the given years.

    Args:
        years: Iterable of years to process (mandatory).
        erase: When True, start from a fresh collection instead of loading the
            existing collection metadata.
        overwrite: Forwarded to ``fill_hdf`` to overwrite existing HDF5 files.

    Returns:
        The populated SurveyCollection, or None when no years were given.
    """
    if years is None:
        # BUG FIX: the original logged the error then fell through and crashed
        # iterating over None; bail out early instead.
        log.error("A list of years to process is needed")
        return None
    if erase:
        bdf_survey_collection = SurveyCollection(
            name="budget_des_familles", config_files_directory=config_files_directory)
    else:
        try:
            bdf_survey_collection = SurveyCollection.load(
                collection='budget_des_familles',
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # Collection not registered yet in the config: start a new one.
            bdf_survey_collection = SurveyCollection(
                name="budget_des_familles", config_files_directory=config_files_directory)
    input_data_directory = bdf_survey_collection.config.get(
        'data', 'input_directory')
    # NOTE(review): user-specific directory layout hack — presumably the INSEE
    # raw files live elsewhere on that machine; confirm before generalizing.
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(
            os.path.dirname(input_data_directory), 'INSEE')
    else:
        input_data_directory = os.path.dirname(input_data_directory)
    for year in years:
        data_directory_path = os.path.join(
            input_data_directory, 'budget_des_familles/{}'.format(year))
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'budget_des_familles_{}'.format(year)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=bdf_survey_collection,
            stata_files=data_file_by_format['stata'],
            )
        # Persist metadata and fill the HDF5 store for this year's survey, so
        # every year is processed (not only the last one of the loop).
        collections_directory = bdf_survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "budget_des_familles" + ".json")
        bdf_survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in bdf_survey_collection.surveys
            if survey.name.endswith(str(year))
            ]
        bdf_survey_collection.fill_hdf(
            source_format='stata', surveys=surveys, overwrite=overwrite)
    return bdf_survey_collection
def test_survey():
    """Build a fake survey from the bundled test data and fill its HDF5 store."""
    name = 'fake'
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    survey_collection = SurveyCollection(
        name=name,
        config_files_directory=data_dir,
        json_file_path=os.path.join(data_dir, 'fake.json'),
        )
    saved_fake_survey_hdf5_file_path = os.path.join(data_dir, 'fake.hdf5')
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    survey = Survey(
        hdf5_file_path=saved_fake_survey_hdf5_file_path,
        name='fake_survey',
        sas_files=[saved_fake_survey_file_path],
        survey_collection=survey_collection,
        )
    survey.insert_table(name='help')
    survey.fill_hdf(source_format='sas')
    # FIX: print() call form works on Python 2 and 3 (py2 print statement did not).
    print(survey.tables)
def build(year=None): assert year is not None # preprocessing.build_merged_dataframes(year=year) # # imputation_loyer.imputation_loyer(year = year) # openfisca_survey_collection = SurveyCollection(name='openfisca') stata_directory = openfisca_survey_collection.config.get( 'data', 'stata_directory') stata_file = os.path.join(stata_directory, 'log_men_ERFS.dta') imputation_loyer.merge_imputation_loyer(stata_file=stata_file, year=year) # variables_individuelles.build_variables_individuelles(year=year) famille.build_famille(year=year) final.create_input_data_frame(year=year) # temporary_store = get_store(file_name='erfs_fpr') data_frame = temporary_store['input_{}'.format(year)] # Save the data_frame in a collection store_input_data_frame( data_frame=data_frame, collection="openfisca_erfs_fpr", survey="openfisca_erfs_fpr_data_{}".format(year), )
def run_all(year = None, filename = "test", check = False):
    """Run the whole ERFS input-data pipeline for one year and save the result.

    Executes the pre-processing, fip, famille, foyer, rebuild, invalides and
    final steps in order (each step presumably reads the previous step's
    output from a shared store), then registers the final dataframe as the
    'input' table of a new survey in the 'openfisca' collection.

    Args:
        year: Survey year to process (mandatory).
        filename: Unused here — kept for interface compatibility with callers.
        check: Forwarded to ``final.final``.
    """
    assert year is not None
    pre_processing.create_indivim_menage_en_mois(year = year)
    pre_processing.create_enfants_a_naitre(year = year)
    # imputation_loyer.imputation_loyer(year = year)
    fip.create_fip(year = year)
    famille.famille(year = year)
    foyer.sif(year = year)
    foyer.foyer_all(year = year)
    rebuild.create_totals(year = year)
    rebuild.create_final(year = year)
    invalides.invalide(year = year)
    data_frame = final.final(year = year, check = check)
    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection(name = "openfisca", config_files_directory = config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    table = "input"
    # The HDF5 file is written next to (not inside) the output directory.
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory), "{}.h5".format(survey_name))
    survey = Survey(
        name = survey_name,
        hdf5_file_path = hdf5_file_path,
        )
    survey.insert_table(name = table, data_frame = data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get('collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path = json_file_path)
def test_add_survey_to_collection():
    """Check that a survey added to a collection shows up in its JSON export."""
    # Not runnable on CI machines (no test data there).
    if is_travis or is_circleci:
        return
    collection_name = 'fake'
    new_survey_name = 'fake_survey'
    distribution_location = pkg_resources.get_distribution('openfisca-survey-manager').location
    data_files_directory = os.path.join(
        distribution_location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    fake_collection = SurveyCollection(name=collection_name)
    sas_file_path = os.path.join(data_files_directory, 'help.sas7bdat')
    add_survey_to_collection(
        survey_name=new_survey_name,
        survey_collection=fake_collection,
        sas_files=[sas_file_path],
        stata_files=[],
        )
    collection_as_json = fake_collection.to_json()
    assert new_survey_name in list(collection_as_json['surveys'].keys())
def test_add_survey_to_collection():
    """Check that a survey added to a collection shows up in its JSON export."""
    name = 'fake'
    survey_name = 'fake_survey'
    survey_collection = SurveyCollection(name = name)
    data_dir = os.path.join(
        pkg_resources.get_distribution('openfisca-survey-manager').location,
        'openfisca_survey_manager',
        'tests',
        'data_files',
        )
    saved_fake_survey_file_path = os.path.join(data_dir, 'help.sas7bdat')
    add_survey_to_collection(
        survey_name = survey_name,
        survey_collection = survey_collection,
        sas_files = [saved_fake_survey_file_path],
        stata_files = [],
        )
    ordered_dict = survey_collection.to_json()
    # BUG FIX: dict.keys() == [x] only holds on Python 2 (where keys() is a
    # list); wrap in list() so the assertion also works on Python 3, matching
    # the sibling test above.
    assert list(ordered_dict['surveys'].keys()) == [survey_name]
def build(year=None, check=False):
    """Run the whole ERFS input-data pipeline for one year and save the result.

    Executes the pre-processing, fip, famille, foyer, rebuild, invalides and
    final steps in order (each step presumably reads the previous step's
    output from a shared store), then registers the resulting dataframe as the
    'input' table of a new survey in the 'openfisca' collection.

    Args:
        year: Survey year to process (mandatory).
        check: Forwarded to ``final.final``.
    """
    assert year is not None
    pre_processing.create_indivim_menagem(year=year)
    pre_processing.create_enfants_a_naitre(year=year)
    # try:
    #     imputation_loyer.imputation_loyer(year = year)
    # except Exception, e:
    #     log.info('Do not impute loyer because of the following error: \n {}'.format(e))
    #     pass
    fip.create_fip(year=year)
    famille.famille(year=year)
    foyer.sif(year=year)
    foyer.foyer_all(year=year)
    rebuild.create_totals_first_pass(year=year)
    rebuild.create_totals_second_pass(year=year)
    rebuild.create_final(year=year)
    invalides.invalide(year=year)
    final.final(year=year, check=check)
    # Retrieve the final dataframe from the temporary store — assumes
    # final.final wrote 'input_<year>' there.
    temporary_store = get_store(file_name='erfs')
    data_frame = temporary_store['input_{}'.format(year)]
    # Saving the data_frame
    openfisca_survey_collection = SurveyCollection(
        name="openfisca", config_files_directory=config_files_directory)
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    survey_name = "openfisca_data_{}".format(year)
    table = "input"
    # The HDF5 file is written next to (not inside) the output directory.
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
        "{}.h5".format(survey_name))
    survey = Survey(
        name=survey_name,
        hdf5_file_path=hdf5_file_path,
        )
    survey.insert_table(name=table, data_frame=data_frame)
    openfisca_survey_collection.surveys.append(survey)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory, 'openfisca.json')
    openfisca_survey_collection.dump(json_file_path=json_file_path)
def store_input_data_frame(data_frame=None, collection=None, survey=None, table=None):
    """Store a dataframe as a table of a survey inside a collection.

    Loads the collection (creating it when loading fails), inserts the
    dataframe as ``table`` of the survey named ``survey``, and dumps the
    updated collection metadata to its JSON file.

    Args:
        data_frame: Dataframe to store (mandatory).
        collection: Name of the target collection (mandatory).
        survey: Name of the target survey (mandatory).
        table: Table name; defaults to "input".
    """
    assert data_frame is not None
    assert collection is not None
    assert survey is not None
    try:
        openfisca_survey_collection = SurveyCollection.load(
            collection=collection)
    except Exception:
        # Loading failed (e.g. collection not registered yet): start fresh.
        openfisca_survey_collection = SurveyCollection(name=collection)
    log.debug("In collection {} the following survey are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    output_data_directory = openfisca_survey_collection.config.get(
        'data', 'output_directory')
    if table is None:
        table = "input"
    # BUG FIX: this assignment was commented out, leaving survey_name
    # undefined below (NameError).
    survey_name = survey
    # The HDF5 file is written next to (not inside) the output directory.
    hdf5_file_path = os.path.join(os.path.dirname(output_data_directory),
        "{}.h5".format(survey_name))
    available_survey_names = [
        survey_.name for survey_ in openfisca_survey_collection.surveys
        ]
    if survey_name in available_survey_names:
        survey = openfisca_survey_collection.get_survey(survey_name)
    else:
        survey = Survey(name=survey_name, hdf5_file_path=hdf5_file_path)
        # BUG FIX: only append newly created surveys; the original appended
        # unconditionally, duplicating already-registered surveys.
        openfisca_survey_collection.surveys.append(survey)
    survey.insert_table(name=table, data_frame=data_frame)
    collections_directory = openfisca_survey_collection.config.get(
        'collections', 'collections_directory')
    json_file_path = os.path.join(collections_directory,
        '{}.json'.format(collection))
    # FIX: 'surveyx' typo in the log message.
    log.debug("In collection {} the following surveys are present: {}".format(
        collection, openfisca_survey_collection.surveys))
    openfisca_survey_collection.dump(json_file_path=json_file_path)
'deci', 'hnph2', 'iaat_bis', 'lmlm', 'magtr', 'mcs8', 'mdiplo', 'mtybd', 'qex', 'statut_occupation', 'tu99_recoded', # 'ident', ] logement = Logement[kept_variables].copy() # logement.rename(columns = {'qex': 'wprm'}, inplace = True) return logement if __name__ == '__main__': import sys logging.basicConfig(level=logging.INFO, stream=sys.stdout) year = 2012 from openfisca_france_data.erfs_fpr.input_data_builder import step_01_preprocessing step_01_preprocessing.build_merged_dataframes(year=year) openfisca_survey_collection = SurveyCollection(name='openfisca') output_data_directory = openfisca_survey_collection.config.get( 'data', 'output_directory') stata_file = os.path.join(output_data_directory, 'log_men_ERFS.dta') menages = merge_imputation_loyer(stata_file=stata_file, year=year)
data_frame.reset_index(inplace = True) except ValueError, e: log.info('ignoring reset_index because {}'.format(e)) # Remove duplicated colums causing bug with HDFStore # according to https://github.com/pydata/pandas/issues/6240 # using solution form stackoverflow # http://stackoverflow.com/questions/16938441/how-to-remove-duplicate-columns-from-a-dataframe-using-python-pandas data_frame = data_frame.T.groupby(level = 0).first().T log.info('Saving the openfisca indirect taxation input dataframe') try: openfisca_survey_collection = SurveyCollection.load( collection = 'openfisca_indirect_taxation', config_files_directory = config_files_directory) except: openfisca_survey_collection = SurveyCollection( name = 'openfisca_indirect_taxation', config_files_directory = config_files_directory) output_data_directory = openfisca_survey_collection.config.get('data', 'output_directory') survey_name = "openfisca_indirect_taxation_data_{}".format(year_calage) table = "input" hdf5_file_path = os.path.join(output_data_directory, "{}.h5".format(survey_name)) survey = Survey( name = survey_name, hdf5_file_path = hdf5_file_path, ) survey.insert_table(name = table, data_frame = data_frame) openfisca_survey_collection.surveys.append(survey) openfisca_survey_collection.dump() def run(years_calage):
def build_survey_collection(collection_name=None, replace_metadata=False, replace_data=False,
        data_directory_path_by_survey_suffix=None, source_format='sas'):
    """Build a survey collection from raw data directories, one survey per suffix.

    Args:
        collection_name: Name of the collection (mandatory).
        replace_metadata: When True, start from a fresh collection instead of
            loading the existing collection JSON metadata.
        replace_data: Forwarded to ``fill_hdf`` to overwrite existing HDF5 files.
        data_directory_path_by_survey_suffix: Mapping survey suffix -> directory
            holding that survey's raw data files (mandatory, non-empty).
        source_format: Preferred raw-data format; note it is superseded below
            by the first format actually found in each data directory.

    Returns:
        The populated SurveyCollection.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    surveys_name = data_directory_path_by_survey_suffix.keys()
    # BUG FIX: dict.keys() is never None, so the original assert was a no-op;
    # assert non-emptiness instead.
    assert surveys_name, "A list of surveys to process is needed"
    if replace_metadata:
        survey_collection = SurveyCollection(
            name=collection_name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=collection_name,
                config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # Collection not registered yet in the config: start a new one.
            survey_collection = SurveyCollection(
                name=collection_name, config_files_directory=config_files_directory)
    # .items() instead of py2-only .iteritems() (works on Python 2 and 3).
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), \
            '{} is not a valid directory path'.format(data_directory_path)
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format.get('sas'),
            stata_files=data_file_by_format.get('stata'),
            )
        # Pick the first format for which files were actually found; this
        # deliberately overrides the source_format argument.
        valid_source_format = [
            _format for _format in data_file_by_format.keys()
            if data_file_by_format.get(_format)
            ]
        log.info("Valid source formats are: {}".format(valid_source_format))
        source_format = valid_source_format[0]
        log.info("Using the following format: {}".format(source_format))
        # Persist metadata and fill the HDF5 store for this survey, so every
        # suffix is processed (not only the last one of the loop).
        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        assert os.path.isdir(collections_directory), \
            """{} who should be the collections' directory does not exist.
Fix the option collections_directory in the collections section of your config file.""".format(
                collections_directory)
        collection_json_path = os.path.join(collections_directory,
            "{}.json".format(collection_name))
        survey_collection.dump(json_file_path=collection_json_path)
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(survey_suffix))
            ]
        survey_collection.fill_hdf(
            source_format=source_format, surveys=surveys, overwrite=replace_data)
    return survey_collection