def build_survey_collection(name=None, erase_collection_json=False, overwrite_surveys=False,
                            data_directory_path_by_year=None, source_format='sas'):
    """Build or update a survey collection from one data directory per year.

    For every ``year -> directory`` pair the data files found by
    ``create_data_file_by_format`` are registered as the survey
    ``<name>_<year>``, the collection is dumped to
    ``<collections_directory>/<name>.json`` and the survey is converted with
    ``fill_hdf``.

    :param name: Name of the collection (required).
    :param erase_collection_json: When true, start from a fresh collection
        instead of loading the existing one.
    :param overwrite_surveys: Forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_year: Mapping ``year -> directory``
        (required). A value that is not an existing directory is resolved
        relative to the ``input_directory`` option of the ``data`` config
        section.
    :param source_format: Key of ``create_data_file_by_format``'s result to
        import; also forwarded to ``fill_hdf``.
    :returns: The survey collection.
    :raises AssertionError: If a required argument is missing or a data
        directory cannot be found.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    # ``keys()`` never returns None: the meaningful check is for an empty mapping.
    if not data_directory_path_by_year:
        log.error("A list of years to process is needed")

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name=name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name=name, config_files_directory=config_files_directory)

    # Register stata files under ``stata_files``; every other format keeps the
    # historical ``sas_files`` keyword.
    files_keyword = 'stata_files' if source_format == 'stata' else 'sas_files'

    # ``items()`` works on both Python 2 and Python 3 (``iteritems()`` does not).
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            # Fall back on a path relative to the configured input directory.
            input_data_directory = survey_collection.config.get('data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
            # Check the resolved path (the input directory was already checked above).
            assert os.path.isdir(data_directory_path), \
                '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        log.info("Data files by format: {}".format(data_file_by_format))
        survey_name = '{}_{}'.format(name, year)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            **{files_keyword: data_file_by_format[source_format]}
        )

        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        survey_collection.dump(json_file_path=collection_json_path)

        # Only convert the survey(s) of the year being processed.
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        survey_collection.fill_hdf(source_format=source_format, surveys=surveys,
                                   overwrite=overwrite_surveys)
    return survey_collection
def build_survey_collection(
        config_files_directory: str,
        collection_name = None,
        replace_metadata = False,
        replace_data = False,
        data_directory_path_by_survey_suffix = None,
        source_format = 'sas',
        ):
    """Build or update a survey collection from one data directory per survey.

    For every ``suffix -> directory`` pair the data files found by
    ``create_data_file_by_format`` are registered as the survey
    ``<collection_name>_<suffix>``, the collection is dumped to
    ``<collections_directory>/<collection_name>.json`` and the survey is
    converted with ``fill_hdf``.

    :param config_files_directory: Directory of the configuration files,
        forwarded to ``SurveyCollection``.
    :param collection_name: Name of the collection (required).
    :param replace_metadata: When true, start from a fresh collection instead
        of loading the existing one.
    :param replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_survey_suffix: Non-empty mapping
        ``survey suffix -> existing data directory`` (required).
    :param source_format: Kept for backward compatibility; the value is not
        used, the format is detected per survey as the first format for which
        data files were found.
    :returns: The survey collection.
    :raises AssertionError: If a required argument is missing or empty, a
        directory does not exist, or no data file of a known format is found.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    # A list of keys is never None: check that there is something to process.
    assert len(data_directory_path_by_survey_suffix) > 0, "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except configparser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            sas_files = data_file_by_format.get('sas'),
            stata_files = data_file_by_format.get('stata'),
            )

        # Formats for which at least one data file was found, in mapping order.
        valid_source_format = [_format for _format, files in data_file_by_format.items() if files]
        log.info("Valid source formats are: {}".format(valid_source_format))
        # Fail with an explicit message rather than a bare IndexError below.
        assert valid_source_format, 'No data file of a known format found in {}'.format(data_directory_path)
        detected_format = valid_source_format[0]
        log.info("Using the following format: {}".format(detected_format))

        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), (
            "{} who should be the collections' directory does not exist. \n"
            "Fix the option collections_directory in the collections section of your config file."
            ).format(collections_directory)
        collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))
        survey_collection.dump(json_file_path = collection_json_path)

        # Only convert the survey(s) matching the suffix being processed.
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = detected_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def build_bdf_survey_collection(years=None, erase=False, overwrite=False):
    """Build or update the ``budget_des_familles`` survey collection.

    For every year the stata files found under
    ``<input directory>/budget_des_familles/<year>`` are registered as the
    survey ``budget_des_familles_<year>``, the collection is dumped to
    ``<collections_directory>/budget_des_familles.json`` and the survey is
    converted with ``fill_hdf(source_format='stata')``.

    :param years: Iterable of years to process (required).
    :param erase: When true, start from a fresh collection instead of loading
        the existing one.
    :param overwrite: Forwarded to ``fill_hdf``.
    :returns: The survey collection.
    :raises TypeError: If ``years`` is None.
    """
    if years is None:
        # Fail up front with an explicit message; iterating over None below
        # would raise the same exception type, only later and less clearly.
        raise TypeError("A list of years to process is needed")

    collection_name = "budget_des_familles"
    if erase:
        bdf_survey_collection = SurveyCollection(
            name=collection_name, config_files_directory=config_files_directory)
    else:
        try:
            bdf_survey_collection = SurveyCollection.load(
                collection=collection_name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            bdf_survey_collection = SurveyCollection(
                name=collection_name, config_files_directory=config_files_directory)

    input_data_directory = bdf_survey_collection.config.get('data', 'input_directory')
    # NOTE(review): the data root is the parent of the configured input
    # directory, with a hard-coded 'INSEE' sub-directory for one specific user.
    parent_directory = os.path.dirname(input_data_directory)
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(parent_directory, 'INSEE')
    else:
        input_data_directory = parent_directory

    for year in years:
        data_directory_path = os.path.join(
            input_data_directory, 'budget_des_familles/{}'.format(year))
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'budget_des_familles_{}'.format(year)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=bdf_survey_collection,
            stata_files=data_file_by_format['stata'],
        )

        collections_directory = bdf_survey_collection.config.get(
            'collections', 'collections_directory')
        collection_json_path = os.path.join(
            collections_directory, "{}.json".format(collection_name))
        bdf_survey_collection.dump(json_file_path=collection_json_path)

        # Only convert the survey(s) of the year being processed.
        surveys = [
            survey for survey in bdf_survey_collection.surveys
            if survey.name.endswith(str(year))
        ]
        bdf_survey_collection.fill_hdf(source_format='stata', surveys=surveys,
                                       overwrite=overwrite)
    return bdf_survey_collection
def build_erfs_survey_collection(years = None, erase = False, overwrite = False):
    """Build or update the ``erfs`` survey collection.

    For every year the sas files found under
    ``<input directory>/ERF/ERFS_<year>`` are registered as the survey
    ``erfs_<year>``, the collection is dumped to
    ``<collections_directory>/erfs.json`` and the survey is converted with
    ``fill_hdf(source_format='sas')``.

    :param years: Iterable of years to process (required).
    :param erase: When true, start from a fresh collection instead of loading
        the existing one.
    :param overwrite: Forwarded to ``fill_hdf``.
    :returns: The survey collection.
    :raises TypeError: If ``years`` is None.
    """
    if years is None:
        # Fail up front with an explicit message; iterating over None below
        # would raise the same exception type, only later and less clearly.
        raise TypeError("A list of years to process is needed")

    collection_name = "erfs"
    if erase:
        erfs_survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            erfs_survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            erfs_survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    input_data_directory = erfs_survey_collection.config.get('data', 'input_directory')
    # NOTE(review): the data root is the parent of the configured input
    # directory, with a hard-coded 'INSEE' sub-directory for one specific user.
    parent_directory = os.path.dirname(input_data_directory)
    if getpass.getuser() == 'benjello':
        input_data_directory = os.path.join(parent_directory, 'INSEE')
    else:
        input_data_directory = parent_directory

    for year in years:
        data_directory_path = os.path.join(input_data_directory, 'ERF/ERFS_{}'.format(year))
        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = 'erfs_{}'.format(year)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = erfs_survey_collection,
            sas_files = data_file_by_format['sas'],
            )

        collections_directory = erfs_survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))
        erfs_survey_collection.dump(json_file_path = collection_json_path)

        # Only convert the survey(s) of the year being processed.
        surveys = [survey for survey in erfs_survey_collection.surveys if survey.name.endswith(str(year))]
        erfs_survey_collection.fill_hdf(source_format = 'sas', surveys = surveys, overwrite = overwrite)
    return erfs_survey_collection
def build_survey_collection(name = None, erase_collection_json = False, overwrite_surveys = False,
        data_directory_path_by_year = None, source_format = 'sas'):
    """Build or update a survey collection from one data directory per year.

    For every ``year -> directory`` pair the data files found by
    ``create_data_file_by_format`` are registered as the survey
    ``<name>_<year>``, the collection is dumped to
    ``<collections_directory>/<name>.json`` and the survey is converted with
    ``fill_hdf``.

    :param name: Name of the collection (required).
    :param erase_collection_json: When true, start from a fresh collection
        instead of loading the existing one.
    :param overwrite_surveys: Forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_year: Mapping ``year -> directory``
        (required). A value that is not an existing directory is resolved
        relative to the ``input_directory`` option of the ``data`` config
        section.
    :param source_format: Key of ``create_data_file_by_format``'s result to
        import; also forwarded to ``fill_hdf``.
    :returns: The survey collection.
    :raises AssertionError: If a required argument is missing or a data
        directory cannot be found.
    """
    assert name is not None
    assert data_directory_path_by_year is not None
    # ``keys()`` never returns None: the meaningful check is for an empty mapping.
    if not data_directory_path_by_year:
        log.error("A list of years to process is needed")

    if erase_collection_json:
        survey_collection = SurveyCollection(
            name = name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name = name, config_files_directory = config_files_directory)

    # Register stata files under ``stata_files``; every other format keeps the
    # historical ``sas_files`` keyword.
    files_keyword = 'stata_files' if source_format == 'stata' else 'sas_files'

    # ``items()`` works on both Python 2 and Python 3 (``iteritems()`` does not).
    for year, data_directory_path in data_directory_path_by_year.items():
        if not os.path.isdir(data_directory_path):
            # Fall back on a path relative to the configured input directory.
            input_data_directory = survey_collection.config.get('data', 'input_directory')
            assert os.path.isdir(input_data_directory)
            data_directory_path = os.path.join(input_data_directory, data_directory_path)
            # Check the resolved path (the input directory was already checked above).
            assert os.path.isdir(data_directory_path), \
                '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        log.info("Data files by format: {}".format(data_file_by_format))
        survey_name = '{}_{}'.format(name, year)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            **{files_keyword: data_file_by_format[source_format]}
            )

        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        collection_json_path = os.path.join(collections_directory, "{}.json".format(name))
        survey_collection.dump(json_file_path = collection_json_path)

        # Only convert the survey(s) of the year being processed.
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(year))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = overwrite_surveys)
    return survey_collection
def build_survey_collection(collection_name = None, replace_metadata = False, replace_data = False,
        data_directory_path_by_survey_suffix = None, source_format = 'sas'):
    """Build or update a survey collection from one data directory per survey.

    For every ``suffix -> directory`` pair the data files found by
    ``create_data_file_by_format`` are registered as the survey
    ``<collection_name>_<suffix>``, the collection is dumped to
    ``<collections_directory>/<collection_name>.json`` and the survey is
    converted with ``fill_hdf``.

    :param collection_name: Name of the collection (required).
    :param replace_metadata: When true, start from a fresh collection instead
        of loading the existing one.
    :param replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_survey_suffix: Non-empty mapping
        ``survey suffix -> existing data directory`` (required).
    :param source_format: Key of ``create_data_file_by_format``'s result to
        import; also forwarded to ``fill_hdf``.
    :returns: The survey collection.
    :raises AssertionError: If a required argument is missing or empty, or a
        directory does not exist.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    # ``keys()`` never returns None: check that there is something to process.
    assert len(data_directory_path_by_survey_suffix) > 0, "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name = collection_name, config_files_directory = config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection = collection_name, config_files_directory = config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name = collection_name, config_files_directory = config_files_directory)

    # Register stata files under ``stata_files``; every other format keeps the
    # historical ``sas_files`` keyword.
    files_keyword = 'stata_files' if source_format == 'stata' else 'sas_files'

    # ``items()`` works on both Python 2 and Python 3 (``iteritems()`` does not).
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        log.info("Data files by format: {}".format(data_file_by_format))
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name = survey_name,
            survey_collection = survey_collection,
            **{files_keyword: data_file_by_format[source_format]}
            )

        collections_directory = survey_collection.config.get('collections', 'collections_directory')
        assert os.path.isdir(collections_directory), (
            "{} who should be the collections' directory does not exist. "
            "Fix the option collections_directory in the collections section of your config file."
            ).format(collections_directory)
        collection_json_path = os.path.join(collections_directory, "{}.json".format(collection_name))
        survey_collection.dump(json_file_path = collection_json_path)

        # Only convert the survey(s) matching the suffix being processed.
        surveys = [survey for survey in survey_collection.surveys if survey.name.endswith(str(survey_suffix))]
        survey_collection.fill_hdf(source_format = source_format, surveys = surveys, overwrite = replace_data)
    return survey_collection
def build_survey_collection(collection_name=None, replace_metadata=False, replace_data=False,
                            data_directory_path_by_survey_suffix=None, source_format='sas'):
    """Build or update a survey collection from one data directory per survey.

    For every ``suffix -> directory`` pair the data files found by
    ``create_data_file_by_format`` are registered as the survey
    ``<collection_name>_<suffix>``, the collection is dumped to
    ``<collections_directory>/<collection_name>.json`` and the survey is
    converted with ``fill_hdf``.

    :param collection_name: Name of the collection (required).
    :param replace_metadata: When true, start from a fresh collection instead
        of loading the existing one.
    :param replace_data: Forwarded to ``fill_hdf`` as ``overwrite``.
    :param data_directory_path_by_survey_suffix: Non-empty mapping
        ``survey suffix -> existing data directory`` (required).
    :param source_format: Kept for backward compatibility; the value is not
        used, the format is detected per survey as the first format for which
        data files were found.
    :returns: The survey collection.
    :raises AssertionError: If a required argument is missing or empty, a
        directory does not exist, or no data file of a known format is found.
    """
    assert collection_name is not None
    assert data_directory_path_by_survey_suffix is not None
    # ``keys()`` never returns None: check that there is something to process.
    assert len(data_directory_path_by_survey_suffix) > 0, "A list of surveys to process is needed"

    if replace_metadata:
        survey_collection = SurveyCollection(
            name=collection_name, config_files_directory=config_files_directory)
    else:
        try:
            survey_collection = SurveyCollection.load(
                collection=collection_name, config_files_directory=config_files_directory)
        except ConfigParser.NoOptionError:
            # The collection is not registered in the config yet: create it.
            survey_collection = SurveyCollection(
                name=collection_name, config_files_directory=config_files_directory)

    # ``items()`` works on both Python 2 and Python 3 (``iteritems()`` does not).
    for survey_suffix, data_directory_path in data_directory_path_by_survey_suffix.items():
        assert os.path.isdir(data_directory_path), \
            '{} is not a valid directory path'.format(data_directory_path)

        data_file_by_format = create_data_file_by_format(data_directory_path)
        survey_name = '{}_{}'.format(collection_name, survey_suffix)
        add_survey_to_collection(
            survey_name=survey_name,
            survey_collection=survey_collection,
            sas_files=data_file_by_format.get('sas'),
            stata_files=data_file_by_format.get('stata'),
        )

        # Formats for which at least one data file was found, in mapping order.
        valid_source_format = [_format for _format, files in data_file_by_format.items() if files]
        log.info("Valid source formats are: {}".format(valid_source_format))
        # Fail with an explicit message rather than a bare IndexError below.
        assert valid_source_format, \
            'No data file of a known format found in {}'.format(data_directory_path)
        detected_format = valid_source_format[0]
        log.info("Using the following format: {}".format(detected_format))

        collections_directory = survey_collection.config.get(
            'collections', 'collections_directory')
        assert os.path.isdir(collections_directory), (
            "{} who should be the collections' directory does not exist. \n"
            "Fix the option collections_directory in the collections section of your config file."
        ).format(collections_directory)
        collection_json_path = os.path.join(collections_directory,
                                            "{}.json".format(collection_name))
        survey_collection.dump(json_file_path=collection_json_path)

        # Only convert the survey(s) matching the suffix being processed.
        surveys = [
            survey for survey in survey_collection.surveys
            if survey.name.endswith(str(survey_suffix))
        ]
        survey_collection.fill_hdf(source_format=detected_format, surveys=surveys,
                                   overwrite=replace_data)
    return survey_collection