} df_log_ = common.log__step('0', params, process_date, '', 0, '', 'init') #----------------------------------------- INPUT DATASET print '0/6 Processing input dataset...' df = dataiku.Dataset(input_).get_dataframe(columns=columns) if P_COLUMN_STATES_LOWER is True: df[P_COLUMN_STATES] = df[P_COLUMN_STATES].map(lambda x: x.lower()) print 'Creating States list...' state_list_ = list(np.unique(df[P_COLUMN_STATES])) state_conversion = common.state_to_2letters_format(P_STATES_TYPE_NAME, state_list_) state_list = state_conversion[0] state_list_rejected = state_conversion[1] dict_states = state_conversion[2] s_found = len(state_list) s_rejected = len(state_list_rejected) print '----------------------------------------' print 'First diagnostic on input dataset' print '----------------------------------------' ##### define the folder containing the segments. template_fields_def = census_resources.dict_vintage_[P_CENSUS_TYPE][ P_CENSUS_CONTENT]['fields_definition']
def generate_rows(self, dataset_schema=None, dataset_partitioning=None, partition_id=None, records_limit=-1): path_datadir_tmp = os.getenv("DIP_HOME") + '/tmp/' FOLDER_NAME = 'tmp_census_us_' + self.P_CENSUS_CONTENT P_CENSUS_TYPE = self.P_CENSUS_CONTENT[:3] CENSUS_TYPE = str( census_resources.dict_vintage_[self.P_CENSUS_CONTENT[:3]]) fields_list = self.P_census_fields.split(',') #----------------------------------------- BASE FOLDER logger.info('1/6 Creating base folders...') common.create_folder(path_datadir_tmp, FOLDER_NAME, False) common.create_folder(path_datadir_tmp + '/' + FOLDER_NAME + '/', self.P_CENSUS_LEVEL, False) #----------------------------------------- SOURCE HARVESTER state_list_ = self.P_state_list_str.split(',') state_conversion = common.state_to_2letters_format( self.P_STATES_TYPE_NAME, state_list_) state_list = state_conversion[0] state_list_rejected = state_conversion[1] dict_states = state_conversion[2] s_found = len(state_list) s_rejected = len(state_list_rejected) logger.info('----------------------------------------') logger.info('First diagnostic on input dataset') logger.info('----------------------------------------') if s_found > 0: logger.info( 'States expected to be processed if enough records for feature selection:' ) logger.info(state_list) logger.info('States rejected:') if s_rejected < 60: logger.info(state_list_rejected) else: logger.info( '...too many elements rejected for displaying it in the log...' ) if self.P_USE_PREVIOUS_SOURCES is False: logger.info('2/6 Collecting US Census Data...') else: logger.info('2/6 Re using US Census Data if available...') sources_collector = common.us_census_source_collector( self.P_USE_PREVIOUS_SOURCES, P_CENSUS_TYPE, self.P_CENSUS_CONTENT, self.P_CENSUS_LEVEL, path_datadir_tmp, FOLDER_NAME, state_list, dict_states) sumlevel_val = sources_collector[0] fdef_dir = sources_collector[1] geo_header_file = sources_collector[2] dict_pattern_files = sources_collector[3] geo_header_file_dir = fdef_dir + '/' + geo_header_file geo_header = pd.read_excel(geo_header_file_dir, sheet_name=0, header=0) #sheetname census_level_code_len = census_resources.dict_level_corresp['v1'][ self.P_CENSUS_LEVEL]['code_len'] logger.info('4/6 Generating census...') final_output_df = pd.DataFrame() for state in state_list: logger.info('Processing this state: %s' % (state)) state_dir = path_datadir_tmp + FOLDER_NAME + '/' + state if self.P_CENSUS_LEVEL in ('TRACT', 'BLOCK_GROUP'): ziptocollect = dict_pattern_files['v1']['TB'] state_dir_level = state_dir + '/' + 'TRACT_BG_SEG' else: ziptocollect = dict_pattern_files['v1']['OT'] state_dir_level = state_dir + '/' + 'NO_TRACT_BG_SEG' ustate = state.upper() state_name = dict_states[state]['attributes'][ 'state_fullname_w1'] state_number = dict_states[state]['attributes'][ 'state_2digits'] vint = census_resources.dict_vintage_[P_CENSUS_TYPE][ self.P_CENSUS_CONTENT] master_segment_file = state_dir_level + '/' + vint[ 'master_segment_file_pattern'] + vint[ 'vintage_pattern'] + state + '.csv' geo_source_df = pd.read_csv(master_segment_file, sep=',', header=None, names=geo_header.columns) geo_level_df = geo_source_df[geo_source_df['SUMLEVEL'].isin( sumlevel_val)].copy() geo_level_df['GEOID_DKU'] = geo_level_df['GEOID'].map( lambda x: x.split('US')[1]) geo_level_df[self.P_CENSUS_LEVEL] = geo_level_df[ 'GEOID_DKU'].map(lambda x: x[:census_level_code_len]) keep_cols = [ 'FILEID', 'SUMLEVEL', 'GEOID_DKU', 'STUSAB', 'LOGRECNO' ] geo_level_df = geo_level_df[keep_cols] geo_level_df['STUSAB'] = geo_level_df['STUSAB'].map( lambda x: x.lower()) ## basically the state lower del geo_level_df['FILEID'] del geo_level_df['SUMLEVEL'] ### added n = 0 for fr in os.listdir(state_dir_level): if fr.startswith( vint['segments_estimations_files_pattern']): n += 1 segment_list = [] for i in range(1, n + 1): if i < 10: segment_list.append('000' + str(i)) if i in range(10, 100): segment_list.append('00' + str(i)) if i >= 100: segment_list.append('0' + str(i)) nb_segments = len(segment_list) i = 1 for segment_number in segment_list: i = i + 1 logger.info('Processing segment: %s/%s' % (i, nb_segments)) template_fields_def = census_resources.dict_vintage_[ P_CENSUS_TYPE][ self.P_CENSUS_CONTENT]['fields_definition'] seq_folder_name = template_fields_def['folder_name'] ## For taking into account that some vintage like ACS52013 does not have a structure with the template and a folder ## If no template, recreate the same structure as the alternative one. if seq_folder_name == '': seq_folder_name = template_fields_def[ 'geo_header_template_folder_name'] try: HEADER_PATH_FILE = fdef_dir + '/' + seq_folder_name + '/Seq' + str( int(segment_number) ) + template_fields_def['seq_files_extension'] header_df = pd.read_excel( HEADER_PATH_FILE, sheet_name=0) ### 0 = 'E' #sheetname except: HEADER_PATH_FILE = fdef_dir + '/' + seq_folder_name + '/seq' + str( int(segment_number) ) + template_fields_def['seq_files_extension'] header_df = pd.read_excel( HEADER_PATH_FILE, sheet_name=0) ### 0 = 'E' #sheetname ### Adjust the header to fit what we need. kh_list = [ 'FILEID', 'FILETYPE', 'STUSAB', 'CHARITER', 'SEQUENCE', 'LOGRECNO' ] f_list = [x for x in header_df.columns if x not in kh_list] E_list = [x + 'E' for x in f_list] newcolz_list = kh_list + E_list t_ = [c for c in newcolz_list if c in fields_list] if len(t_) > 0: SEGMENT_PATH_FILE = state_dir_level + '/' + vint[ 'segments_estimations_files_pattern'] + vint[ 'vintage_pattern'] + state + segment_number + '000.txt' segment_df = pd.read_csv(SEGMENT_PATH_FILE, sep=',', names=newcolz_list, low_memory=False) out_list = kh_list + t_ out_list.remove('FILEID') out_list.remove('FILETYPE') out_list.remove('CHARITER') out_list.remove('SEQUENCE') segment_df = segment_df[out_list] geo_level_df = pd.merge( left=geo_level_df, right=segment_df, how='inner', left_on=['STUSAB', 'LOGRECNO'], right_on=['STUSAB', 'LOGRECNO']) logger.info('-------------- volumes check------------------') logger.info(geo_level_df.groupby('STUSAB').size()) logger.info('Check Tallies here :') logger.info( 'https://www.census.gov/geo/maps-data/data/tallies/tractblock.html' ) logger.info('----------------------------------------------') #del geo_level_df['STUSAB'] del geo_level_df['LOGRECNO'] if self.P_STATES_TYPE_NAME is not 'state_2letters': geo_level_df[self.P_STATES_TYPE_NAME] = dict_states[state][ 'attributes'][self.P_STATES_TYPE_NAME] logger.info('5/6 Building final output...') final_output_df = pd.concat((final_output_df, geo_level_df), axis=0) if self.P_DELETE_US_CENSUS_SOURCES is True: logger.info('6/6 Removing US Census temp data from: %s' % (path_datadir_tmp + FOLDER_NAME)) cmd = "rm -rf %s" % (path_datadir_tmp + FOLDER_NAME) os.system(cmd) else: logger.info('6/6 Keeping US Census data sources in: %s' % (path_datadir_tmp + FOLDER_NAME)) for f in os.listdir(path_datadir_tmp + FOLDER_NAME): if not f.endswith('.zip'): cmd = "rm -rf %s" % (path_datadir_tmp + FOLDER_NAME + '/' + f) os.system(cmd) for i, line in final_output_df.iterrows(): yield line.to_dict() else: logger.info('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!') logger.info( 'US Census CANNOT be built, no states available in your dataset...' ) logger.info('Check the following settings :') logger.info( '-> are the states in the right format regarding the plugin set by the user ?' ) logger.info('-> is the column really containing states ?') logger.info('----------------------------------------')