def write_to_bq(self, dataset, gcs_bucket):
    """Writes population data to BigQuery from the provided GCS bucket.

    Loads the ACS metadata variable map, reads the race/ethnicity, total
    population, and sex-by-age source files from GCS, standardizes them,
    and writes one BigQuery table per breakdown.

    dataset: The BigQuery dataset to write to
    gcs_bucket: The name of the gcs bucket to read the data from"""
    # TODO change this to have it read metadata from GCS bucket
    metadata = fetch_acs_metadata(self.base_acs_url)
    var_map = parse_acs_metadata(metadata, list(GROUPS.keys()))

    race_and_hispanic_frame = self._load_frame(
        gcs_bucket, self.get_filename(HISPANIC_BY_RACE_CONCEPT))
    race_and_hispanic_frame = standardize_frame(
        race_and_hispanic_frame,
        get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2),
        [HISPANIC_COL, RACE_COL],
        self.county_level,
        POPULATION_COL)

    total_frame = self._load_frame(
        gcs_bucket, self.add_filename_suffix(TOTAL_POP_VARIABLE_ID))
    total_frame = standardize_frame(
        total_frame,
        {TOTAL_POP_VARIABLE_ID: ['Total']},
        [RACE_OR_HISPANIC_COL],
        self.county_level,
        POPULATION_COL)

    # Raw (unstandardized) sex-by-age frames, keyed by ACS concept name;
    # get_sex_by_age_and_race handles their standardization.
    sex_by_age_frames = {
        concept: self._load_frame(gcs_bucket, self.get_filename(concept))
        for concept in SEX_BY_AGE_CONCEPTS_TO_RACE
    }

    frames = {
        self.get_table_name_by_race(): self.get_all_races_frame(
            race_and_hispanic_frame, total_frame),
        self.get_table_name_by_sex_age_race(): self.get_sex_by_age_and_race(
            var_map, sex_by_age_frames)
    }
    for table_name, df in frames.items():
        # All breakdown columns are strings
        column_types = {c: 'STRING' for c in df.columns}
        column_types[POPULATION_COL] = 'INT64'
        gcs_to_bq_util.add_dataframe_to_bq(
            df, dataset, table_name, column_types=column_types)

def _load_frame(self, gcs_bucket, filename):
    """Loads one values file from GCS and normalizes its column types."""
    frame = gcs_to_bq_util.load_values_as_dataframe(gcs_bucket, filename)
    return update_col_types(frame)
def write_to_bq(self, dataset, gcs_bucket, filename):
    """Writes state names to BigQuery from the provided GCS bucket.

    On improperly formatted source data, logs the error and returns
    without writing.

    dataset: The BigQuery dataset to write to
    gcs_bucket: The name of the gcs bucket to read the data from
    filename: The name of the file in the gcs bucket to read from"""
    # Keep the try body minimal: only the GCS load parses JSON and can
    # raise JSONDecodeError.
    try:
        frame = gcs_to_bq_util.load_values_as_dataframe(
            gcs_bucket, filename)
    except json.JSONDecodeError as err:
        logging.error(
            'Unable to write to BigQuery due to improperly formatted data: %s',
            err)
        return

    # Rename source columns to the BigQuery schema column names.
    frame = frame.rename(columns={
        'state': 'state_fips_code',
        'NAME': 'state_name'
    })
    column_types = {
        'state_fips_code': 'STRING',
        'state_name': 'STRING'
    }
    gcs_to_bq_util.append_dataframe_to_bq(
        frame, dataset, self.get_staging_table_name(),
        column_types=column_types)