Exemple #1
0
    def write_to_bq(self, dataset, gcs_bucket):
        """Load population files from a GCS bucket and write them to BigQuery.

        Two tables are written, named by get_table_name_by_race() and
        get_table_name_by_sex_age_race(). Every column is declared STRING
        except POPULATION_COL, which is declared INT64.

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from"""
        # TODO change this to have it read metadata from GCS bucket
        var_map = parse_acs_metadata(
            fetch_acs_metadata(self.base_acs_url), list(GROUPS.keys()))

        def load_typed(filename):
            # Read one file from the bucket and pass it through
            # update_col_types before any further processing.
            return update_col_types(
                gcs_to_bq_util.load_values_as_dataframe(gcs_bucket, filename))

        hispanic_by_race = standardize_frame(
            load_typed(self.get_filename(HISPANIC_BY_RACE_CONCEPT)),
            get_vars_for_group(HISPANIC_BY_RACE_CONCEPT, var_map, 2),
            [HISPANIC_COL, RACE_COL],
            self.county_level,
            POPULATION_COL)

        # The total-population file is keyed by variable id rather than by
        # concept, so its name is built with add_filename_suffix directly.
        totals = standardize_frame(
            load_typed(self.add_filename_suffix(TOTAL_POP_VARIABLE_ID)),
            {TOTAL_POP_VARIABLE_ID: ['Total']},
            [RACE_OR_HISPANIC_COL],
            self.county_level,
            POPULATION_COL)

        # One typed (but not yet standardized) frame per sex-by-age concept.
        sex_by_age = {
            concept: load_typed(self.get_filename(concept))
            for concept in SEX_BY_AGE_CONCEPTS_TO_RACE
        }

        outputs = {
            self.get_table_name_by_race(): self.get_all_races_frame(
                hispanic_by_race, totals),
            self.get_table_name_by_sex_age_race(): self.get_sex_by_age_and_race(
                var_map, sex_by_age)
        }

        for table_name, table_frame in outputs.items():
            # All breakdown columns are strings; only the population
            # column is numeric.
            column_types = dict.fromkeys(table_frame.columns, 'STRING')
            column_types[POPULATION_COL] = 'INT64'
            gcs_to_bq_util.add_dataframe_to_bq(
                table_frame, dataset, table_name, column_types=column_types)
    def write_to_bq(self, dataset, gcs_bucket, filename):
        """Append state names from a GCS file to the BigQuery staging table.

        The destination table name comes from self.get_staging_table_name().
        A json.JSONDecodeError raised while loading or writing is logged
        and not re-raised.

        dataset: The BigQuery dataset to write to
        gcs_bucket: The name of the gcs bucket to read the data from
        filename: The name of the file in the gcs bucket to read from"""
        # Source column name -> BigQuery column name.
        renamed_columns = {
            'state': 'state_fips_code',
            'NAME': 'state_name'
        }
        # Both output columns are written as strings.
        column_types = {
            'state_fips_code': 'STRING',
            'state_name': 'STRING'
        }
        try:
            state_names = gcs_to_bq_util.load_values_as_dataframe(
                gcs_bucket, filename)
            state_names = state_names.rename(columns=renamed_columns)
            gcs_to_bq_util.append_dataframe_to_bq(
                state_names,
                dataset,
                self.get_staging_table_name(),
                column_types=column_types)
        except json.JSONDecodeError as err:
            logging.error(
                'Unable to write to BigQuery due to improperly formatted data: %s',
                err)