def write_to_bq(self, dataset, gcs_bucket, filename):
    """Writes county adjacencies to BigQuery from the provided GCS bucket

    dataset: The BigQuery dataset to write to
    gcs_bucket: The name of the gcs bucket to read the data from
    filename: The name of the file in the gcs bucket to read from"""
    frame = gcs_to_bq_util.load_csv_as_dataframe(
        gcs_bucket, filename, dtype={
            'fipscounty': 'string',
            'fipsneighbor': 'string'
        })
    frame = frame[['fipscounty', 'fipsneighbor']]
    frame = frame.rename(columns={
        'fipscounty': 'county_geoid',
        'fipsneighbor': 'neighbor_geoids'
    })
    # Collapse the adjacency pairs into one row per county with a list of
    # neighboring county geoids.
    frame = frame.groupby('county_geoid', as_index=False).agg(list)
    column_types = {
        'county_geoid': 'STRING',
        'neighbor_geoids': 'STRING'
    }
    col_modes = {'neighbor_geoids': 'REPEATED'}
    gcs_to_bq_util.add_dataframe_to_bq(
        frame, dataset, self.get_table_name(), column_types=column_types,
        col_modes=col_modes)
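
# A minimal, self-contained sketch (illustrative only; the fips values and
# helper name below are made up, not part of the pipeline) of how the
# groupby(...).agg(list) step above turns adjacency pairs into one row per
# county holding a list of neighbor geoids, which is the shape the REPEATED
# col_modes setting expects when the frame is written to BigQuery.
import pandas as pd


def _adjacency_aggregation_example():
    frame = pd.DataFrame({
        'county_geoid': ['01001', '01001', '01003'],
        'neighbor_geoids': ['01021', '01047', '01053'],
    })
    # After the groupby, neighbor_geoids is a Python list per county:
    #   county_geoid   neighbor_geoids
    #   01001          ['01021', '01047']
    #   01003          ['01053']
    return frame.groupby('county_geoid', as_index=False).agg(list)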
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_files = self.get_attr(attrs, 'filename')
    # In this instance, we expect filename to be a string with
    # comma-separated CSV filenames.
    if ',' not in gcs_files:
        raise ValueError('filename passed to write_to_bq is not a '
                         'comma-separated list of files')
    files = gcs_files.split(',')
    # For each of the files, we load it as a dataframe and add it as a
    # table in the BigQuery dataset. We expect that all aggregation and
    # standardization of the data has been done by this point.
    str_cols = [
        std_col.COUNTY_NAME_COL, std_col.STATE_NAME_COL,
        std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL, std_col.SEX_COL
    ]
    for f in files:
        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, f)
        # All columns are int, except certain geo and breakdown columns.
        column_types = {c: 'INT64' for c in df.columns}
        for col in str_cols:
            if col in column_types:
                column_types[col] = 'STRING'
        # Clean up column names.
        self.clean_frame_column_names(df)
        # Table name is the file name without the .csv extension.
        table_name = f.removesuffix('.csv')
        gcs_to_bq_util.append_dataframe_to_bq(
            df, dataset, table_name, column_types=column_types)
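
# Hypothetical illustration (file names below are invented for the example,
# not real pipeline inputs) of the comma-separated "filename" convention used
# above: each CSV in the list becomes its own BigQuery table named after the
# file with the ".csv" suffix dropped.
def _split_filenames_example():
    gcs_files = 'breakdown_by_state.csv,breakdown_by_county.csv'
    files = gcs_files.split(',')
    # ['breakdown_by_state', 'breakdown_by_county']
    return [f.removesuffix('.csv') for f in files]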
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_file = self.get_attr(attrs, 'filename')

    # Download the raw data
    df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
    self.clean_frame_column_names(df)

    # Standardize the data
    # The metadata currently only has information for cases and deaths,
    # not tests or hospitalizations.
    keep_cols = [
        'state_postal_abbreviation',
        'api_death', 'defines_other_death',
        'race_ethnicity_separately_death', 'race_ethnicity_combined_death',
        'race_mutually_exclusive_death',
        'combined_category_other_than_api_death', 'race_death',
        'ethnicity_death',
        'api_cases', 'defines_other_cases',
        'race_ethnicity_separately_cases', 'race_ethnicity_combined_cases',
        'race_mutually_exclusive_cases',
        'combined_category_other_than_api_cases', 'race_cases',
        'ethnicity_cases']
    df = df[keep_cols]
    # Reshape from one wide row per state to one row per state and
    # variable type ("cases"/"death"), with one column per metadata field.
    df = df.melt(id_vars=['state_postal_abbreviation'])
    df[['col_name', 'variable_type']] = df.variable.str.rsplit(
        '_', n=1, expand=True)
    df.drop('variable', axis=1, inplace=True)
    df = df.pivot(
        index=['state_postal_abbreviation', 'variable_type'],
        columns='col_name', values='value').reset_index()
    df.replace({'variable_type': {'death': 'deaths'}}, inplace=True)
    df.rename_axis(None, inplace=True)
    df.rename(columns=self._metadata_columns_map(), inplace=True)

    # Write to BQ
    gcs_to_bq_util.append_dataframe_to_bq(df, dataset, self.get_table_name())
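
# A small, self-contained sketch (toy data; column values are made up) of the
# melt -> rsplit -> pivot reshaping above: wide columns such as 'race_cases'
# and 'race_death' are split into a metadata field ('race') plus a
# variable_type ('cases'/'death'), producing one row per state and variable
# type with one column per metadata field.
import pandas as pd


def _metadata_reshape_example():
    df = pd.DataFrame({
        'state_postal_abbreviation': ['AK'],
        'race_cases': [1],
        'race_death': [0],
    })
    df = df.melt(id_vars=['state_postal_abbreviation'])
    df[['col_name', 'variable_type']] = df.variable.str.rsplit(
        '_', n=1, expand=True)
    df.drop('variable', axis=1, inplace=True)
    df = df.pivot(
        index=['state_postal_abbreviation', 'variable_type'],
        columns='col_name', values='value').reset_index()
    #   state_postal_abbreviation variable_type  race
    # 0                        AK         cases     1
    # 1                        AK         death     0
    return df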
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_file = self.get_attr(attrs, "filename")

    # Download the raw data
    df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
    df = self.standardize(df)

    # Write to BQ
    gcs_to_bq_util.add_dataframe_to_bq(df, dataset, self.get_table_name())
def write_to_bq(self, dataset, gcs_bucket, _):
    """Writes each file in the provided GCS bucket to BigQuery, one table
    per file.

    dataset: The BigQuery dataset to write to. This argument is ignored;
             the dataset is read from the MANUAL_UPLOADS_DATASET
             environment variable instead.
    gcs_bucket: The name of the gcs bucket to read the data from"""
    dataset = os.environ['MANUAL_UPLOADS_DATASET']
    manual_uploads_project = os.environ['MANUAL_UPLOADS_PROJECT']
    bucket_files = gcs_to_bq_util.list_bucket_files(gcs_bucket)
    for file_name in bucket_files:
        # Table name is the file name without its extension.
        table_name = file_name.split('.')[0]
        # Load the CSV in chunks of 1000 rows and append each chunk to the
        # table as string values.
        chunked_frame = gcs_to_bq_util.load_csv_as_dataframe(
            gcs_bucket, file_name, chunksize=1000)
        for chunk in chunked_frame:
            super().clean_frame_column_names(chunk)
            gcs_to_bq_util.append_dataframe_to_bq_as_str_values(
                chunk, dataset, table_name, project=manual_uploads_project)
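
# Hedged sketch of the chunked pattern above, assuming plain pandas and a
# local file path rather than the gcs_to_bq_util helper: read_csv with
# chunksize yields an iterator of DataFrames, so each 1000-row chunk can be
# cleaned and appended without holding the whole file in memory.
import pandas as pd


def _chunked_read_example(path='example.csv'):
    total_rows = 0
    for chunk in pd.read_csv(path, chunksize=1000):
        # Each chunk is an ordinary DataFrame of up to 1000 rows.
        total_rows += len(chunk.index)
    return total_rows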
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    filename = self.get_attr(attrs, "filename")
    df = gcs_to_bq_util.load_csv_as_dataframe(
        gcs_bucket, filename, parse_dates=["Date"], thousands=",")
    df = self.standardize(df)

    # Get the metadata table
    metadata = self._download_metadata(dataset)
    if len(metadata.index) == 0:
        raise RuntimeError(
            "BigQuery call to {} returned 0 rows".format(dataset))
    merged = CovidTrackingProject.merge_with_metadata(df, metadata)

    # Split into separate tables by variable type
    for variable_type in ["cases", "deaths", "tests", "hosp"]:
        result = merged.copy()
        result = result.loc[result["variable_type"] == variable_type]
        result.rename(columns={"value": variable_type}, inplace=True)
        result.drop("variable_type", axis="columns", inplace=True)

        # Write to BQ
        gcs_to_bq_util.add_dataframe_to_bq(
            result, dataset, self.get_table_name() + "_" + variable_type)
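
# Hedged sketch (toy data, not the real merged frame) of the per-variable
# split above: filtering on variable_type and renaming 'value' yields one
# narrow frame per metric, each destined for its own "<table>_<type>" table.
import pandas as pd


def _split_by_variable_type_example():
    merged = pd.DataFrame({
        'state_postal_abbreviation': ['AK', 'AK'],
        'variable_type': ['cases', 'deaths'],
        'value': [10, 1],
    })
    tables = {}
    for variable_type in ['cases', 'deaths', 'tests', 'hosp']:
        result = merged.loc[merged['variable_type'] == variable_type].copy()
        result = result.rename(columns={'value': variable_type})
        result = result.drop('variable_type', axis='columns')
        tables[variable_type] = result
    return tables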