def write_to_bq(self, dataset, gcs_bucket, filename):
    """Writes county adjacencies to BigQuery from the provided GCS bucket

    dataset: The BigQuery dataset to write to
    gcs_bucket: The name of the gcs bucket to read the data from
    filename: The name of the file in the gcs bucket to read from"""
    frame = gcs_to_bq_util.load_csv_as_dataframe(
        gcs_bucket, filename, dtype={
            'fipscounty': 'string',
            'fipsneighbor': 'string'
        })
    frame = frame[['fipscounty', 'fipsneighbor']]
    frame = frame.rename(columns={
        'fipscounty': 'county_geoid',
        'fipsneighbor': 'neighbor_geoids'
    })
    # Collapse the adjacency pairs into one row per county with a list of
    # neighboring county geoids.
    frame = frame.groupby('county_geoid', as_index=False).agg(list)
    column_types = {
        'county_geoid': 'STRING',
        'neighbor_geoids': 'STRING'
    }
    col_modes = {'neighbor_geoids': 'REPEATED'}
    gcs_to_bq_util.add_dataframe_to_bq(
        frame, dataset, self.get_table_name(), column_types=column_types,
        col_modes=col_modes)
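
# A minimal, self-contained sketch (illustrative only; the fips values and
# helper name below are made up, not part of the pipeline) of how the
# groupby(...).agg(list) step above turns adjacency pairs into one row per
# county holding a list of neighbor geoids, which is the shape the REPEATED
# col_modes setting expects when the frame is written to BigQuery.
import pandas as pd


def _adjacency_aggregation_example():
    frame = pd.DataFrame({
        'county_geoid': ['01001', '01001', '01003'],
        'neighbor_geoids': ['01021', '01047', '01053'],
    })
    # After the groupby, neighbor_geoids is a Python list per county:
    #   county_geoid   neighbor_geoids
    #   01001          ['01021', '01047']
    #   01003          ['01053']
    return frame.groupby('county_geoid', as_index=False).agg(list)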
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_files = self.get_attr(attrs, 'filename')
    # In this instance, we expect filename to be a string with
    # comma-separated CSV filenames.
    if ',' not in gcs_files:
        raise ValueError('filename passed to write_to_bq is not a '
                         'comma-separated list of files')
    files = gcs_files.split(',')
    # For each of the files, we load it as a dataframe and add it as a
    # table in the BigQuery dataset. We expect that all aggregation and
    # standardization of the data has been done by this point.
    str_cols = [
        std_col.COUNTY_NAME_COL, std_col.STATE_NAME_COL,
        std_col.RACE_OR_HISPANIC_COL, std_col.AGE_COL, std_col.SEX_COL
    ]
    for f in files:
        df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, f)
        # All columns are int, except certain geo and breakdown columns.
        column_types = {c: 'INT64' for c in df.columns}
        for col in str_cols:
            if col in column_types:
                column_types[col] = 'STRING'
        # Clean up column names.
        self.clean_frame_column_names(df)
        # Table name is the file name without the .csv extension.
        table_name = f.removesuffix('.csv')
        gcs_to_bq_util.append_dataframe_to_bq(
            df, dataset, table_name, column_types=column_types)
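
# Hypothetical illustration (file names below are invented for the example,
# not real pipeline inputs) of the comma-separated "filename" convention used
# above: each CSV in the list becomes its own BigQuery table named after the
# file with the ".csv" suffix dropped.
def _split_filenames_example():
    gcs_files = 'breakdown_by_state.csv,breakdown_by_county.csv'
    files = gcs_files.split(',')
    # ['breakdown_by_state', 'breakdown_by_county']
    return [f.removesuffix('.csv') for f in files]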
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_file = self.get_attr(attrs, 'filename')

    # Download the raw data
    df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
    self.clean_frame_column_names(df)

    # Standardize the data
    # The metadata currently only has information for cases and deaths,
    # not tests or hospitalizations.
    keep_cols = [
        'state_postal_abbreviation',
        'api_death', 'defines_other_death',
        'race_ethnicity_separately_death', 'race_ethnicity_combined_death',
        'race_mutually_exclusive_death',
        'combined_category_other_than_api_death', 'race_death',
        'ethnicity_death',
        'api_cases', 'defines_other_cases',
        'race_ethnicity_separately_cases', 'race_ethnicity_combined_cases',
        'race_mutually_exclusive_cases',
        'combined_category_other_than_api_cases', 'race_cases',
        'ethnicity_cases']
    df = df[keep_cols]
    # Reshape from one wide row per state to one row per state and
    # variable type ("cases"/"death"), with one column per metadata field.
    df = df.melt(id_vars=['state_postal_abbreviation'])
    df[['col_name', 'variable_type']] = df.variable.str.rsplit(
        '_', n=1, expand=True)
    df.drop('variable', axis=1, inplace=True)
    df = df.pivot(
        index=['state_postal_abbreviation', 'variable_type'],
        columns='col_name', values='value').reset_index()
    df.replace({'variable_type': {'death': 'deaths'}}, inplace=True)
    df.rename_axis(None, inplace=True)
    df.rename(columns=self._metadata_columns_map(), inplace=True)

    # Write to BQ
    gcs_to_bq_util.append_dataframe_to_bq(df, dataset, self.get_table_name())
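
# A small, self-contained sketch (toy data; column values are made up) of the
# melt -> rsplit -> pivot reshaping above: wide columns such as 'race_cases'
# and 'race_death' are split into a metadata field ('race') plus a
# variable_type ('cases'/'death'), producing one row per state and variable
# type with one column per metadata field.
import pandas as pd


def _metadata_reshape_example():
    df = pd.DataFrame({
        'state_postal_abbreviation': ['AK'],
        'race_cases': [1],
        'race_death': [0],
    })
    df = df.melt(id_vars=['state_postal_abbreviation'])
    df[['col_name', 'variable_type']] = df.variable.str.rsplit(
        '_', n=1, expand=True)
    df.drop('variable', axis=1, inplace=True)
    df = df.pivot(
        index=['state_postal_abbreviation', 'variable_type'],
        columns='col_name', values='value').reset_index()
    #   state_postal_abbreviation variable_type  race
    # 0                        AK         cases     1
    # 1                        AK         death     0
    return df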
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    gcs_file = self.get_attr(attrs, "filename")

    # Download the raw data
    df = gcs_to_bq_util.load_csv_as_dataframe(gcs_bucket, gcs_file)
    df = self.standardize(df)

    # Write to BQ
    gcs_to_bq_util.add_dataframe_to_bq(df, dataset, self.get_table_name())
def write_to_bq(self, dataset, gcs_bucket, _):
    """Writes each file in the provided GCS bucket to BigQuery, one table
    per file.

    dataset: The BigQuery dataset to write to. This argument is ignored;
             the dataset is read from the MANUAL_UPLOADS_DATASET
             environment variable instead.
    gcs_bucket: The name of the gcs bucket to read the data from"""
    dataset = os.environ['MANUAL_UPLOADS_DATASET']
    manual_uploads_project = os.environ['MANUAL_UPLOADS_PROJECT']
    bucket_files = gcs_to_bq_util.list_bucket_files(gcs_bucket)
    for file_name in bucket_files:
        # Table name is the file name without its extension.
        table_name = file_name.split('.')[0]
        # Load the CSV in chunks of 1000 rows and append each chunk to the
        # table as string values.
        chunked_frame = gcs_to_bq_util.load_csv_as_dataframe(
            gcs_bucket, file_name, chunksize=1000)
        for chunk in chunked_frame:
            super().clean_frame_column_names(chunk)
            gcs_to_bq_util.append_dataframe_to_bq_as_str_values(
                chunk, dataset, table_name, project=manual_uploads_project)
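
# Hedged sketch of the chunked pattern above, assuming plain pandas and a
# local file path rather than the gcs_to_bq_util helper: read_csv with
# chunksize yields an iterator of DataFrames, so each 1000-row chunk can be
# cleaned and appended without holding the whole file in memory.
import pandas as pd


def _chunked_read_example(path='example.csv'):
    total_rows = 0
    for chunk in pd.read_csv(path, chunksize=1000):
        # Each chunk is an ordinary DataFrame of up to 1000 rows.
        total_rows += len(chunk.index)
    return total_rows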
def write_to_bq(self, dataset, gcs_bucket, **attrs):
    filename = self.get_attr(attrs, "filename")
    df = gcs_to_bq_util.load_csv_as_dataframe(
        gcs_bucket, filename, parse_dates=["Date"], thousands=",")
    df = self.standardize(df)

    # Get the metadata table
    metadata = self._download_metadata(dataset)
    if len(metadata.index) == 0:
        raise RuntimeError(
            "BigQuery call to {} returned 0 rows".format(dataset))
    merged = CovidTrackingProject.merge_with_metadata(df, metadata)

    # Split into separate tables by variable type
    for variable_type in ["cases", "deaths", "tests", "hosp"]:
        result = merged.copy()
        result = result.loc[result["variable_type"] == variable_type]
        result.rename(columns={"value": variable_type}, inplace=True)
        result.drop("variable_type", axis="columns", inplace=True)

        # Write to BQ
        gcs_to_bq_util.add_dataframe_to_bq(
            result, dataset, self.get_table_name() + "_" + variable_type)
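
# Hedged sketch (toy data, not the real merged frame) of the per-variable
# split above: filtering on variable_type and renaming 'value' yields one
# narrow frame per metric, each destined for its own "<table>_<type>" table.
import pandas as pd


def _split_by_variable_type_example():
    merged = pd.DataFrame({
        'state_postal_abbreviation': ['AK', 'AK'],
        'variable_type': ['cases', 'deaths'],
        'value': [10, 1],
    })
    tables = {}
    for variable_type in ['cases', 'deaths', 'tests', 'hosp']:
        result = merged.loc[merged['variable_type'] == variable_type].copy()
        result = result.rename(columns={'value': variable_type})
        result = result.drop('variable_type', axis='columns')
        tables[variable_type] = result
    return tables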