Example #1
from datetime import datetime, timedelta
from typing import Union

import pandas
from google.cloud import bigquery
from google.cloud.bigquery import Client, Table

# _get_table_id_for_new_entity is a project-internal helper, assumed in scope.


def _upload_entity_df_into_bigquery(
    client: Client,
    project: str,
    dataset_name: str,
    dataset_project: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    table_id = _get_table_id_for_new_entity(client, project, dataset_name,
                                            dataset_project)

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_id} AS ({entity_df})")
        job.result()
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(entity_df,
                                               table_id,
                                               job_config=job_config)
        job.result()
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a Pandas DataFrame "
            f"or a BigQuery SQL query, but we found: {type(entity_df)}"
        )

    # Ensure that the table expires after some time
    table = client.get_table(table=table_id)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
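For orientation, a minimal usage sketch; the client, project, and dataset names below are hypothetical placeholders, and running it requires GCP credentials:

client = Client()
entity_df = pandas.DataFrame(
    {
        "driver_id": [1001, 1002],
        "event_timestamp": pandas.to_datetime(["2021-04-12", "2021-04-13"]),
    }
)
table = _upload_entity_df_into_bigquery(
    client,
    project="feast_project",        # hypothetical Feast project name
    dataset_name="feast_dataset",   # hypothetical BigQuery dataset
    dataset_project="my-gcp-project",
    entity_df=entity_df,
)
print(table.full_table_id, table.expires)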
Example #2
from datetime import datetime, timedelta
from typing import Dict, Union

import numpy as np
import pandas
from google.cloud import bigquery
from google.cloud.bigquery import Client

# block_until_done and InvalidEntityType are project-internal helpers,
# assumed in scope.


def _upload_entity_df_and_get_entity_schema(
    client: Client,
    table_name: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
        block_until_done(client, job)

        limited_entity_df = (client.query(
            f"SELECT * FROM {table_name} LIMIT 1").result().to_dataframe())
        entity_schema = dict(
            zip(limited_entity_df.columns, limited_entity_df.dtypes))
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(entity_df,
                                               table_name,
                                               job_config=job_config)
        block_until_done(client, job)

        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return entity_schema
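The return value maps column names to NumPy dtypes. A minimal sketch of a call, assuming the helpers above are in scope and table_name points at a dataset the client may write to (all names hypothetical):

client = Client()
entity_schema = _upload_entity_df_and_get_entity_schema(
    client,
    table_name="my-gcp-project.feast_dataset.entity_df_tmp",  # hypothetical
    entity_df=pandas.DataFrame({"driver_id": [1001], "trips": [7.5]}),
)
print(entity_schema)  # e.g. {'driver_id': dtype('int64'), 'trips': dtype('float64')}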
Example #3
def apply_schema_differences(
    schema_diffs: _SchemaDiffs,
    bigquery_client: BigQueryClient,
) -> None:
    print("Applying changes...")
    for table_identifier, difference in schema_diffs.items():
        if isinstance(difference, MissingTable):
            print("Creating table...")
            table = Table(
                table_identifier,
                schema=difference.local_table.get_schema_fields(),
            )
            if difference.local_table.time_partitioning:
                table.time_partitioning = difference.local_table.time_partitioning
            remote_table = bigquery_client.create_table(table)
            print(remote_table)
        elif isinstance(difference, ExistingTable):
            difference.remote_table.schema = difference.local_table.get_schema_fields()
            print(
                bigquery_client.update_table(difference.remote_table, ["schema"])
            )