Example no. 1
def exist_table(client: bq.Client, dataset_id: str, table_id: str) -> bool:
    table_full_id = get_full_table_name(client, dataset_id, table_id)
    try:
        client.get_table(table_full_id)
    except NotFound:
        return False
    return True
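Note that get_full_table_name above is a project-specific helper that is not shown in this excerpt. A minimal, self-contained sketch of the same existence check using only the public client API might look like this (the table path in the usage comment is hypothetical):

from google.api_core.exceptions import NotFound
from google.cloud import bigquery as bq


def table_exists(client: bq.Client, full_table_id: str) -> bool:
    """Return True if a table identified as "project.dataset.table" exists."""
    try:
        client.get_table(full_table_id)  # get_table accepts a string table ID
        return True
    except NotFound:
        return False


# Hypothetical usage:
# client = bq.Client()
# table_exists(client, "my-project.my_dataset.my_table")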
Example no. 2
def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str):
    num_items = 7
    page_size = 3
    num_pages, num_last_page = divmod(num_items, page_size)

    to_insert = [{
        "string_col": "item%d" % i,
        "rowindex": i
    } for i in range(num_items)]
    bigquery_client.load_table_from_json(to_insert, table_id).result()

    row_iterator = bigquery_client.list_rows(
        table_id,
        selected_fields=[
            bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)
        ],
        page_size=page_size,
    )
    pages = row_iterator.pages

    for i in range(num_pages):
        page = next(pages)
        assert page.num_items == page_size
    page = next(pages)
    assert page.num_items == num_last_page
Example no. 3
def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")

    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
    else:
        raise InvalidEntityType(type(entity_df))

    block_until_done(client, job)

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
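block_until_done and InvalidEntityType are helpers from the surrounding codebase and are not defined in this excerpt. A hedged usage sketch, assuming the helper above is in scope and the destination table path (illustrative below) is writable:

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client()
entity_df = pd.DataFrame({
    "driver_id": [1001, 1002],
    "event_timestamp": pd.to_datetime(["2021-09-01", "2021-09-02"]),
})

# The helper loads the dataframe and sets a 30-minute expiry on the result.
table = _upload_entity_df(client, "my-project.my_dataset.entity_df_tmp", entity_df)
print(table.full_table_id, table.expires)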
Example no. 4
        def load_task():
            client = Client()
            job_config = LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = SourceFormat.CSV if file_format == 'csv' else SourceFormat.NEWLINE_DELIMITED_JSON
            if file_format == 'csv':
                job_config.skip_leading_rows = 1
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.allow_quoted_newlines = allow_quoted_newlines
            job_config.ignore_unknown_values = True

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_location_uri,
                task=task,
                file_format=file_format)
            table_ref = client.dataset(dataset_name_raw).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'
Example no. 5
        def create_view_task(ds, **kwargs):

            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            table = Table(dest_table_ref)

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            table.view_query = sql

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

            try:
                table = client.create_table(table)
            except Conflict:
                # https://cloud.google.com/bigquery/docs/managing-views
                table = client.update_table(table, ['view_query'])
            assert table.table_id == dest_table_name
Example no. 6
def dataset_id(bigquery_client: bigquery.Client, project_id: str):
    dataset_id = prefixer.create_prefix()
    full_dataset_id = f"{project_id}.{dataset_id}"
    dataset = bigquery.Dataset(full_dataset_id)
    bigquery_client.create_dataset(dataset)
    yield dataset_id
    bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)
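prefixer here appears to be a shared test helper that generates unique resource names, and the @pytest.fixture decorator has been stripped by the excerpt. A self-contained sketch of the same pattern, swapping the prefixer for a uuid-based name (all names are illustrative):

import uuid

import pytest
from google.cloud import bigquery


@pytest.fixture
def dataset_id():
    """Yield a throwaway dataset ID and drop the dataset (and its contents) afterwards."""
    client = bigquery.Client()
    dataset_id = f"tmp_dataset_{uuid.uuid4().hex[:8]}"
    client.create_dataset(f"{client.project}.{dataset_id}")
    yield dataset_id
    client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)


def test_dataset_exists(dataset_id):
    client = bigquery.Client()
    assert client.get_dataset(dataset_id) is not None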
Example no. 7
def dataset_id(client: bigquery.Client):
    project_id = client.project
    dataset_id = prefixer.create_prefix()
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = client.create_dataset(dataset)
    yield dataset_id
    client.delete_dataset(dataset_id, delete_contents=True)
Example no. 8
def load_parquet(
    client: bigquery.Client,
    dataframe: pandas.DataFrame,
    destination_table_ref: bigquery.TableReference,
    location: Optional[str],
    schema: Optional[Dict[str, Any]],
    billing_project: Optional[str] = None,
):
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    job_config.create_disposition = "CREATE_NEVER"
    job_config.source_format = "PARQUET"

    if schema is not None:
        schema = pandas_gbq.schema.remove_policy_tags(schema)
        job_config.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
        dataframe = cast_dataframe_for_parquet(dataframe, schema)

    try:
        client.load_table_from_dataframe(
            dataframe,
            destination_table_ref,
            job_config=job_config,
            location=location,
            project=billing_project,
        ).result()
    except pyarrow.lib.ArrowInvalid as exc:
        raise exceptions.ConversionError(
            "Could not convert DataFrame to Parquet.") from exc
Example no. 9
def main():
    args = get_sysargs(sys.argv[1:])
    exporter = Exporter()
    jinjafy = Jinjafy('../data/queries/marreco/datajet/')

    client = Client()
    dataset = client.dataset(args.dataset)
    table = dataset.table(args.table)

    for day in range(args.days_init, args.days_end - 1, -1):
        print('processing day: ', day)
        for idx, file_ in enumerate(['productview.sql',
                                     'search.sql',
                                     'purchase.sql']):

            query = build_query(jinjafy,
                                file_,
                                {'dataset': '40663402',
                                 'days_interval': day,
                                 'days_interval_end': day})

            exporter.bq_to_gcs(client,
                               query,
                               {'destination': table,
                                'maximum_bytes_billed': 1000000000000,
                                'write_disposition': 'WRITE_TRUNCATE'},
                               {'uri': args.uri.format(day=day, idx=idx),
                                'table': table,
                                'compression': 'GZIP',
                                'destination_format': 'NEWLINE_DELIMITED_JSON'})
Example no. 10
def load_data_to_bq(df=None,
                    table_name='CRY',
                    dataset='price_data',
                    project=None):
    client = Client()
    table = client.get_table(".".join([client.project, dataset, table_name]))
    if table.num_rows == 0 and df is not None:
        df.to_gbq(".".join([dataset, table_name]), if_exists='append')
    else:
        delete_qry = f'''DELETE FROM `{dataset+"."+table_name}` AS t2
                         WHERE concat(symbol, cast(date as string)) IN
                         (SELECT concat(symbol, cast(MAX(date) as string))
                         FROM `{dataset+"."+table_name}`
                         GROUP BY symbol) AND symbol IN
                         {'("'+'","'.join(df.symbol.unique())+'")'}'''
        delete_DML = client.query(delete_qry)
        delete_DML.result()
        existing = pd.read_gbq(f'''select symbol, max(date) as max_date
                               from {dataset+"."+table_name}
                               group by symbol''',
                               dialect="legacy")
        df = df.merge(existing, on='symbol', how='left')
        df = df.loc[(df.date.dt.tz_localize('UTC') > df.max_date)
                    | df.max_date.isnull(), :]
        df.drop('max_date', axis=1, inplace=True)
        df.to_gbq(".".join([dataset, table_name]),
                  if_exists='append',
                  project_id=project)
Example no. 11
def _add_new_columns(client: bigquery.Client, table_id: str,
                     columns: List[str]) -> List[Dict]:
    """Adds any new columns if they are missing.

  Creates new string columns for every column if it doesn't exist.

  Args:
    client: The BigQuery client.
    table_id: Table id.
    columns: List of columns.

  Returns:
    The table schema.
  """
    try:
        table = client.get_table(table_id)
    except google.api_core.exceptions.NotFound:
        logging.error(
            'Table: \'%s\' not found - please create the table. It is okay to create it with no columns.',
            table_id)
        raise
    new_fields = []
    for c in columns:
        field = bigquery.SchemaField(c, 'STRING')
        if field not in table.schema:
            new_fields.append(field)

    if new_fields:
        logging.info('Found new fields: %s', new_fields)
        table.schema += new_fields
        client.update_table(table, ['schema'])

    return table.schema
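A hypothetical call, assuming the destination table already exists (it may have been created with no columns, as the error message above suggests); the table ID and column names are illustrative:

from google.cloud import bigquery

client = bigquery.Client()
schema = _add_new_columns(
    client,
    "my-project.my_dataset.responses",        # hypothetical existing table
    ["respondent_id", "answer", "channel"],   # columns that must be present
)
print([field.name for field in schema])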
Example no. 12
 def _connect(self):
     if self._bq_creds_file is not None:
         if Path(self._bq_creds_file).exists():
             os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', self._bq_creds_file)
         else:
             _log.warning('Path set by creds file does not exist: %s', self._bq_creds_file)
     self._conn = Client(**self._conn_kwargs)
Example no. 13
def update_recently_unixtime(client: bigquery.Client, df_unixtime):

    table_id = f'{project_id}.{dataset}.{recently_unixtime_table}'
    # Insert the unixtime dataframe into the unixtime management table
    client.insert_rows_from_dataframe(client.get_table(table_id), df_unixtime)

    # Remove duplicate TABLE_NAME rows from the unixtime management table (keeping the most recent UNIX_TIME)
    duplicate_query = f"""
        SELECT
            * EXCEPT(rowNumber)
        FROM (
            SELECT
                *,
                ROW_NUMBER() OVER (
                    PARTITION BY
                        TABLE_NAME
                    ORDER BY
                        UNIX_TIME DESC
                ) as rowNumber
            FROM
                {table_id}
        )
        WHERE
            rowNumber = 1;
    """

    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_id
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = client.query(duplicate_query, job_config=job_config)
    job.result()
Example no. 14
def dest_partitioned_table_allow_jagged(bq: bigquery.Client, dest_dataset,
                                        monkeypatch) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    if os.getenv('GCP_PROJECT') is None:
        monkeypatch.setenv("GCP_PROJECT", bq.project)

    extra_field_for_jagged_row_test = bigquery.schema.SchemaField(
        "extra_jagged_row_test_column", "STRING")
    schema.append(extra_field_for_jagged_row_test)
    table: bigquery.Table = bigquery.Table(
        f"{os.getenv('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)
    return table
Example no. 15
  def _table_exists(self, bq: bigquery.Client, table_ref: bigquery.TableReference) -> bool:
    try:
        bq.get_table(table_ref)
        return True

    except NotFound:
        return False
Example no. 16
def dest_partitioned_table(request, bq: bigquery.Client, mock_env,
                           dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-','_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table
Example no. 17
def update_or_create_table_from_csv(
    client: bigquery.Client,
    table_name: str,
    source_file: str,
    dataset: str,
    source_schema_file: str,
):
    LOGGER.debug("update_or_create_table_from_csv: %s=%s", table_name,
                 [source_file])
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(table_name)

    job_config = LoadJobConfig()
    job_config.source_format = "CSV"
    job_config.skip_leading_rows = 1
    if Path(source_schema_file).exists():
        job_config.schema = get_table_schema(source_schema_file)
    else:
        job_config.autodetect = True
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    with open(source_file, "rb") as source_fp:
        load_job = client.load_table_from_file(source_fp,
                                               destination=table_ref,
                                               job_config=job_config)

    # wait for job to complete
    load_job.result()

    LOGGER.info("updated config table: %s", table_ref.table_id)
Example no. 18
def retry_query(gcs_client: storage.Client, bq_client: bigquery.Client,
                lock_blob: storage.Blob, failed_job_id: str,
                table: bigquery.TableReference, retry_attempt_cnt):
    """Retry a query that failed"""
    if retry_attempt_cnt > 1:
        # if this is not the first retry, truncate over the previous
        # job_id retry attempt suffix '_xx' (3 chars)
        retry_job_id = f"{failed_job_id[:-3]}_{retry_attempt_cnt:02}"  # pad with zero
    else:
        retry_job_id = f"{failed_job_id}_{retry_attempt_cnt:02}"  # pad with zero
    failed_job: bigquery.QueryJob = bq_client.get_job(failed_job_id)
    job_config: bigquery.QueryJobConfig = bigquery.QueryJobConfig(
        table_definitions=failed_job.table_definitions, use_legacy_sql=False)
    retry_job = bq_client.query(failed_job.query,
                                job_config=job_config,
                                job_id=retry_job_id)
    # To keep track of retry attempts between cloud
    # function invocations, the retry count state is
    # kept in the _bqlock lock file.
    utils.handle_bq_lock(gcs_client,
                         lock_blob,
                         retry_job_id,
                         table,
                         retry_attempt_cnt=retry_attempt_cnt)
    logging.log_bigquery_job(
        retry_job, table, f"Submitted asynchronous query job: {retry_job_id}")
Example no. 19
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write the data into a temporary table first
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
    """
    )

    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)
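logger and _convert_ndjson_to_json are module-level helpers from the original file. A hedged usage sketch, with illustrative project, dataset, and bucket names, assuming those helpers are in scope:

from google.cloud import bigquery, storage

bq_client = bigquery.Client()
gcs_client = storage.Client()

# Materialize the view into a temporary table, extract it to GCS as NDJSON,
# then post-process it into plain JSON.
_export_table(
    bq_client,
    project_id="my-project",
    dataset_id="analytics",
    table="daily_summary_view",
    bucket="my-export-bucket",
    storage_client=gcs_client,
)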
Example no. 20
def delete_views_or_table(client: bigquery.Client, view_or_table_name: str,
                          dataset: str):
    LOGGER.debug("delete_views_or_tables: %s", view_or_table_name)
    dataset_ref = client.dataset(dataset)
    table_ref = dataset_ref.table(view_or_table_name)
    client.delete_table(table_ref)
    LOGGER.info("deleted view or table: %s", view_or_table_name)
Example no. 21
        def create_view_task():

            client = Client()

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            table = Table(dest_table_ref)

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            table.view_query = sql

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

            try:
                table = client.create_table(table)
            except Conflict:
                # https://cloud.google.com/bigquery/docs/managing-views
                table = client.update_table(table, ['view_query'])
            assert table.table_id == dest_table_name
Example no. 22
def execute_query(bq_client: bigquery.Client, env_vars: dict, query_path: object,
                  output_table_name: str, time_partition: bool) -> None:
    """Executes transformation query to a new destination table.
    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time Partitioning table is only needed for final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)
    logging.info('Attempting query...')
    # Execute Query
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)

    query_job.result()  # Waits for the query to finish
Example no. 23
def exist_dataset(client: bq.Client, dataset_id: str) -> bool:
    dataset_full_id = get_full_dataset_name(client, dataset_id)
    try:
        client.get_dataset(dataset_full_id)
    except NotFound:
        return False
    return True
Example no. 24
def get_or_create_table(client: bigquery.Client, dataset_id: str,
                        table_id: str) -> bigquery.Table:
    """
    BigQueryのデータセットとテーブルを作成する。既に存在する場合は取得する。
    """
    logging.info(f'Creating dataset {dataset_id} if not exists...')
    dataset = client.create_dataset(dataset_id,
                                    exists_ok=True)  # データセットを作成または取得する。

    logging.info(f'Creating table {dataset_id}.{table_id} if not exists...')
    table_ref = dataset.table(table_id)
    return client.create_table(  # テーブルを作成または取得する。
        bigquery.Table(table_ref,
                       schema=[
                           bigquery.SchemaField('id',
                                                'string',
                                                description='ツイートのID'),
                           bigquery.SchemaField('lang',
                                                'string',
                                                description='ツイートの言語'),
                           bigquery.SchemaField('screen_name',
                                                'string',
                                                description='ユーザー名'),
                           bigquery.SchemaField('text',
                                                'string',
                                                description='ツイートの本文'),
                           bigquery.SchemaField('created_at',
                                                'timestamp',
                                                description='ツイートの日時'),
                       ]),
        exists_ok=True)
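A hypothetical follow-up that creates (or fetches) the table and streams a single row into it; the dataset and table names are illustrative:

from google.cloud import bigquery

client = bigquery.Client()
table = get_or_create_table(client, 'tweets_dataset', 'tweets')
errors = client.insert_rows_json(table, [{
    'id': '1',
    'lang': 'en',
    'screen_name': 'example_user',
    'text': 'hello',
    'created_at': '2021-01-01T00:00:00Z',
}])
assert not errors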
Example no. 25
def main():
    from google.cloud.bigquery import Client

    tables = {}
    with open("/vol/required_tables.txt") as rt:
        table_names = rt.read().split()
        bq_client = Client()
        for table_name in table_names:
            split_table_name = table_name.split(".")
            if len(split_table_name) == 3:
                dataset_ref = bq_client.dataset(split_table_name[1],
                                                project=split_table_name[0])
            else:
                dataset_ref = bq_client.dataset(split_table_name[0])
            table_ref = dataset_ref.table(split_table_name[-1])
            table = bq_client.get_table(table_ref)
            tables[table_name] = [
                field.to_api_repr() for field in table.schema
            ]
            if table_name.endswith("*"):
                tables[table_name].append({
                    "name": "_TABLE_SUFFIX",
                    "type": "STRING",
                    "mode": "REQUIRED"
                })
    with open("/vol/schema.json", mode="w") as schema:
        schema.write(json.dumps(tables))
Example no. 26
def _recreate_table(client: bigquery.Client, table_id: str, schema: list) -> None:
    logger.info(f'Recreating {table_id} table...')
    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    client.delete_table(table_ref, not_found_ok=True)
    client.create_table(table)
Example no. 27
def _upload_entity_df_into_bigquery(
    client: Client,
    project: str,
    dataset_name: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    table_id = _get_table_id_for_new_entity(client, project, dataset_name)

    if type(entity_df) is str:
        job = client.query(f"CREATE TABLE {table_id} AS ({entity_df})")
        job.result()
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(entity_df,
                                               table_id,
                                               job_config=job_config)
        job.result()
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a Pandas DataFrame or BigQuery SQL query, "
            f"but we found: {type(entity_df)} ")

    # Ensure that the table expires after some time
    table = client.get_table(table=table_id)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
Example no. 28
def iter_results(
    bigquery_client: bigquery.Client,
    query: str,
    job_config: QueryJobConfig,
    df_cleaner: Callable[[pd.DataFrame], pd.DataFrame] = None,
) -> Generator[pd.Series, None, None]:
    """
    Page through the results of a query and yield each row as a pandas Series

    Args:
        bigquery_client (bigquery.Client): The BigQuery client
        query (str): The query to run
        job_config (QueryJobConfig): The BigQuery job config
        df_cleaner (Callable, optional): Function applied to each DataFrame page
            before its rows are yielded

    Returns:
        Generator[pd.Series, None, None]: A generator of pandas Series
    """

    query_job = bigquery_client.query(query, job_config=job_config)
    query_job.result()

    # Get reference to destination table
    destination = bigquery_client.get_table(query_job.destination)

    rows = bigquery_client.list_rows(destination, page_size=10000)

    dfs = rows.to_dataframe_iterable()

    for df in dfs:
        if df_cleaner is not None:
            df = df_cleaner(df)

        for index, row in df.iterrows():
            yield row
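A hypothetical usage that pages through a public table and processes each row lazily; the query and LIMIT are only for illustration:

from google.cloud import bigquery

client = bigquery.Client()
query = """
    SELECT name, number
    FROM `bigquery-public-data.usa_names.usa_1910_2013`
    LIMIT 25000
"""

for row in iter_results(client, query, bigquery.QueryJobConfig()):
    print(row["name"], row["number"])  # each row is a pandas Series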
Example no. 29
def clean_up_bq_tables(client: cloud_bigquery.Client,
                       table_names: List[str]) -> None:
    for table_name in table_names:
        try:
            client.get_table(table_name)
            client.delete_table(table_name)
        except NotFound:
            pass
Example no. 30
def load_views(bq: bigquery.Client, dataset: bigquery.Dataset,
               views: Dict[str, str]):
    """Load views for a test."""
    for table, view_query in views.items():
        view = bigquery.Table(dataset.table(table))
        view.view_query = view_query.format(project=dataset.project,
                                            dataset=dataset.dataset_id)
        bq.create_table(view)
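A hypothetical call that creates two simple views in an existing test dataset; the dataset and the referenced source tables are illustrative, and the {project}/{dataset} placeholders are filled in by load_views itself:

from google.cloud import bigquery

bq = bigquery.Client()
dataset = bq.get_dataset("my-project.my_test_dataset")

load_views(bq, dataset, {
    "active_users": "SELECT * FROM `{project}.{dataset}.users` WHERE active",
    "recent_events": (
        "SELECT * FROM `{project}.{dataset}.events` "
        "WHERE ts > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)"
    ),
})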