Code Example #1
def _upload_entity_df_and_get_entity_schema(
    client: Client, table_name: str, entity_df: Union[pd.DataFrame, str],
) -> Dict[str, np.dtype]:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    if type(entity_df) is str:
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")
        block_until_done(client, job)

        limited_entity_df = (
            client.query(f"SELECT * FROM {table_name} LIMIT 1").result().to_dataframe()
        )

        entity_schema = dict(zip(limited_entity_df.columns, limited_entity_df.dtypes))
    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
        block_until_done(client, job)
        entity_schema = dict(zip(entity_df.columns, entity_df.dtypes))
    else:
        raise InvalidEntityType(type(entity_df))

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return entity_schema
Code Example #2
def _mark_existing_metadata_row_as_processed_helper(
    *,
    table_name: str,
    project_id: str,
    dry_run: bool,
    client: bigquery.Client,
    file_id: int,
    processed_time: datetime.datetime,
) -> None:
    query = f"""
        UPDATE
            `{project_id}.direct_ingest_processing_metadata.{table_name}`
        SET
            processed_time = DATETIME "{processed_time}"
        WHERE
            file_id = {file_id}
    """
    if dry_run:
        logging.info("[DRY RUN] Would have run query to mark as processed: %s",
                     query)
        return

    query_job = client.query(query)
    query_job.result()
    logging.info("Ran query to mark as processed: %s", query)
Code Example #3
File: bigquery.py Project: feast-dev/feast
def _upload_entity_df(
    client: Client,
    table_name: str,
    entity_df: Union[pd.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    if isinstance(entity_df, str):
        job = client.query(f"CREATE TABLE {table_name} AS ({entity_df})")

    elif isinstance(entity_df, pd.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)
        job = client.load_table_from_dataframe(entity_df, table_name)
    else:
        raise InvalidEntityType(type(entity_df))

    block_until_done(client, job)

    # Ensure that the table expires after some time
    table = client.get_table(table=table_name)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
Code Example #4
def update_recently_unixtime(client: bigquery.Client, df_unixtime):

    table_id = f'{project_id}.{dataset}.{recently_unixtime_table}'
    # Insert the unixtime dataframe into the unixtime tracking table
    client.insert_rows_from_dataframe(client.get_table(table_id), df_unixtime)

    # Remove rows with duplicate TABLE_NAME values from the unixtime tracking table, keeping only the latest UNIX_TIME
    duplicate_query = f"""
        SELECT
            * EXCEPT(rowNumber)
        FROM (
            SELECT
                *,
                ROW_NUMBER() OVER (
                    PARTITION BY
                        TABLE_NAME
                    ORDER BY
                        UNIX_TIME DESC
                ) as rowNumber
            FROM
                {table_id}
        )
        WHERE
            rowNumber = 1;
    """

    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_id
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job = client.query(duplicate_query, job_config=job_config)
    job.result()
Code Example #5
File: utils.py Project: pnchbck/pulse-data
def get_file_id_and_processed_status_for_file(
        *, metadata_type: MetadataType, project_id: str, region_code: str,
        client: bigquery.Client,
        normalized_file_name: str) -> Tuple[Optional[int], bool]:
    """Checks to see if the provided |normalized_file_name| has been registered in the raw_data_metadata table. If
    it has, it returns the file's file_id and whether or not the file has already been processed. If it has not,
    returns None, False
    """
    table_name = get_table_name_for_type(metadata_type=metadata_type)
    table_id = f'{project_id}.direct_ingest_processing_metadata.{table_name}'
    query = f"""SELECT file_id, processed_time FROM `{table_id}`
             WHERE region_code = '{region_code}' AND normalized_file_name = '{normalized_file_name}'"""
    query_job = client.query(query)
    rows = query_job.result()

    if rows.total_rows > 1:
        raise ValueError(
            f'Expected there to only be one row per combination of {region_code} and {normalized_file_name}'
        )

    if not rows.total_rows:
        # TODO(3020): Once metadata tables are in postgres (and we don't have any limits on UPDATE queries), insert
        #  a row here that will have the processed_time filled in later.
        logging.info('\nNo row found for %s and %s in %s.',
                     normalized_file_name, region_code, table_id)
        return None, False

    row = one(rows)
    file_id = row.get('file_id')
    processed_time = row.get('processed_time')
    logging.info(
        'Found row for %s and %s with values file_id: %s and processed_time: %s',
        normalized_file_name, region_code, file_id, processed_time)
    return file_id, processed_time
Code Example #6
def execute_query(bq_client: bigquery.Client, env_vars: dict, query_path: object,
                  output_table_name: str, time_partition: bool) -> None:
    """Executes transformation query to a new destination table.
    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time Partitioning table is only needed for final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)
    logging.info('Attempting query...')
    # Execute Query
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)

    query_job.result()  # Waits for the query to finish
Code Example #7
def iter_results(
    bigquery_client: bigquery.Client,
    query: str,
    job_config: QueryJobConfig,
    df_cleaner: Callable[[pd.DataFrame], pd.DataFrame] = None,
) -> Generator[pd.Series, None, None]:
    """
    Page through the results of a query and yield each row as a pandas Series

    Args:
        bigquery_client (bigquery.Client): The BigQuery client
        query (str): The query to run
        job_config (QueryJobConfig): The BigQuery job config
        df_cleaner (Callable, optional): Function applied to each page DataFrame before its rows are yielded

    Returns:
        Generator[pd.Series, None, None]: A generator of pandas Series
    """

    query_job = bigquery_client.query(query, job_config=job_config)
    query_job.result()

    # Get reference to destination table
    destination = bigquery_client.get_table(query_job.destination)

    rows = bigquery_client.list_rows(destination, page_size=10000)

    dfs = rows.to_dataframe_iterable()

    for df in dfs:
        if df_cleaner is not None:
            df = df_cleaner(df)

        for index, row in df.iterrows():
            yield row
Code Example #8
File: diff.py Project: DataBiosphere/clinvar-ingest
def get_all_rows(table_name: str = table_name, dest_data_project: str = dest_data_project,
                 dest_dataset_name: str = dest_dataset_name, repo_data_project: str = repo_data_project,
                 repo_dataset_name: str = repo_dataset_name, release_date: str = release_date,
                 version_col: str = version_col, client: bigquery.Client = repo_client):
    query = f"""
    SELECT * EXCEPT (datarepo_row_id, release_date)
    FROM `{repo_data_project}.{repo_dataset_name}.{table_name}`
    WHERE {version_col} = "{release_date}"
    """

    dest_table_name = f"{table_name}_{DiffType.CREATE.value}"
    table_id = f"{dest_data_project}.{dest_dataset_name}.{dest_table_name}"

    job_config = bigquery.QueryJobConfig(destination=table_id)

    query_job = client.query(query, job_config=job_config)
    print("The query submitted is:")
    print(query)
    print(f"The destination table is: {table_id}")
    try:
        query_job.result()
    except GoogleCloudError as err:
        print(f"There was a {type(err)}")
        print(err)
    # return the destination table name
    return dest_table_name
Code Example #9
def retry_query(gcs_client: storage.Client, bq_client: bigquery.Client,
                lock_blob: storage.Blob, failed_job_id: str,
                table: bigquery.TableReference, retry_attempt_cnt):
    """Retry a query that failed"""
    if retry_attempt_cnt > 1:
        # if this is not the first retry, truncate over the previous
        # job_id retry attempt suffix '_xx' (3 chars)
        retry_job_id = f"{failed_job_id[:-3]}_{retry_attempt_cnt:02}"  # pad with zero
    else:
        retry_job_id = f"{failed_job_id}_{retry_attempt_cnt:02}"  # pad with zero
    failed_job: bigquery.QueryJob = bq_client.get_job(failed_job_id)
    job_config: bigquery.QueryJobConfig = bigquery.QueryJobConfig(
        table_definitions=failed_job.table_definitions, use_legacy_sql=False)
    retry_job = bq_client.query(failed_job.query,
                                job_config=job_config,
                                job_id=retry_job_id)
    # To keep track of retry attempts between cloud
    # function invocations, the retry count state is
    # kept in the _bqlock lock file.
    utils.handle_bq_lock(gcs_client,
                         lock_blob,
                         retry_job_id,
                         table,
                         retry_attempt_cnt=retry_attempt_cnt)
    logging.log_bigquery_job(
        retry_job, table, f"Submitted asynchronous query job: {retry_job_id}")
Code Example #10
def _upload_entity_df_into_bigquery(
    client: Client,
    project: str,
    dataset_name: str,
    entity_df: Union[pandas.DataFrame, str],
) -> Table:
    """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table"""

    table_id = _get_table_id_for_new_entity(client, project, dataset_name)

    if type(entity_df) is str:
        job = client.query(f"CREATE TABLE {table_id} AS ({entity_df})")
        job.result()
    elif isinstance(entity_df, pandas.DataFrame):
        # Drop the index so that we don't have unnecessary columns
        entity_df.reset_index(drop=True, inplace=True)

        # Upload the dataframe into BigQuery, creating a temporary table
        job_config = bigquery.LoadJobConfig()
        job = client.load_table_from_dataframe(entity_df,
                                               table_id,
                                               job_config=job_config)
        job.result()
    else:
        raise ValueError(
            f"The entity dataframe you have provided must be a Pandas DataFrame or BigQuery SQL query, "
            f"but we found: {type(entity_df)} ")

    # Ensure that the table expires after some time
    table = client.get_table(table=table_id)
    table.expires = datetime.utcnow() + timedelta(minutes=30)
    client.update_table(table, ["expires"])

    return table
Code Example #11
def load_data_to_bq(df=None,
                    table_name='CRY',
                    dataset='price_data',
                    project=None):
    client = Client()
    table = client.get_table(".".join([client.project, dataset, table_name]))
    if table.num_rows == 0 and df is not None:
        df.to_gbq(".".join([dataset, table_name]), if_exists='append')
    else:
        delete_qry = f'''DELETE FROM `{dataset+"."+table_name}` AS t2
                         WHERE concat(symbol, cast(date as string)) IN
                         (SELECT concat(symbol, cast(MAX(date) as string))
                         FROM `{dataset+"."+table_name}`
                         GROUP BY symbol) AND symbol IN
                         {'("'+'","'.join(df.symbol.unique())+'")'}'''
        delete_DML = client.query(delete_qry)
        delete_DML.result()
        existing = pd.read_gbq(f'''select symbol, max(date) as max_date
                               from {dataset+"."+table_name}
                               group by symbol''',
                               dialect="legacy")
        df = df.merge(existing, on='symbol', how='left')
        df = df.loc[(df.date.dt.tz_localize('UTC') > df.max_date)
                    | df.max_date.isnull(), :]
        df.drop('max_date', axis=1, inplace=True)
        df.to_gbq(".".join([dataset, table_name]),
                  if_exists='append',
                  project_id=project)
Code Example #12
File: bigquery.py Project: feast-dev/feast
def _get_entity_df_event_timestamp_range(
    entity_df: Union[pd.DataFrame, str],
    entity_df_event_timestamp_col: str,
    client: Client,
) -> Tuple[datetime, datetime]:
    if type(entity_df) is str:
        job = client.query(
            f"SELECT MIN({entity_df_event_timestamp_col}) AS min, MAX({entity_df_event_timestamp_col}) AS max "
            f"FROM ({entity_df})")
        res = next(job.result())
        entity_df_event_timestamp_range = (
            res.get("min"),
            res.get("max"),
        )
    elif isinstance(entity_df, pd.DataFrame):
        entity_df_event_timestamp = entity_df.loc[
            :, entity_df_event_timestamp_col
        ].infer_objects()
        if pd.api.types.is_string_dtype(entity_df_event_timestamp):
            entity_df_event_timestamp = pd.to_datetime(
                entity_df_event_timestamp, utc=True)
        entity_df_event_timestamp_range = (
            entity_df_event_timestamp.min().to_pydatetime(),
            entity_df_event_timestamp.max().to_pydatetime(),
        )
    else:
        raise InvalidEntityType(type(entity_df))

    return entity_df_event_timestamp_range
Code Example #13
def _export_table(
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    table: str,
    bucket: str,
    storage_client: storage.Client,
):
    """Export a single table or view to GCS as JSON."""
    # since views cannot be exported directly, write the data into a temporary table
    job = client.query(
        f"""
        SELECT *
        FROM {dataset_id}.{table}
    """
    )

    job.result()

    destination_uri = f"gs://{bucket}/{table}.ndjson"
    dataset_ref = bigquery.DatasetReference(project_id, job.destination.dataset_id)
    table_ref = dataset_ref.table(job.destination.table_id)

    logger.info(f"Export table {table} to {destination_uri}")

    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = "NEWLINE_DELIMITED_JSON"
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US", job_config=job_config
    )
    extract_job.result()

    # convert ndjson to json
    _convert_ndjson_to_json(bucket, table, storage_client)
Code Example #14
def exclude_site_submission(client: bigquery.Client,
                            dataset: bigquery.DatasetReference,
                            hpo_ids: List[str]) -> bigquery.QueryJob:
    """
    Empty all CDM tables associated with one or more HPO sites
    
    :param client: Active bigquery client object 
    :param dataset: the dataset to exclude site data from
    :param hpo_ids: Identifies the HPO sites whose data should be excluded
    :return: Query job associated with removing all the records
    :raises RuntimeError if CDM tables associated with a site are not found in the dataset
    """
    LOGGER.debug(
        f'exclude_site_submission called with dataset={dataset.dataset_id} and hpo_ids={hpo_ids}'
    )
    all_tables = list(bq.list_tables(client, dataset))
    tables_to_empty = []
    for hpo_id in hpo_ids:
        hpo_tables = _filter_hpo_tables(all_tables, hpo_id)
        if not hpo_tables:
            raise RuntimeError(
                f'No tables found for {hpo_id} in dataset {dataset.dataset_id}. '
                f'Ensure the specified arguments are correct.')
        tables_to_empty.extend(hpo_tables)
    script = DELETE_QUERY_TPL.render(tables_to_empty=tables_to_empty)
    LOGGER.debug(f'exclude_site_submission about to start script:\n {script}')
    return client.query(script)
Code Example #15
File: sync_bq_tables.py Project: leplatrem/ctms-api
def bq_reader(
    client: bigquery.Client,
    table: str,
    modifier: Callable[[Dict[str, Any]], BaseModel],
    table_index: int,
    total_tables: int,
    report_frequency: int,
):
    # TODO: Probably want some sort of ordering here, a report of where
    # we are in that ordering, and a way to resume from that place if
    # things crash (LOGGING AND QUERYING?)
    query = f"SELECT * FROM `mozilla-cdp-prod.sfdc_exports.{table}`"
    query_job_rows = client.query(query).result()
    total_rows = query_job_rows.total_rows
    report_prefix = f"{table} (table: {table_index}/{total_tables})"
    start = monotonic()
    for i, row in enumerate(query_job_rows):
        i = i + 1
        if i % report_frequency == 0 or i == total_rows:
            percent_done = int(i / total_rows * 100)
            time_since_start = monotonic() - start
            per_second = int(i / time_since_start)
            print(
                f"{report_prefix}: {percent_done}% Complete ({per_second} rows/s) ({int(time_since_start)}s since query)"
            )
        newrow = {}
        for key, value in row.items():
            if value != "":
                newrow[key] = value
        try:
            yield modifier(newrow)
        except ValidationError as e:
            # TODO: Write this to a table so we know what didn't work
            print(newrow["email_id"], str(e), file=sys.stderr)
Code Example #16
def execute_bq_query(
    client: bigquery.Client,
    query_sql: str,
    query_params: list = None,
    dry_run: bool = False,
) -> bigquery.table.RowIterator:
    """
    Execute bq query and return results as a row iterator.

    """
    if query_params is not None:
        # TODO: implement. See
        #   https://cloud.google.com/bigquery/docs/parameterized-queries
        #   https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/queries.html#run-a-query-using-a-named-query-parameter
        raise NotImplementedError
    if dry_run:
        # TODO: implement. See
        #   https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/queries.html#run-a-dry-run-query
        raise NotImplementedError

    query_job = client.query(query_sql)  # type: bigquery.QueryJob

    # API request; waits for job to complete.
    # TODO: is all results' data retrieved here? Or will some be retrieved later, when the iterator
    #   is consumed?
    return query_job.result()  # type: bigquery.table.RowIterator
Code Example #17
def get_tables_in_dataset(client: Client, project_id, dataset_id,
                          table_names) -> List[str]:
    """
    This function retrieves tables that exist in dataset for an inital list table_names . This 
    function raises GoogleCloudError if the query throws an error
    
    :param client: 
    :param project_id: 
    :param dataset_id: 
    :param table_names: 
    :return: a list of tables that exist in the given dataset
    """
    # The following makes sure the tables exist in the dataset
    query_job = client.query(
        GET_ALL_TABLES_QUERY_TEMPLATE.render(project=project_id,
                                             dataset=dataset_id,
                                             table_names=table_names))

    try:
        result = query_job.result()
        # Surface job-level errors that did not raise an exception
        if query_job.errors:
            raise RuntimeError(query_job.errors)

        return [dict(row.items())[TABLE_ID] for row in result]

    except (GoogleCloudError, TimeoutError, RuntimeError) as e:
        # Catch GoogleCloudError and TimeoutError that could be raised by query_job.result(),
        # as well as the RuntimeError raised from the try block.
        # Log the error and re-raise it.
        LOGGER.error(f"Error running job {query_job.job_id}: {e}")
        raise
Code Example #18
def get_bucket(client: BQClient, hpo_id: str) -> str:
    """
    Retrieves bucket name for site

    :param client: Bigquery Client object
    :param hpo_id: Identifies the HPO site
    :return: bucket name for the HPO site as a string
    :raises GoogleCloudError/TimeoutError
    """
    bucket_name_query = BUCKET_NAME_QUERY.render(
        project=client.project,
        dataset=bq_consts.LOOKUP_TABLES_DATASET_ID,
        bucket_names_table=bq_consts.HPO_ID_BUCKET_NAME_TABLE_ID,
        hpo_id=hpo_id.upper())

    try:
        bucket_names_result = client.query(bucket_name_query).result()
        bucket_names = bucket_names_result.to_dataframe(
        )['bucket_name'].to_list()
        if len(bucket_names) > 1:
            LOGGER.warning(
                f'Found more than one bucket name for site {hpo_id}: {bucket_names}'
            )
        bucket_name = bucket_names[0]
    except (GoogleCloudError, TOError) as e:
        LOGGER.error(f'Job failed with error {str(e)}')
        raise e
    return bucket_name
Code Example #19
def _execute_query(client: bigquery.Client, query: str,
                   parameters: List) -> pandas.DataFrame:
    try:
        start = timer()
        result = (
            client.query(
                query,
                job_config=bigquery.QueryJobConfig(
                    query_parameters=parameters)).result().to_dataframe(
                        create_bqstorage_client=True,
                    )  # Generate a pandas DataFrame directly
        )
        end = timer()
        logging.getLogger(__name__).info(
            f'[benchmark][google_big_query] - execute {end - start} seconds',
            extra={
                'benchmark': {
                    'operation': 'execute',
                    'execution_time': end - start,
                    'connector': 'google_big_query',
                }
            },
        )
        return result
    except TypeError as e:
        logging.getLogger(__name__).error(
            f'Failed to execute query {query} - {e}')
        raise e
Code Example #20
class QueryPublications(DoFn):
    '''Apache Beam DoFn class for querying patent publications from BigQuery by
    application numbers.
    '''
    def __init__(self):
        super().__init__()
        self.storage_client = None

    def setup(self):
        logging.info('Initializing BigQuery client')
        self.storage_client = Client()

    def process(self, app_numbers):
        sql = """
            SELECT application_number, application_kind, grant_date
            FROM `patents-public-data.patents.publications`
            WHERE
            country_code = @us_country_code
            AND application_number IN UNNEST(@application_numbers)
            AND IF (
                publication_date >= @wipo_kind_codes_from,
                kind_code IN UNNEST(@wipo_patent_publication_codes),
                kind_code = @uspto_patent_publication_code
            );
        """

        job_config = QueryJobConfig(query_parameters=[
            ScalarQueryParameter(
                'us_country_code',
                'STRING',
                US_COUNTRY_CODE,
            ),
            ArrayQueryParameter(
                'application_numbers',
                'STRING',
                app_numbers,
            ),
            ScalarQueryParameter(
                'wipo_kind_codes_from',
                'INT64',
                WIPO_KIND_CODES_FROM,
            ),
            ArrayQueryParameter(
                'wipo_patent_publication_codes',
                'STRING',
                WIPO_PATENT_PUBLICATION_CODES,
            ),
            ScalarQueryParameter(
                'uspto_patent_publication_code',
                'STRING',
                USPTO_PATENT_PUBLICATION_CODE,
            ),
        ])
        query = self.storage_client.query(sql, job_config=job_config)

        logging.info('Executing query for publications')
        iterator = query.result()

        return iterator
Code Example #21
File: test_setup.py Project: MrTrustworthy/biquery
def test_tables_correct(client: Client, bq_tables: List[Table]):
    counts = set()
    for table in bq_tables:
        job = client.query(
            f"SELECT COUNT(*) FROM `{table.project}.{table.dataset_id}.{table.table_id}`"
        )
        counts.add(Driver.get_scalar_result(job))
    assert counts == {1000, 100, 37, 6}
Code Example #22
def external_query(  # pylint: disable=too-many-arguments
        gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str,
        query: str, dest_table_ref: bigquery.TableReference, job_id: str):
    """Load from query over external table from GCS.

    This hinges on a SQL query defined in GCS at _config/*.sql and
    an external table definition _config/external.json (otherwise will assume
    CSV external table)
    """
    external_table_config = read_gcs_file_if_exists(
        gcs_client, f"{gsurl}_config/external.json")
    if not external_table_config:
        external_table_config = look_for_config_in_parents(
            gcs_client, gsurl, "external.json")
    if external_table_config:
        external_table_def = json.loads(external_table_config)
    else:
        print(
            f" {gsurl}_config/external.json not found in parents of {gsurl}. "
            "Falling back to default PARQUET external table: "
            f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}")
        external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION

    # This may cause an issue if >10,000 files.
    external_table_def["sourceUris"] = flatten2dlist(
        get_batches_for_prefix(gcs_client, gsurl))
    print(
        f"external table def = {json.dumps(external_table_config, indent=2)}")
    external_config = bigquery.ExternalConfig.from_api_repr(external_table_def)
    job_config = bigquery.QueryJobConfig(
        table_definitions={"temp_ext": external_config}, use_legacy_sql=False)

    # drop partition decorator if present.
    table_id = dest_table_ref.table_id.split("$")[0]

    # similar syntax to str.format but doesn't require escaping braces
    # elsewhere in query (e.g. in a regex)
    rendered_query = query\
        .replace(
            "{dest_dataset}",
            f"`{dest_table_ref.project}`.{dest_table_ref.dataset_id}")\
        .replace("{dest_table}", table_id)

    job: bigquery.QueryJob = bq_client.query(rendered_query,
                                             job_config=job_config,
                                             job_id=job_id)

    print(f"started asynchronous query job: {job.job_id}")

    start_poll_for_errors = time.monotonic()
    # Check if job failed quickly
    while time.monotonic(
    ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS:
        job.reload(client=bq_client)
        if job.state == "DONE":
            check_for_bq_job_and_children_errors(bq_client, job)
            return
        time.sleep(constants.JOB_POLL_INTERVAL_SECONDS)
Code Example #23
def get_table_refs(bq_client: bigquery.Client,
                   bq_storage_client: bigquery_storage.BigQueryReadClient,
                   dataset: bigquery.Dataset):
    query = f"select table_id from {dataset.project}.{dataset.dataset_id}.__TABLES__"
    df: pd.DataFrame = bq_client.query(query).to_dataframe(
        bqstorage_client=bq_storage_client)
    for table_id in df.table_id:
        if table_id is not None:
            yield bigquery.TableReference(dataset.reference, table_id)
Code Example #24
def _delete_rows(client: bigquery.Client, table_id: str) -> None:
    """Delete all rows in a table."""
    query = f'DELETE FROM `{table_id}` WHERE TRUE'
    job = client.query(query)
    try:
        rows = job.result()
    except google.cloud.exceptions.GoogleCloudError as e:
        logging.error(e)
        raise e
Code Example #25
def get_last_update_status(client: gbigquery.Client):
    check_environment_variable()
    query = make_oanda_prices_query()
    results = client.query(query).result()
    last_datetime = datetime_process.extract_datetime(results)
    is_valid = datetime_process.is_update_time_valid(last_datetime)
    status = make_status(last_datetime, is_valid)

    return status
Code Example #26
File: main.py Project: saliyamwd/bigquery-utils
def external_query(  # pylint: disable=too-many-arguments
        gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str,
        query: str, dest_table_ref: bigquery.TableReference,
        job_id_prefix: str):
    """Load from query over external table from GCS.

    This hinges on a SQL query defined in GCS at _config/bq_transform.sql and
    an external table definition _config/external.json (otherwise will assume
    CSV external table)
    """
    external_table_config = read_gcs_file_if_exists(
        gcs_client, f"{gsurl}_config/external.json")
    if external_table_config:
        external_table_def = json.loads(external_table_config)
    else:
        print(f"Falling back to default CSV external table."
              f" {gsurl}/_config/external.json not found.")
        external_table_def = DEFAULT_EXTERNAL_TABLE_DEFINITION

    external_table_def["sourceUris"] = flatten2dlist(
        get_batches_for_prefix(gcs_client, gsurl))
    external_config = bigquery.ExternalConfig.from_api_repr(external_table_def)
    job_config = bigquery.QueryJobConfig(
        table_definitions={"temp_ext": external_config}, use_legacy_sql=False)

    # Note, dest_table might include a partition decorator.
    rendered_query = query.format(
        dest_dataset=dest_table_ref.dataset_id,
        dest_table=dest_table_ref.table_id,
    )

    job: bigquery.QueryJob = bq_client.query(
        rendered_query,
        job_config=job_config,
        job_id_prefix=job_id_prefix,
    )

    print(f"started asynchronous query job: {job.job_id}")

    start_poll_for_errors = time.monotonic()
    # Check if job failed quickly
    while time.monotonic() - start_poll_for_errors < WAIT_FOR_JOB_SECONDS:
        job.reload()
        if job.errors:
            msg = f"query job {job.job_id} failed quickly: {job.errors}"
            for err in job.errors:
                # BQ gives confusing warning about missing dataset if the
                # external query refers to the wrong external table name.
                # In this case we can give the end user a little more context.
                if "missing dataset" in err.get("message", ""):
                    raise RuntimeError(
                        "External queries must select from the external table "
                        "named 'temp_ext'. This error may be due to specifying"
                        "the wrong name for the external table. " + msg)
            raise RuntimeError(msg)
        time.sleep(JOB_POLL_INTERVAL_SECONDS)
Code Example #27
File: temp_table.py Project: yeondudad/bigquery-etl
def get_temporary_dataset(client: bigquery.Client):
    """Get a cached reference to the dataset used for server-assigned destinations."""
    global temporary_dataset
    if temporary_dataset is None:
        # look up the dataset used for query results without a destination
        dry_run = bigquery.QueryJobConfig(dry_run=True)
        destination = client.query("SELECT NULL", dry_run).destination
        temporary_dataset = client.dataset(destination.dataset_id,
                                           destination.project)
    return temporary_dataset
Code Example #28
File: test_burnham.py Project: usApp-stAck/burnham
def test_burnham(
    client: bigquery.Client,
    query_job_config: bigquery.QueryJobConfig,
    query: str,
    want: List[Any],
):
    """Test that the Glean telemetry in BigQuery matches what we expect."""
    query_job = client.query(query, job_config=query_job_config)
    got = [dict(row.items()) for row in query_job.result()]
    assert got == want
Code Example #29
File: diff.py Project: DataBiosphere/clinvar-ingest
def get_non_pk_cols(table_name: str, repo_data_project: str, repo_dataset_name: str,
                    client: bigquery.Client) -> Dict[str, str]:
    # should use the jade client
    query = f"""
    SELECT column_name, data_type FROM `{repo_data_project}.{repo_dataset_name}.INFORMATION_SCHEMA.COLUMNS`
    WHERE table_name = "{table_name}" AND column_name NOT IN ("id", "release_date", "datarepo_row_id")
    """

    query_job = client.query(query)

    return {row[0]: row[1] for row in query_job}
Code Example #30
File: utils.py Project: pnchbck/pulse-data
def get_next_available_file_id(metadata_type: MetadataType, project_id: str,
                               client: bigquery.Client) -> int:
    """Retrieves the next available file_id in the raw_file_metadata table."""
    table_name = get_table_name_for_type(metadata_type)
    query = f"""SELECT MAX(file_id) AS max_file_id
                FROM `{project_id}.direct_ingest_processing_metadata.{table_name}`"""
    query_job = client.query(query)
    rows = query_job.result()
    max_file_id = one(rows).get('max_file_id')
    if max_file_id is None:
        return 1
    return max_file_id + 1