Example #1
    def test_table_lookup(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        table_ref1 = TableReference(dataset_ref1, 'table1')
        table1 = Table(table_ref1, _TEST_SCHEMA)

        # Trying to get the same dataset/table in another project doesn't work.
        with self.assertRaisesRegexp(NotFound, 'other_project'):
            self.bq_client.get_table(
                TableReference(DatasetReference('other_project', 'dataset1'),
                               'table1'))

        # Trying to get the table before the dataset exists doesn't work
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.get_table(table_ref1)
        self.bq_client.create_dataset(Dataset(dataset_ref1))

        # Trying to get the table before the table exists doesn't work
        with self.assertRaises(NotFound):
            self.bq_client.get_table(table_ref1)

        self.bq_client.create_table(table1)

        # Assert the created table has the expected properties.
        table_found = self.bq_client.get_table(table_ref1)
        self.assertEqual(table1.project, "my_project")
        self.assertEqual(table1.dataset_id, "dataset1")
        self.assertEqual(table1.table_id, "table1")
        six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
Example #2
 def test_listing_tables(self):
     # type: () -> None
     dataset_ref1 = DatasetReference('my_project', 'dataset1')
     self.bq_client.create_dataset(Dataset(dataset_ref1))
     self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
     self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))
     six.assertCountEqual(
         self,
         [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
         ['table1', 'table2'])
Example #3
    def setUp(self):
        super(ClientWriteFromQueryTest, self).setUp()
        dataset_ref = DatasetReference(self.bq_client.project, 'my_dataset')
        schema = [SchemaField(name="a", field_type='INT64'),
                  SchemaField(name="b", field_type='FLOAT64'),
                  ]
        self.source_table = Table(TableReference(dataset_ref, 'source_table'), schema)
        self.destination_table = Table(TableReference(dataset_ref, 'destination_table'), schema)
        self.bq_client.create_dataset(Dataset(dataset_ref))
        self.bq_client.create_table(self.source_table)
        # We don't create the destination table here; some tests do not want it created.

        # Stick two rows into source_table
        self.assertFalse(self.bq_client.insert_rows(self.source_table,
                                                    [{'a': 1, 'b': 2.5}, {'a': 3, 'b': 4.25}]))
Example #4
 def __create_job_config(
         self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
     job_config = QueryJobConfig()
     job_config.priority = ems_query_job_config.priority.value
     job_config.use_legacy_sql = False
     job_config.use_query_cache = ems_query_job_config.use_query_cache
     job_config.labels = ems_query_job_config.labels
     if ems_query_job_config.destination_table is not None:
         job_config.time_partitioning = TimePartitioning("DAY")
         table_reference = TableReference(
             DatasetReference(
                 ems_query_job_config.destination_project_id
                 or self.__project_id,
                 ems_query_job_config.destination_dataset),
             ems_query_job_config.destination_table)
         job_config.destination = table_reference
         job_config.write_disposition = ems_query_job_config.write_disposition.value
         job_config.create_disposition = ems_query_job_config.create_disposition.value
     partitioning = ems_query_job_config.time_partitioning
     if partitioning is not None:
         job_config.time_partitioning = TimePartitioning(
             partitioning.type.value, partitioning.field,
             partitioning.expiration_ms,
             partitioning.require_partition_filter)
     if ems_query_job_config.table_definitions is not None:
         job_config.table_definitions = ems_query_job_config.table_definitions
     return job_config
Example #5
 def table_exists(self, table: str) -> bool:
     try:
         self.__bigquery_client.get_table(
             table=TableReference.from_string(table))
         return True
     except NotFound:
         return False
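A minimal usage sketch for the helper above, assuming the surrounding class is instantiated as `client`; the table ID is a placeholder:

# Hypothetical instance of the wrapper class shown above.
if client.table_exists("my_project.my_dataset.my_table"):
    print("table exists, safe to query")
else:
    print("table not found")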
Example #6
def log_bigquery_job(job: Union[bigquery.LoadJob, bigquery.QueryJob],
                     table: bigquery.TableReference,
                     message: Optional[str] = None,
                     severity: Optional[str] = 'NOTICE'):
    """
    Creates a structured log which includes a BigQuery job and table reference
    :param job:
    :param table:
    :param message:
    :param severity:
    """
    if job.error_result:
        severity = "ERROR"
        message = message or "BigQuery Job had errors."
    elif severity == "ERROR":
        message = message or ("BigQuery Job completed"
                              " but is considered an error.")
    else:
        severity = "NOTICE"
        message = message or "BigQuery Job completed without errors."

    print(
        json.dumps(
            dict(
                message=message,
                severity=severity,
                job=job.to_api_repr(),
                table=table.to_api_repr(),
                errors=job.error_result,
            )))
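A hedged usage sketch (the client, query, and table ID below are assumptions, not part of the original example): run a small query and log the finished job against a table reference.

client = bigquery.Client()
job = client.query("SELECT 1 AS x")
job.result()  # wait for the job to finish before logging it
log_bigquery_job(job,
                 TableReference.from_string("my_project.my_dataset.my_table"),
                 message="Example query finished.")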
Example #7
def make_table(project,
               dataset_id,
               table_id,
               friendly_name=None,
               description=None,
               expires=None,
               partitioning_type=None,
               view_use_legacy_sql=None,
               view_query=None,
               schema=None,
               labels=None):
    dataset_ref = DatasetReference(project, dataset_id)
    table_ref = TableReference(dataset_ref, table_id)
    table = Table(table_ref)
    table.friendly_name = friendly_name
    table.description = description
    table.expires = expires
    table.partitioning_type = partitioning_type
    if view_use_legacy_sql is not None:
        table.view_use_legacy_sql = view_use_legacy_sql
    if view_query is not None:
        table.view_query = view_query
    table.schema = schema
    if labels is not None:
        table.labels = labels
    return table
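An illustrative call to make_table; the project, dataset, table, and single-field schema are placeholders, and SchemaField is assumed to be imported from google.cloud.bigquery:

table = make_table('my_project', 'my_dataset', 'my_table',
                   friendly_name='My table',
                   description='Built with the make_table helper',
                   schema=[SchemaField('a', 'INT64')],
                   labels={'env': 'dev'})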
Example #8
    def _table_ref(self, table_id):
        """Return a BigQuery client library table reference"""
        from google.cloud.bigquery import DatasetReference
        from google.cloud.bigquery import TableReference

        return TableReference(
            DatasetReference(self.project_id, self.dataset_id), table_id)
Example #9
    def test_generate_queries(self, mock_table_cols, mock_table_dates,
                              mock_sandbox):
        table_cols_dict = {
            "condition_occurrence": [
                "condition_start_date", "condition_start_datetime",
                "condition_end_date", "condition_end_datetime"
            ],
            "measurement": ["measurement_date", "measurement_datetime"],
            'drug_exposure': [
                'drug_exposure_start_date', 'drug_exposure_start_datetime',
                'drug_exposure_end_date', 'drug_exposure_end_datetime',
                'verbatim_end_date'
            ]
        }

        table_cols_df = pd.DataFrame.from_dict({
            "table_name": [
                "condition_occurrence", "_mapping_condition_occurrence",
                "measurement", "measurement_ext", "drug_exposure_ext",
                "drug_exposure"
            ]
        })

        mock_table_cols.return_value = table_cols_df
        mock_table_dates.return_value = table_cols_dict
        mock_sandbox.return_value = self.dataset_id + '_sandbox'
        pid_rid_table_ref = TableReference.from_string(
            f"{self.project_id}.{self.pids_dataset_id}.{self.pids_table}")
        deactivated_pids_table_ref = TableReference.from_string(
            f"{self.project_id}.{self.deactivated_pids_dataset_id}.{self.deactivated_pids_table}"
        )

        queries = rdp.generate_queries(
            client=self.mock_bq_client,
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            sandbox_dataset_id=self.dataset_id,
            deact_pids_table_ref=deactivated_pids_table_ref,
            pid_rid_table_ref=pid_rid_table_ref,
        )

        mock_table_cols.called_once_with(self.mock_bq_client, self.project_id,
                                         self.dataset_id)
        mock_table_dates.called_once_with(table_cols_df)

        # count sandbox and clean queries
        self.assertEqual(len(table_cols_dict) * 2, len(queries))
Example #10
 def test_listing_tables_with_max(self):
     # type: () -> None
     dataset_ref1 = DatasetReference('my_project', 'dataset1')
     self.bq_client.create_dataset(Dataset(dataset_ref1))
     for i in range(10):
         self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table{}'.format(i)),
                                           _TEST_SCHEMA))
     self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
     self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
     self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
Example #11
 def __create_test_table(self, table_name, dataset_id):
     table_schema = [
         SchemaField("int_data", "INT64"),
         SchemaField("str_data", "STRING")
     ]
     table_reference = TableReference(dataset_id, table_name)
     test_table = Table(table_reference, table_schema)
     test_table.time_partitioning = TimePartitioning("DAY")
     self.__delete_if_exists(test_table)
     self.GCP_BIGQUERY_CLIENT.create_table(test_table)
     return test_table
Example #12
 def run_async_load_job(self, job_id_prefix: str,
                        config: EmsLoadJobConfig) -> str:
     return self.__bigquery_client.load_table_from_uri(
         source_uris=config.source_uri_template,
         destination=TableReference(
             DatasetReference(config.destination_project_id,
                              config.destination_dataset),
             config.destination_table),
         job_id_prefix=job_id_prefix,
         location=self.__location,
         job_config=self.__create_load_job_config(config)).job_id
Example #13
    def run_async_extract_job(self, job_id_prefix: str, table: str,
                              destination_uris: List[str],
                              job_config: EmsExtractJobConfig) -> str:

        extract_job_config = self.__create_extract_job_config(job_config)

        return self.__bigquery_client.extract_table(
            source=TableReference.from_string(table_id=table),
            destination_uris=destination_uris,
            job_id_prefix=job_id_prefix,
            location=self.__location,
            job_config=extract_job_config).job_id
Example #14
def list_item_from_table_id(table_id: str) -> TableListItem:
    """
    Get a table list item as returned by :meth:`bigquery.Client.list_tables` 
    from a table ID
    
    :param table_id: A table ID including project ID, dataset ID, and table ID, 
      each separated by ``.``. 
    :return: a table list item
    """
    resource = {
        "tableReference": TableReference.from_string(table_id).to_api_repr()
    }
    return TableListItem(resource)
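A short usage sketch of list_item_from_table_id; the table ID is a placeholder:

item = list_item_from_table_id("my_project.my_dataset.my_table")
print(item.project)     # "my_project"
print(item.dataset_id)  # "my_dataset"
print(item.table_id)    # "my_table"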
Example #15
 def get_raw_details(self, dataset_id, view_id):
     if self.did_full_update:
         return self.raw_details[dataset_id][view_id]
     if dataset_id not in self.raw_details:
         self.raw_details[dataset_id] = dict()
     if view_id not in self.raw_details[dataset_id]:
         try:
             self.raw_details[dataset_id][view_id] = self.bq_client._client.get_table( \
                 TableReference(DatasetReference(self.bq_conductor_conf.GOOGLE_CLOUD_PROJECT,
                                                 dataset_id.split('.')[-1]), view_id)).to_api_repr()
         except:
             self.raw_details[dataset_id][view_id] = None
     return self.raw_details[dataset_id][view_id]
Example #16
def log_api_error(table: bigquery.TableReference, message: str,
                  error: Union[GoogleAPIError, ClientError]):
    """
    Creates a structured api error log which includes a BigQuery table reference
    :param table:
    :param message:
    :param error:
    :return:
    """
    print(
        json.dumps(
            dict(message=message or error.message,
                 severity='ERROR',
                 table=table.to_api_repr(),
                 errors=error.errors or error.message or message)))
Example #17
def handle_bq_lock(gcs_client: storage.Client,
                   lock_blob: storage.Blob,
                   next_job_id: Optional[str],
                   table: bigquery.TableReference,
                   retry_attempt_cnt: Optional[int] = None):
    """Reclaim the lock blob for the new job id (in-place) or delete the lock
    blob if next_job_id is None."""
    try:
        if next_job_id:
            lock_blob_contents = json.dumps(
                dict(job_id=next_job_id,
                     table=table.to_api_repr(),
                     retry_attempt_cnt=retry_attempt_cnt))
            logging.log_with_table(
                table,
                f"Writing the following content to lock_blob {lock_blob.name}:"
                f" {dict(job_id=next_job_id, table=table.to_api_repr(), retry_attempt_cnt=retry_attempt_cnt)}"
            )
            if lock_blob.exists(client=gcs_client):
                lock_blob.upload_from_string(
                    lock_blob_contents,
                    if_generation_match=lock_blob.generation,
                    client=gcs_client)
            else:  # This happens when submitting the first job in the backlog
                lock_blob.upload_from_string(
                    lock_blob_contents,
                    if_generation_match=0,  # noqa: E126
                    client=gcs_client)
        else:
            logging.log_with_table(
                table, "releasing lock at: "
                f"gs://{lock_blob.bucket.name}/{lock_blob.name}")
            lock_blob.delete(
                if_generation_match=lock_blob.generation,
                client=gcs_client,
            )
    except (google.api_core.exceptions.PreconditionFailed,
            google.api_core.exceptions.NotFound) as err:
        if isinstance(err, google.api_core.exceptions.PreconditionFailed):
            raise exceptions.BacklogException(
                f"The lock at gs://{lock_blob.bucket.name}/{lock_blob.name} "
                f"was changed by another process.") from err
        logging.log_with_table(
            table,
            "Tried deleting a lock blob that was either already deleted "
            "or never existed.")
Example #18
def send_to_bq(dataset, table, row):
    bigquery_client = bigquery.Client(project='theo-home')

    table_ref = TableReference(
        dataset_ref=DatasetReference(dataset_id=dataset, project='theo-home'),
        table_id=table,
    )

    schema = [SchemaField(name=field, field_type=bq_types[type(data)]) for field, data in row.items()]

    table = bigquery_client.create_table(
        Table(table_ref, schema=schema),
        exists_ok=True
    )

    errors = bigquery_client.insert_rows(table, [row])
    if errors:
        print(errors, file=sys.stderr)
Example #19
 def to_table(dataset_ref, model):
     schema = model.schema
     if schema:
         schema = tuple(
             BigQuerySchemaField.to_schema_field(s) for s in schema)
     else:
         schema = None
     table_ref = TableReference(dataset_ref, model.table_id)
     table = Table(table_ref, schema)
     table.friendly_name = model.friendly_name
     table.description = model.description
     table.expires = model.expires
     table.partitioning_type = model.partitioning_type
     if model.view_use_legacy_sql is not None:
         table.view_use_legacy_sql = model.view_use_legacy_sql
     if model.view_query is not None:
         table.view_query = model.view_query
     table.labels = model.labels if model.labels is not None else dict()
     return table
Example #20
 def test_load_table_from_file(self, column_to_null):
     # type: (int) -> None
     dataset_ref = DatasetReference('my_project', 'my_dataset')
     dataset = Dataset(dataset_ref)
     table = Table(TableReference(dataset_ref, 'table1'), _TEST_SCHEMA)
     self.bq_client.create_dataset(dataset)
     self.bq_client.create_table(table)
     output = cStringIO()
     csv_out = csv.writer(output)
     input_row = list(self.INPUT_ROW)
     expected_row = list(self.EXPECTED_ROW)
     if column_to_null is not None:
         input_row[column_to_null] = 'NULL'
         expected_row[column_to_null] = None
     csv_out.writerow(input_row)
     self.bq_client.load_table_from_file(output, table.reference, job_config=None, rewind=True)
     self.assertRowsExpected(
             self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                                  QueryJobConfig()),
             [expected_row])
Example #21
def log_with_table(
    table: bigquery.TableReference,
    message: str,
    severity: Optional[str] = 'NOTICE',
):
    """
    Creates a structured log which includes a BigQuery table reference
    :param table:
    :param message:
    :param severity:
    :return:
    """
    table_json = None
    if table is not None:
        table_json = table.to_api_repr()
    print(
        json.dumps(dict(
            message=message,
            severity=severity,
            table=table_json,
        )))
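A hedged usage sketch of log_with_table; the table ID and messages are illustrative, and passing None is allowed because the helper guards against it:

log_with_table(TableReference.from_string("my_project.my_dataset.my_table"),
               "Starting load for this table.")
log_with_table(None, "No table associated with this event.", severity="WARNING")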
Example #22
    def test_delete_table(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        table_ref1 = TableReference(dataset_ref1, 'table1')

        # Can't delete table, dataset not created yet.
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.delete_table(table_ref1)

        self.bq_client.create_dataset(Dataset(dataset_ref1))

        # Can't delete table, table not created yet.
        with self.assertRaisesRegexp(NotFound, 'table1'):
            self.bq_client.delete_table(table_ref1)

        table1 = Table(table_ref1, _TEST_SCHEMA)
        self.bq_client.create_table(table1)

        self.bq_client.delete_table(table_ref1)
        with self.assertRaisesRegexp(NotFound, 'table1'):
            self.bq_client.get_table(table_ref1)
Example #23
    def test_insert_rows(self):
        # type: () -> None
        dataset_ref = DatasetReference('my_project', 'my_dataset')
        dataset = Dataset(dataset_ref)
        table1_ref = TableReference(dataset_ref, 'table1')
        schema = [
            SchemaField(name="a", field_type='INT64'),
            SchemaField(name="b", field_type='FLOAT64'),
        ]
        table = Table(table1_ref, schema)
        self.bq_client.create_dataset(dataset)
        self.bq_client.create_table(table)

        # Insert two rows, check that they landed
        self.assertFalse(
            self.bq_client.insert_rows(table, [{
                'a': 1,
                'b': 2.5
            }, {
                'a': 3,
                'b': 4.25
            }]))
        self.assertRowsExpected(
            self.bq_client.query(
                'SELECT * FROM `my_project.my_dataset.table1`',
                QueryJobConfig()), [[1, 2.5], [3, 4.25]])

        # Insert two more rows, check that all four rows are now present.
        self.assertFalse(
            self.bq_client.insert_rows(table, [{
                'a': 5,
                'b': 6.5
            }, {
                'a': 7,
                'b': 8.25
            }]))
        self.assertRowsExpected(
            self.bq_client.query(
                'SELECT * FROM `my_project.my_dataset.table1`',
                QueryJobConfig()), [[1, 2.5], [3, 4.25], [5, 6.5], [7, 8.25]])
Example #24
def write_to_table_from_json_file(path: Path) -> LoadJob:
    table = TableReference(DatasetReference(PROJECT_NAME, BQ_DATASET),
                           BQ_TABLE)
    bq = BQ()
    job_config = job.LoadJobConfig()
    # job_config.autodetect = True
    job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.write_disposition = job.WriteDisposition.WRITE_APPEND
    job_config.create_disposition = job.CreateDisposition.CREATE_NEVER

    try:
        with path.open("rb") as f:
            load_job: LoadJob = bq.load_table_from_file(f,
                                                        destination=table,
                                                        job_config=job_config)
            load_job.result()
        os.remove(str(path))
        logging.info(f"{load_job.job_id} completed at {load_job.ended}")
    except Exception as ex:
        logging.info(f"{load_job.job_id} crashed with {load_job.errors}")

    return load_job
Example #25
    def create(self, table_id, schema):
        """Create a table in Google BigQuery given a table and schema

        Parameters
        ----------
        table_id : str
            Name of the table to be written
        schema : dict
            Use generate_bq_schema to generate your table schema from a
            dataframe.
        """
        from google.cloud.bigquery import DatasetReference
        from google.cloud.bigquery import Table
        from google.cloud.bigquery import TableReference

        if self.exists(table_id):
            raise TableCreationError(
                "Table {0} already exists".format(table_id)
            )

        if not _Dataset(self.project_id, credentials=self.credentials).exists(
            self.dataset_id
        ):
            _Dataset(
                self.project_id,
                credentials=self.credentials,
                location=self.location,
            ).create(self.dataset_id)

        table_ref = TableReference(
            DatasetReference(self.project_id, self.dataset_id), table_id
        )
        table = Table(table_ref)
        table.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)

        try:
            self.client.create_table(table)
        except self.http_error as ex:
            self.process_http_error(ex)
Example #26
    def test_dataset_delete(self):
        # type: () -> None
        dataset1 = Dataset(DatasetReference('my_project', 'dataset1'))

        # Can't delete dataset, doesn't exist yet.
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.delete_dataset(dataset1.reference)

        self.bq_client.create_dataset(dataset1)
        self.bq_client.create_table(Table(TableReference(dataset1.reference, 'table1'),
                                          _TEST_SCHEMA))

        # Can't delete dataset, not empty
        with self.assertRaises(BadRequest):
            self.bq_client.delete_dataset(dataset1.reference)

        # Okay to delete, specifically requesting to delete contents.
        self.bq_client.delete_dataset(dataset1.reference, delete_contents=True)

        # And now dataset is gone again.
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.get_dataset(DatasetReference('my_project', 'dataset1'))
Example #27
    def __inner(s, i, client):
        # check if weather station (data) exists in BQ
        # not all stations are active now/data may not be available
        # Supply NaN's for missing weather columns

        log.info(f"Download data for station: {i}, site: {s}")
        this_station_ref = TableReference(dataset_ref_, str(s))
        if client.table_exists(this_station_ref):
            df = client.bq_to_df(this_station_ref)
            # remove any null year/month/day, sometimes we get dirty data
            df = df[(~df['year'].isnull()) & (~df['month'].isnull()) &
                    (~df['day'].isnull())]
            df.index = df.apply(
                lambda x: pd.datetime(int(x['year']), int(x['month']), int(x['day'])),
                axis=1)

            for dt in weather_data_types:
                if dt not in df.columns:
                    df[dt] = np.nan

            df = df_index.join(df)
            return df[weather_data_types]
        return pd.DataFrame()  # else return empty dataframe
Example #28
    def load_from_gcs(self, table_id, input_uri):
        job_results = {}

        client = self.bq_client

        project_id, ds_id, table_name = table_id.split(".")
        dataset = DatasetReference(project_id, ds_id)
        output_table = TableReference(dataset, table_name)

        ending = input_uri.split(".")[-1]
        file_type = self.__get_file_type(ending)

        config = LoadJobConfig(autodetect=True,
                               write_disposition=WriteDisposition.WRITE_APPEND,
                               source_format=file_type)
        loadJob = LoadJob(self.__create_job_id(project_id, "loadJob"),
                          input_uri,
                          output_table,
                          client,
                          job_config=config)

        job_results["loadJob"] = self.__get_results(loadJob)

        return job_results
Example #29
def external_query(  # pylint: disable=too-many-arguments
        gcs_client: storage.Client, bq_client: bigquery.Client, gsurl: str,
        query: str, job_id: str, table: bigquery.TableReference):
    """Load from query over external table from GCS.

    This hinges on a SQL query defined in GCS at _config/*.sql and
    an external table definition
    _config/{constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME} (otherwise will assume
    PARQUET external table)
    """
    external_table_config = read_gcs_file_if_exists(
        gcs_client,
        f"{gsurl}_config/{constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME}")
    if not external_table_config:
        external_table_config = look_for_config_in_parents(
            gcs_client, gsurl, constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME)
    if external_table_config:
        external_table_def = json.loads(external_table_config)
    else:
        print(f" {gsurl}_config/{constants.BQ_EXTERNAL_TABLE_CONFIG_FILENAME} "
              f"not found in parents of {gsurl}. "
              "Falling back to default PARQUET external table: "
              f"{json.dumps(constants.DEFAULT_EXTERNAL_TABLE_DEFINITION)}")
        external_table_def = constants.DEFAULT_EXTERNAL_TABLE_DEFINITION
    print(
        json.dumps(
            dict(message="Found external table definition.",
                 table=table.to_api_repr(),
                 external_table_def=external_table_def)))
    # Reduce the amount of sourceUris by using wildcards with common
    # prefixes. This is done to keep the cloud logging audit metadata
    # below 100k in size, otherwise the metadata is omitted in the event.
    source_uris_with_wildcards = compact_source_uris_with_wildcards(
        flatten2dlist(get_batches_for_gsurl(gcs_client, gsurl)))
    # This may cause an issue if >10,000 files.
    external_table_def["sourceUris"] = source_uris_with_wildcards
    # Check if hivePartitioningOptions
    # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#hivepartitioningoptions
    # is set in external.json file
    if external_table_def.get("hivePartitioningOptions"):
        external_table_def["hivePartitioningOptions"] = {
            "mode":
            external_table_def["hivePartitioningOptions"].get("mode")
            or "AUTO",
            "sourceUriPrefix":
            get_hive_partitioning_source_uri_prefix(
                external_table_def["sourceUris"][0])
        }
    external_config = bigquery.ExternalConfig.from_api_repr(external_table_def)
    job_config = bigquery.QueryJobConfig(
        table_definitions={"temp_ext": external_config}, use_legacy_sql=False)

    # drop partition decorator if present.
    table_id = table.table_id.split("$")[0]
    # similar syntax to str.format but doesn't require escaping braces
    # elsewhere in query (e.g. in a regex)
    rendered_query = query.replace(
        "{dest_dataset}", f"`{table.project}`.{table.dataset_id}").replace(
            "{dest_table}", table_id)
    job: bigquery.QueryJob = bq_client.query(rendered_query,
                                             job_config=job_config,
                                             job_id=job_id)
    logging.log_bigquery_job(
        job, table, f"Submitted asynchronous query job: {job.job_id}")
    start_poll_for_errors = time.monotonic()
    # Check if job failed quickly
    while time.monotonic(
    ) - start_poll_for_errors < constants.WAIT_FOR_JOB_SECONDS:
        job.reload(client=bq_client)
        if job.state == "DONE":
            check_for_bq_job_and_children_errors(bq_client, job, table)
            return
        time.sleep(constants.JOB_POLL_INTERVAL_SECONDS)
Example #30
    ch.setFormatter(formatter)
    # add ch to logger
    log.addHandler(ch)
    return log


if __name__ == '__main__':

    log = _setup_logging()

    bqclient = BQClient()

    dataset_id = 'weather'
    dataset_ref_ = bqclient.client.dataset(dataset_id)

    stations_ref = TableReference(dataset_ref_, 'stations_site_list')

    # https://www.matthewproctor.com/australian_postcodes
    postcodes_ref = TableReference(dataset_ref_, 'australian_postcodes')

    stations = bqclient.bq_to_df(stations_ref)

    postcodes = bqclient.bq_to_df(postcodes_ref)

    kdtree = cKDTree(stations[['lat', 'lon']])

    weather_file_on_disc = 'historical_weather.pk'
    import pickle
    if Path(weather_file_on_disc).exists():
        historical_weather = pickle.load(open(weather_file_on_disc, 'rb'))
    else: