# Assumes: from google.cloud.bigquery import QueryJobConfig, WriteDisposition
# `echo` is a project-local helper for prefixed, colored console output.
def select_insert(self,
                  source_table_id,
                  destination_table_id,
                  query_field,
                  prefix='    ',
                  fg='yellow'):
    query = 'SELECT {query_field} FROM {dataset_id}.{source_table_id}'.format(
        query_field=query_field,
        dataset_id=self._dataset_ref.dataset_id,
        source_table_id=source_table_id)
    destination_table = self.dataset.table(destination_table_id)
    job_config = QueryJobConfig()
    job_config.use_legacy_sql = False
    job_config.use_query_cache = False
    job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    job_config.destination = destination_table
    job = self._client.query(query, job_config=job_config)
    echo('Inserting... {0}'.format(job.job_id),
         prefix=prefix,
         fg=fg,
         no_color=self.no_color)
    echo('  {0}'.format(job.query),
         prefix=prefix,
         fg=fg,
         no_color=self.no_color)
    job.result()  # block until the job completes
    assert job.state == 'DONE'
    if job.error_result:
        raise RuntimeError(job.errors)
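The method above leans on class internals (self._client, self._dataset_ref, self.dataset) and the project-local echo helper. For reference, a minimal self-contained sketch of the same query-into-destination-table pattern against the public client API, with placeholder project, dataset, and table names:

from google.cloud import bigquery

# Placeholder names throughout; substitute your own project/dataset/tables.
client = bigquery.Client(project='my-project')
job_config = bigquery.QueryJobConfig(
    use_legacy_sql=False,
    use_query_cache=False,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    destination=bigquery.TableReference.from_string(
        'my-project.my_dataset.dest_table'),
)
job = client.query('SELECT field_a FROM my_dataset.source_table',
                   job_config=job_config)
job.result()  # blocks until the job finishes; raises on failure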
Example #2
    def select_into(self, query: str, output_dataset: str,
                    output_table: str) -> bool:
        logging.info(
            f"DataWarehouse.select_into -> {output_dataset}.{output_table} ..."
        )
        client = self._get_client()

        config = QueryJobConfig()
        config.allow_large_results = True
        config.destination = f"{self.config.gcp_project}.{output_dataset}.{output_table}"
        config.create_disposition = CreateDisposition.CREATE_IF_NEEDED
        config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        query_job = client.query(query, config)

        # Execute the query and check the result
        try:
            query_job.result()  # blocks until the job completes
            # total_bytes_processed can be None when results come from cache
            mb = int((query_job.total_bytes_processed or 0) / (1024 * 1024))
            logging.info(
                f"DataWarehouse.select_into -> {output_dataset}.{output_table}: {query_job.state} (processed {mb} MB)"
            )
            return True
        except Exception as ex:
            # error_result is a plain dict; attribute access (.message) would raise
            message = (query_job.error_result or {}).get("message", str(ex))
            logging.error(
                f"DataWarehouse.select_into -> Exception: \n\t{message}"
            )
            return False
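When the job itself fails, result() raises google.cloud.exceptions.GoogleCloudError, and the job's errors attribute carries the full error list; a sketch (same names as above) that surfaces all of them rather than only the first:

from google.cloud.exceptions import GoogleCloudError

try:
    query_job.result()
except GoogleCloudError:
    # query_job.errors is a list of dicts like {'reason': ..., 'message': ...}
    for err in query_job.errors or []:
        logging.error("BigQuery job error: %s", err.get("message"))
    raise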
Example #3
    def execute(self,
                query,
                destination_table,
                write_disposition="WRITE_TRUNCATE",
                allow_large_results=True):
        """
        :param query_file: query file path
        :param destination_table: target table
        :param write_disposition:  default is to replace existing table. To append: WRITE_APPEND
        :param allow_large_results: default to True
        :return:
        """
        query_configuration = QueryJobConfig()
        query_configuration.use_legacy_sql = False
        if destination_table:
            ref = TableReferenceBuilder(destination_table, self._dataset,
                                        self._project)
            query_configuration.write_disposition = write_disposition
            query_configuration.default_dataset = ref.dataset_reference
            query_configuration.destination = ref.table_reference
            query_configuration.allow_large_results = allow_large_results

        sql_query = self.__get_query(query)
        if not self._quiet:
            print("-- #### {}\n{}\n".format(destination_table or "",
                                            sql_query))

        self._query_job = bigquery.Client(project=self._project).query(
            sql_query, job_config=query_configuration)
        if self._query_job.errors:
            raise Exception(self._query_job.errors)
Example #4
from typing import Optional

def run_query_job(
    querytext: str,
    temp_table: Optional[str] = None,
    query_job_config: Optional[QueryJobConfig] = None
) -> QueryJob:
    """
    Set up and run a query job.

    Arguments:
        querytext {str} -- The querytext for the job.

    Keyword Arguments:
        temp_table {str} -- A temporary table in which to materialize results.
        The results will be streamed from this table when done. This is
        required for all large queries, and strongly recommended.
        (default: {None})

        query_job_config {QueryJobConfig} -- A QueryJobConfig to start from;
        a fresh one is created when omitted, so a single mutable default
        instance is never shared across calls.
        (default: {None})

    Returns:
        QueryJob -- The resulting job.
    """
    LOG.debug("Running query: %s", querytext)
    client = get_bq_client()
    if query_job_config is None:
        query_job_config = QueryJobConfig()
    if temp_table:
        query_job_config.destination = temp_table
        query_job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    return client.query(query=querytext, job_config=query_job_config)
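A hedged usage sketch (get_bq_client is project-local; the temp table path is a placeholder):

job = run_query_job(
    'SELECT name, number '
    'FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10',
    temp_table='my-project.scratch.names_tmp',  # placeholder table path
)
for row in job.result():  # rows stream from the materialized temp table
    print(row['name'], row['number'])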
Example #5
    def write_to_table_with_query(self, write_disposition):
        # type: (str) -> None
        """Query all rows from source table, write to destination table w/ requested disposition.

        Args:
            write_disposition: Whether to require the destination table to be empty,
                to append to it, or to overwrite (truncate) it.
        """
        job_config = QueryJobConfig()
        job_config.destination = self.destination_table.reference
        job_config.write_disposition = write_disposition
        # Fire-and-forget: the job handle is discarded, so completion is not awaited here.
        self.bq_client.query(
            'SELECT * FROM `{}.{}.{}`'.format(self.source_table.project,
                                              self.source_table.dataset_id,
                                              self.source_table.table_id),
            job_config)
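A hypothetical call sequence, assuming an instance named writer with bq_client, source_table, and destination_table wired up, showing the three dispositions BigQuery accepts:

writer.write_to_table_with_query('WRITE_EMPTY')     # fail if the destination already has rows
writer.write_to_table_with_query('WRITE_APPEND')    # append to existing rows
writer.write_to_table_with_query('WRITE_TRUNCATE')  # overwrite the destination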
Example #6
    def create_table_from_query(
        self,
        query,  # type: str
        table_path,  # type: str
        write_disposition='WRITE_EMPTY',  # type: Optional[str]
        use_legacy_sql=False,  # type: Optional[bool]
        max_wait_secs=None,  # type: Optional[int]
        expected_schema=None  # type: Optional[List[SchemaField]]
    ):
        # type: (...) -> None
        """Creates a table in BigQuery from a specified query.

        Args:
          query: The query to run.
          table_path: The path to the table (in the client's project) to write
              the results to.
          write_disposition: Specifies behavior if table already exists. See options here:
              https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs under
              configuration.query.writeDisposition
          use_legacy_sql: True if the query is written in legacy SQL,
              False for standard SQL.
          max_wait_secs: Seconds to wait for the query before timing out. If not
              set, the class default will be used.
          expected_schema: The expected schema of the resulting table; unused in this implementation
        """

        if write_disposition not in [
                'WRITE_TRUNCATE', 'WRITE_APPEND', 'WRITE_EMPTY'
        ]:
            raise ValueError(
                'write_disposition must be one of WRITE_TRUNCATE, '
                'WRITE_APPEND, or WRITE_EMPTY')

        config = QueryJobConfig()
        if self.maximum_billing_tier:
            config.maximum_billing_tier = self.maximum_billing_tier
        config.use_legacy_sql = use_legacy_sql
        config.write_disposition = write_disposition
        config.allow_large_results = True

        config.destination = self.get_table_reference_from_path(table_path)

        query_job = self._run_async_query(query, job_config=config)

        return self._wait_for_job(query_job,
                                  query,
                                  max_wait_secs=max_wait_secs
                                  or self.max_wait_secs)
Example #7
    def execute(self, query, tbl_ref=None, append=False, preview=True):
        """Render the query; print it (and stop) if previewing, otherwise run it into tbl_ref."""
        sql_query = self.__get_query(query)
        if tbl_ref:
            print("-- ## " + str(tbl_ref))
        print("{}{}".format("-- preview: \n" if preview else "", sql_query))
        if preview:
            return

        job_conf = QueryJobConfig()
        job_conf.use_legacy_sql = False
        if tbl_ref:
            job_conf.write_disposition = "WRITE_APPEND" if append else "WRITE_TRUNCATE"
            job_conf.default_dataset = tbl_ref.dataset_ref
            job_conf.destination = tbl_ref.table_ref
            job_conf.allow_large_results = True
        query_job = self.connect(tbl_ref.project if tbl_ref else None).query(
            sql_query, job_config=job_conf)
        if query_job.errors:
            raise Exception(query_job.errors)
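For illustration, a hypothetical call sequence (tbl_ref and the connect() wiring are project-local; the query name is a placeholder):

runner.execute('daily_rollup.sql', tbl_ref=my_tbl_ref)       # preview: print SQL only
runner.execute('daily_rollup.sql', tbl_ref=my_tbl_ref,
               append=True, preview=False)                   # run and append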
Example #8
    def create_table_from_query(self,
                                query,  # type: str
                                table_path,  # type: str
                                write_disposition='WRITE_EMPTY',  # type: Optional[str]
                                use_legacy_sql=False,  # type: Optional[bool]
                                max_wait_secs=None,  # type: Optional[int]
                                expected_schema=None  # type: Optional[List[SchemaField]]
                                ):
        # type: (...) -> None
        """Creates a table in BigQuery from a specified query.

        Args:
          query: The query to run.
          table_path: The path to the table (in the client's project) to write
              the results to.
          write_disposition: One of 'WRITE_TRUNCATE', 'WRITE_APPEND',
              'WRITE_EMPTY'. Default is WRITE_EMPTY.
          use_legacy_sql: True if the query is written in legacy SQL,
              False for standard SQL.
          max_wait_secs: Seconds to wait for the query before timing out. If not
              set, the class default will be used.
          expected_schema: The expected schema of the resulting table; unused in this implementation
        """

        if write_disposition not in ['WRITE_TRUNCATE', 'WRITE_APPEND', 'WRITE_EMPTY']:
            raise ValueError('write_disposition must be one of WRITE_TRUNCATE, '
                             'WRITE_APPEND, or WRITE_EMPTY')

        config = QueryJobConfig()
        if self.maximum_billing_tier:
            config.maximum_billing_tier = self.maximum_billing_tier
        config.use_legacy_sql = use_legacy_sql
        config.write_disposition = write_disposition
        config.allow_large_results = True

        config.destination = self.get_table_reference_from_path(table_path)

        query_job = self.gclient.query(query, job_config=config, retry=self.default_retry)

        return query_job.result(timeout=max_wait_secs or self.max_wait_secs)
Example #9
def materialize_view(  # pylint: disable=too-many-arguments, too-many-locals
    client: bigquery.Client,
    source_view_name: str,
    destination_table_name: str,
    project: str,
    source_dataset: str,
    destination_dataset: str,
) -> MaterializeViewResult:
    query = get_select_all_from_query(source_view_name,
                                      project=project,
                                      dataset=source_dataset)
    LOGGER.info("materializing view: %s.%s -> %s.%s", source_dataset,
                source_view_name, destination_dataset, destination_table_name)
    LOGGER.debug("materialize_view: %s=%s", destination_table_name, [query])

    start = time.perf_counter()
    dataset_ref = client.dataset(destination_dataset)
    destination_table_ref = dataset_ref.table(destination_table_name)

    job_config = QueryJobConfig()
    job_config.destination = destination_table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    query_job = client.query(query, job_config=job_config)
    # getting the result will make sure that the query ran successfully
    result: bigquery.table.RowIterator = query_job.result()
    duration = time.perf_counter() - start
    total_bytes_processed = query_job.total_bytes_processed
    LOGGER.info(
        'materialized view: %s.%s, total rows: %s, %s bytes processed, took: %.3fs',
        source_dataset, source_view_name, result.total_rows,
        total_bytes_processed, duration)
    if LOGGER.isEnabledFor(logging.DEBUG):
        sample_result = list(islice(result, 3))
        LOGGER.debug("sample_result: %s", sample_result)
    return MaterializeViewResult(total_bytes_processed=total_bytes_processed,
                                 total_rows=result.total_rows)
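For illustration, a hypothetical call (get_select_all_from_query and MaterializeViewResult are module-local; all names below are placeholders):

client = bigquery.Client(project='my-project')
result = materialize_view(
    client,
    source_view_name='v_articles_latest',
    destination_table_name='articles_latest',
    project='my-project',
    source_dataset='views',
    destination_dataset='materialized',
)
print(result.total_rows, result.total_bytes_processed)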