Example #1
    def get_query_results(self, query, use_legacy_sql=False, max_wait_secs=None):
        # type: (str, Optional[bool], Optional[int]) -> List[Tuple[Any]]
        """Returns a list or rows, each of which is a tuple of values.

        Args:
            query: A string with a complete SQL query.
            use_legacy_sql: Whether to use legacy SQL
            max_wait_secs: The maximum number of seconds to wait for the query to complete. If not
                set, the class default will be used.

        Returns:
            A list of tuples of values.
        """
        config = QueryJobConfig()
        if self.maximum_billing_tier:
            config.maximum_billing_tier = self.maximum_billing_tier

        config.use_legacy_sql = use_legacy_sql

        query_job = self.gclient.query(query, job_config=config,
                                       retry=self.default_retry_for_api_calls)
        # The above retry is for errors encountered in executing the jobs. The below retry is
        # for errors encountered in polling to see whether the job is done.
        query_job._retry = self.default_retry_for_async_jobs

        rows = self._wait_for_job(query_job, query,
                                  max_wait_secs=max_wait_secs or self.max_wait_secs)
        if query_job.errors:
            logging.warning('Errors in get_query_results: {}'.format(query_job.errors))
        return [x.values() for x in list(rows)]
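
Example #1's `default_retry_for_api_calls` and `default_retry_for_async_jobs` are attributes of the surrounding class and are not shown here. A minimal sketch of how such retry objects could be built with `google.api_core.retry.Retry` and handed to `Client.query`; the retry settings and attribute name below are illustrative assumptions, not taken from the original source:

from google.api_core import retry as retries
from google.api_core.exceptions import InternalServerError, ServiceUnavailable, TooManyRequests
from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

# Retry transient server-side errors with exponential backoff (values are illustrative).
default_retry_for_api_calls = retries.Retry(
    predicate=retries.if_exception_type(
        InternalServerError, ServiceUnavailable, TooManyRequests),
    deadline=600.0,
)

client = bigquery.Client()
config = QueryJobConfig(use_legacy_sql=False)
query_job = client.query("SELECT 1 AS x", job_config=config,
                         retry=default_retry_for_api_calls)
rows = [row.values() for row in query_job.result()]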
Example #2
def run_query_job(
    querytext: str,
    temp_table: Optional[str] = None,
    query_job_config: QueryJobConfig = QueryJobConfig()
) -> QueryJob:
    """
    Set up and run a query job.

    Arguments:
        querytext {str} -- The querytext for the job.

    Keyword Arguments:
        temp_table {str} -- A temporary table in which to materialize results.
        The results will be streamed from this table when done. This is
        required for all large queries, and strongly recommended.
        (default: {None})

        query_job_config {QueryJobConfig} -- A QueryJobConfig to start from.
        (default: {QueryJobConfig()})

    Returns:
        QueryJob -- The resulting job.
    """
    LOG.debug("Running query: %s", querytext)
    client = get_bq_client()
    if temp_table:
        query_job_config.destination = temp_table
        query_job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    return client.query(query=querytext, job_config=query_job_config)
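
One caveat with Example #2: `QueryJobConfig()` is used as a default argument value, and Python evaluates default arguments only once, so the same config object is reused (and mutated with `destination`/`write_disposition`) across calls. A sketch of a safer variant, keeping the snippet's `get_bq_client` helper as an assumed import:

from typing import Optional
from google.cloud.bigquery import QueryJob, QueryJobConfig, WriteDisposition


def run_query_job(
    querytext: str,
    temp_table: Optional[str] = None,
    query_job_config: Optional[QueryJobConfig] = None,
) -> QueryJob:
    """Set up and run a query job, building a fresh QueryJobConfig when none is passed."""
    client = get_bq_client()  # helper assumed from the original snippet
    config = query_job_config or QueryJobConfig()
    if temp_table:
        config.destination = temp_table
        config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    return client.query(query=querytext, job_config=config)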
Example #3
    def get_query_results(self,
                          query,
                          use_legacy_sql=False,
                          max_wait_secs=None):
        # type: (str, Optional[bool], Optional[int]) -> List[Tuple[Any]]
        """Returns a list or rows, each of which is a tuple of values.

        Args:
            query: A string with a complete SQL query.
            use_legacy_sql: Whether to use legacy SQL
            max_wait_secs: The maximum number of seconds to wait for the query to complete. If not
                set, the class default will be used.

        Returns:
            A list of tuples of values.
        """
        config = QueryJobConfig()
        if self.maximum_billing_tier:
            config.maximum_billing_tier = self.maximum_billing_tier

        config.use_legacy_sql = use_legacy_sql

        query_job = self._run_async_query(query, job_config=config)

        rows = self._wait_for_job(query_job,
                                  query,
                                  max_wait_secs=max_wait_secs
                                  or self.max_wait_secs)
        if query_job.errors:
            logging.warning('Errors in get_query_results: {}'.format(
                query_job.errors))
        return [x.values() for x in list(rows)]
Example #4
def load_query_result_to_table(dest_table,
                               query,
                               part_col_name=None,
                               clustering_fields=None):
    bq_client = get_bigquery_client()
    qjc = None
    print(query)
    if bq_table_exists(dest_table):
        table = bq_client.get_table(dest_table)
        qjc = QueryJobConfig(
            destination=dest_table,
            write_disposition="WRITE_TRUNCATE",
            create_disposition="CREATE_IF_NEEDED",
            time_partitioning=table.time_partitioning,
            range_partitioning=table.range_partitioning,
            clustering_fields=table.clustering_fields,
        )
        job = bq_client.query(query, job_config=qjc)
        job.result()

    else:
        import time

        temp_table_name = f"load_query_result_to_table__{str(int(time.time()))}"
        bq_client.query(
            f"CREATE OR REPLACE TABLE temp_1d.{temp_table_name} AS {query}"
        ).result()
        if part_col_name:
            schema = bq_client.get_table(f"temp_1d.{temp_table_name}").schema
            partition_type = [
                f for f in schema if f.name.lower() == part_col_name.lower()
            ][0].field_type
            if partition_type == "DATE":
                qjc = QueryJobConfig(
                    destination=dest_table,
                    write_disposition="WRITE_TRUNCATE",
                    create_disposition="CREATE_IF_NEEDED",
                    time_partitioning=TimePartitioning(field=part_col_name),
                    clustering_fields=clustering_fields,
                )
            elif partition_type == "INTEGER":
                qjc = QueryJobConfig(
                    destination=dest_table,
                    write_disposition="WRITE_TRUNCATE",
                    create_disposition="CREATE_IF_NEEDED",
                    range_partitioning=RangePartitioning(PartitionRange(
                        start=200001, end=209912, interval=1),
                                                         field=part_col_name),
                    clustering_fields=clustering_fields,
                )
            else:
                print(partition_type)
                raise Exception(
                    f"Partition column[{part_col_name}] is neither DATE or INTEGER type."
                )
        job = bq_client.query(f"SELECT * FROM temp_1d.{temp_table_name}",
                              job_config=qjc)
        job.result()
Example #5
    def dry_run(self, query: str) -> List[SqlColumn]:
        client = self._get_client()

        logging.info(f"DataWarehouse.dry_run")
        config = QueryJobConfig()
        config.dry_run = True
        query_job = client.query(query, config)

        result = query_job.result()
        return DataWarehouse._translate_columns(result.schema)
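
A dry-run job like the one in Example #5 never executes the query, so besides the result schema it can also be used for cost estimation via `total_bytes_processed`. A minimal standalone sketch; the table name is just an example public dataset:

from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

client = bigquery.Client()
config = QueryJobConfig(dry_run=True, use_query_cache=False)

# The job returns immediately; nothing is executed or billed for a dry run.
job = client.query(
    "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013`",
    job_config=config,
)
print("This query would process {} bytes.".format(job.total_bytes_processed))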
Example #6
def execute_sync_query(project_id, query_str, bq_client=None):
    if bq_client is None:
        bq_client = bigquery.Client(project_id)
    config = QueryJobConfig()
    config.use_legacy_sql = False
    config.use_query_cache = False
    query_job = bq_client.query(query_str, job_config=config, location="EU")

    result = []
    for row in query_job:
        result.append(row)

    return result
Example #7
def bq_query_input_solid(context, sql_queries: List[str]) -> List[DataFrame]:
    query_job_config = _preprocess_config(context.solid_config.get('query_job_config', {}))

    results = []
    for sql_query in sql_queries:
        cfg = QueryJobConfig(**query_job_config) if query_job_config else None
        context.log.info(
            'executing query %s with config: %s'
            % (sql_query, cfg.to_api_repr() if cfg else '(no config provided)')
        )
        results.append(context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe())

    return results
Example #8
    def write_to_table_with_query(self, write_disposition):
        # type: (str) -> None
        """Query all rows from source table, write to destination table w/ requested disposition.

        Args:
            write_disposition: Whether to require the destination table to be empty,
                to append to it, or to overwrite (truncate) it.
        """
        job_config = QueryJobConfig()
        job_config.destination = self.destination_table.reference
        job_config.write_disposition = write_disposition
        self.bq_client.query(
            'SELECT * FROM `{}.{}.{}`'.format(self.source_table.project,
                                              self.source_table.dataset_id,
                                              self.source_table.table_id),
            job_config)
Example #9
 def _execute_query_custom(self, query):
     client = self.connection()
     con = google.cloud.bigquery.dbapi.connect(client=client)
     cursor = Cursor(con)
     try:
         cursor.execute(
             query,
             job_config=QueryJobConfig(script_options=ScriptOptions(
                 statement_timeout_ms=120000)))
     except Exception as e:
         cursor.close()
         con.close()
         raise e
     con.commit()
     try:
         result = cursor.fetchall()
     except AttributeError:
         result = None
     except:
         raise
     cursor.close()
     con.close()
     query_create_table = re.search("(?i)(?<=((create table ))).*(?= as)",
                                    query)
     if result:
         return [dict(r) for r in result]
     elif query_create_table:
         return {'execute_query': query_create_table}
     else:
         empty_list = []
         return empty_list
Example #10
def do_extract(fq_dataset, max_tables, query_project, fq_destination_table,
               fq_sample_mapping_table, cohort_sample_names_file,
               sample_map_outfile, ttl, number_of_partitions,
               probes_per_partition, extract_genotype_counts_only):
    try:
        global client
        client = bigquery.Client(project=query_project,
                                 default_query_job_config=QueryJobConfig(
                                     priority="INTERACTIVE",
                                     use_query_cache=False))

        global RAW_ARRAY_TABLE_COUNT
        RAW_ARRAY_TABLE_COUNT = max_tables
        print(f"Using {RAW_ARRAY_TABLE_COUNT} tables in {fq_dataset}...")

        cohort = get_all_samples(fq_sample_mapping_table,
                                 cohort_sample_names_file, sample_map_outfile)
        print(
            f"Discovered {len(cohort)} samples in {fq_sample_mapping_table}..."
        )

        populate_extract_table(fq_dataset, cohort, fq_destination_table, ttl,
                               number_of_partitions, probes_per_partition,
                               extract_genotype_counts_only)

        print(f"\nFinal cohort extract written to {fq_destination_table}\n")
    except Exception as err:
        print(f"Unexpected error! {err}")
        raise
    finally:
        dump_job_stats()
Example #11
    def test_execute_w_query_dry_run(self):
        from google.cloud.bigquery.job import QueryJobConfig
        from google.cloud.bigquery.schema import SchemaField
        from google.cloud.bigquery import dbapi

        connection = dbapi.connect(
            self._mock_client(
                rows=[("hello", "world", 1), ("howdy", "y'all", 2)],
                schema=[
                    SchemaField("a", "STRING", mode="NULLABLE"),
                    SchemaField("b", "STRING", mode="REQUIRED"),
                    SchemaField("c", "INTEGER", mode="NULLABLE"),
                ],
                dry_run_job=True,
                total_bytes_processed=12345,
            )
        )
        cursor = connection.cursor()

        cursor.execute(
            "SELECT a, b, c FROM hello_world WHERE d > 3;",
            job_config=QueryJobConfig(dry_run=True),
        )

        self.assertEqual(cursor.rowcount, 0)
        self.assertIsNone(cursor.description)
        rows = cursor.fetchall()
        self.assertEqual(list(rows), [])
Example #12
        def _compute_fn(context, _):
            query_job_config = _preprocess_config(context.solid_config.get('query_job_config', {}))

            # Retrieve results as pandas DataFrames
            results = []
            for sql_query in sql_queries:
                # We need to construct a new QueryJobConfig for each query.
                # See: https://bit.ly/2VjD6sl
                cfg = QueryJobConfig(**query_job_config) if query_job_config else None
                context.log.info(
                    'executing query %s with config: %s'
                    % (sql_query, cfg.to_api_repr() if cfg else '(no config provided)')
                )
                results.append(context.resources.bq.query(sql_query, job_config=cfg).to_dataframe())

            yield Result(results)
Example #13
    def execute(self,
                query,
                destination_table,
                write_disposition="WRITE_TRUNCATE",
                allow_large_results=True):
        """
        :param query: query file path
        :param destination_table: target table
        :param write_disposition: default is to replace existing table. To append: WRITE_APPEND
        :param allow_large_results: default to True
        :return:
        """
        query_configuration = QueryJobConfig()
        query_configuration.use_legacy_sql = False
        if destination_table:
            ref = TableReferenceBuilder(destination_table, self._dataset,
                                        self._project)
            query_configuration.write_disposition = write_disposition
            query_configuration.default_dataset = ref.dataset_reference
            query_configuration.destination = ref.table_reference
            query_configuration.allow_large_results = allow_large_results

        sql_query = self.__get_query(query)
        if not self._quiet:
            print("-- #### {}\n{}\n".format(destination_table or "",
                                            sql_query))

        self._query_job = bigquery.Client(project=self._project).query(
            sql_query, job_config=query_configuration)
        if self._query_job.errors:
            raise Exception(self._query_job.errors)
Example #14
    def _solid(context):  # pylint: disable=unused-argument
        query_job_config = _preprocess_config(context.solid_config.get("query_job_config", {}))

        # Retrieve results as pandas DataFrames
        results = []
        for sql_query in sql_queries:
            # We need to construct a new QueryJobConfig for each query.
            # See: https://bit.ly/2VjD6sl
            cfg = QueryJobConfig(**query_job_config) if query_job_config else None
            context.log.info(
                "executing query %s with config: %s"
                % (sql_query, cfg.to_api_repr() if cfg else "(no config provided)")
            )
            results.append(
                context.resources.bigquery.query(sql_query, job_config=cfg).to_dataframe()
            )

        return results
Example #15
    def start_async_job(self, query, dest_path=None):
        # type: (str, Optional[str]) -> QueryJob
        """
        Makes a QueryJob for the given query to be written to the dest_path, and starts it,
        returning the job.

        Args:
            query: The query string to run.
            dest_path: String of the path to the destination table. It's None if the query is a
                Data Definition Language (DDL) statement (CREATE/ALTER/DROP tables), because the
                destination table is already specified in a DDL query.

        Returns:
            A QueryJob instance for the job

        Raises:
            ValueError: If dest_path is specified for a DDL query or dest_path is missing for a
            non-DDL query.
        """

        is_ddl_query = any(query.strip().upper().startswith(ddl_op)
                           for ddl_op in ['CREATE', 'ALTER', 'DROP'])

        # Make an asynchronous job and start it.
        config = QueryJobConfig()
        if dest_path:
            if is_ddl_query:
                raise ValueError(
                    'Cannot specify destination path for the DDL query below:\n '
                    + query)
            # allow_large_results requires destination to be specified
            config.allow_large_results = True
            config.destination = (
                self.get_table_reference_from_path(dest_path))

        elif not is_ddl_query:
            raise ValueError('Destination table is not specified, '
                             'but the query below is not a DDL statement:\n ' +
                             query)

        return self._run_async_query(query, config)
Example #16
    def _configure_gcp_client(self, query_job_config):
        """Configure GCP client."""
        logger.info('Storing BigQuery Auth Credentials')
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SETTINGS.bigquery_credentials_filepath
        logger.info('Creating new query job configuration')
        if query_job_config:
            self.query_job_config = query_job_config
        else:
            self.query_job_config = QueryJobConfig(use_legacy_sql=False,
                                                   use_query_cache=True)

        self.client = Client(default_query_job_config=self.query_job_config)
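
When a `Client` is built with `default_query_job_config`, as in the snippet above, those settings apply to every query issued through that client; a per-call `QueryJobConfig` is merged with the defaults and only overrides the fields it explicitly sets. A short sketch of that behaviour (assumes application-default credentials):

from google.cloud.bigquery import Client, QueryJobConfig

client = Client(default_query_job_config=QueryJobConfig(use_legacy_sql=False,
                                                        use_query_cache=True))

# No per-call config needed; the client-level defaults are applied.
rows = list(client.query("SELECT 1 AS x"))

# A per-call config is merged with the defaults; here only dry_run is overridden.
dry_run_job = client.query("SELECT 1 AS x", job_config=QueryJobConfig(dry_run=True))
print(dry_run_job.total_bytes_processed)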
Example #17
    def test_insert_rows(self):
        # type: () -> None
        dataset_ref = DatasetReference('my_project', 'my_dataset')
        dataset = Dataset(dataset_ref)
        table1_ref = TableReference(dataset_ref, 'table1')
        schema = [
            SchemaField(name="a", field_type='INT64'),
            SchemaField(name="b", field_type='FLOAT64'),
        ]
        table = Table(table1_ref, schema)
        self.bq_client.create_dataset(dataset)
        self.bq_client.create_table(table)

        # Insert two rows, check that they landed
        self.assertFalse(
            self.bq_client.insert_rows(table, [{
                'a': 1,
                'b': 2.5
            }, {
                'a': 3,
                'b': 4.25
            }]))
        self.assertRowsExpected(
            self.bq_client.query(
                'SELECT * FROM `my_project.my_dataset.table1`',
                QueryJobConfig()), [[1, 2.5], [3, 4.25]])

        # Insert two more rows, check that all four rows are now present.
        self.assertFalse(
            self.bq_client.insert_rows(table, [{
                'a': 5,
                'b': 6.5
            }, {
                'a': 7,
                'b': 8.25
            }]))
        self.assertRowsExpected(
            self.bq_client.query(
                'SELECT * FROM `my_project.my_dataset.table1`',
                QueryJobConfig()), [[1, 2.5], [3, 4.25], [5, 6.5], [7, 8.25]])
Example #18
    def select_into(self, query: str, output_dataset: str,
                    output_table: str) -> bool:
        logging.info(
            f"DataWarehouse.select_into -> {output_dataset}.{output_table} ..."
        )
        client = self._get_client()

        config = QueryJobConfig()
        config.allow_large_results = True
        config.destination = f"{self.config.gcp_project}.{output_dataset}.{output_table}"
        config.create_disposition = CreateDisposition.CREATE_IF_NEEDED
        config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        query_job = client.query(query, config)

        # Execute the thing and check the result
        try:
            result = query_job.result()
            mb = int(query_job.total_bytes_processed / (1024 * 1024))
            logging.info(
                f"DataWarehouse.select_into -> {output_dataset}.{output_table}: {query_job.state} (processed {mb}mb)"
            )
            return True
        except:
            logging.error(
                f"DataWarehouse.select_into -> Exception: \n\t{query_job.error_result.message}"
            )
            return False
Example #19
 def select_insert(self,
                   source_table_id,
                   destination_table_id,
                   query_field,
                   prefix='    ',
                   fg='yellow'):
     query = 'SELECT {query_field} FROM {dataset_id}.{source_table_id}'.format(
         query_field=query_field,
         dataset_id=self._dataset_ref.dataset_id,
         source_table_id=source_table_id)
     destination_table = self.dataset.table(destination_table_id)
     job_config = QueryJobConfig()
     job_config.use_legacy_sql = False
     job_config.use_query_cache = False
     job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
     job_config.destination = destination_table
     job = self._client.query(query, job_config)
     echo('Inserting... {0}'.format(job.job_id),
          prefix=prefix,
          fg=fg,
          no_color=self.no_color)
     echo('  {0}'.format(job.query),
          prefix=prefix,
          fg=fg,
          no_color=self.no_color)
     job.result()
     assert job.state == 'DONE'
     error_result = job.error_result
     if error_result:
         raise RuntimeError(job.errors)
Example #20
    def __init__(
        self,
        *,
        from_: Union[BaseResourceLoader, str],
        bqtk_config: BQTestKitConfig,
        location: Optional[str] = None,
        bq_client: Client,
        job_config: QueryJobConfig = None,
        project: Project = None,
        interpolators: List[BaseInterpolator] = None,
        global_dict: Dict[str, Any] = None,
        temp_tables: List[Tuple[BaseDataLiteralTransformer,
                                TableResources]] = None,
        temp_technical_column_prefix: str = DEFAULT_TECHNICAL_COLUMN_PREFIX
    ) -> None:
        """Constructor of BQQueryTemplate

        Args:
            from_ (Union[BaseResourceLoader, str]): query to load from.
            bqtk_config (BQTestKitConfig): config used across the query DSL.
            bq_client (Client): instance of bigquery client to use across the DSL.
            location (Optional[str], optional): force location for dataset. Defaults to the location extracted from bqtk_config.
            job_config (QueryJobConfig, optional): Configure job. Defaults to QueryJobConfig().
            project (Project, optional): project in which this query should be run.
                Allows usage of relative table name. Defaults to None.
            interpolators (List[BaseInterpolator], optional): List of interpolators to use before running the query.
                Defaults to None.
            global_dict (Dict[str, Any], optional): global dictionary to mix with local interpolator's dictionary.
                Defaults to None.
            temp_tables (List[Tuple[BaseDataLiteralTransformer, TableResources]]):
                list of all tables to create as temp tables with a data literal.
                Defaults to None.
            temp_technical_column_prefix (str):
                prefix used when renaming partition columns which are invalid in BigQuery.
                Defaults to bq_test_kit.constants.DEFAULT_TECHNICAL_COLUMN_PREFIX.
        """
        self.from_ = from_
        self._bq_client = bq_client
        self.job_config = job_config if job_config else QueryJobConfig()
        self.location = location if location else bqtk_config.get_default_location(
        )
        self.project = project
        self.interpolators = interpolators if interpolators else []
        self.bqtk_config = bqtk_config
        self.global_dict = global_dict if global_dict else {}
        self.temp_tables = ([
            self._to_temp_tables_with_schema_field(temp_table)
            for temp_table in temp_tables
        ] if temp_tables else [])
        self.temp_technical_column_prefix = temp_technical_column_prefix
Example #21
    def test_write_query_result_write_disposition_append(self):
        # type: () -> None

        # You can write into destination_table with WRITE_APPEND
        self.bq_client.create_table(self.destination_table)
        self.write_to_table_with_query('WRITE_APPEND')

        # And you can do it again
        self.write_to_table_with_query('WRITE_APPEND')

        self.assertRowsExpected(
            self.bq_client.query(
                'SELECT * FROM `my_project.my_dataset.destination_table`',
                QueryJobConfig()), [[1, 2.5], [3, 4.25], [1, 2.5], [3, 4.25]])
Example #22
    def test_write_query_result_write_disposition_empty(self):
        # type: () -> None
        # You can write into destination_table with WRITE_EMPTY because it's empty
        # Note we do not create the table first; write_empty creates the table.
        self.write_to_table_with_query('WRITE_EMPTY')

        # ... but you can't do that again, because now it's not
        with self.assertRaisesRegexp(ValueError, 'trying to overwrite nonempty table'):
            self.write_to_table_with_query('WRITE_EMPTY')

        self.assertRowsExpected(
                self.bq_client.query('SELECT * FROM `my_project.my_dataset.destination_table`',
                                     QueryJobConfig()),
                [[1, 2.5], [3, 4.25]])
Example #23
    def test_write_query_result_write_disposition_truncate(self):
        # type: () -> None

        self.bq_client.create_table(self.destination_table)
        # Stick a row into destination table
        self.assertFalse(self.bq_client.insert_rows(self.destination_table, [{'a': 5, 'b': 6}]))

        # Overwrite destination_table with the data from source_table with WRITE_TRUNCATE
        self.write_to_table_with_query('WRITE_TRUNCATE')

        self.assertRowsExpected(
                self.bq_client.query('SELECT * FROM `my_project.my_dataset.destination_table`',
                                     QueryJobConfig()),
                [[1, 2.5], [3, 4.25]])
Example #24
def materialize_view(  # pylint: disable=too-many-arguments, too-many-locals
    client: bigquery.Client,
    source_view_name: str,
    destination_table_name: str,
    project: str,
    source_dataset: str,
    destination_dataset: str,
) -> QueryJob:
    query = get_select_all_from_query(source_view_name,
                                      project=project,
                                      dataset=source_dataset)
    LOGGER.info("materializing view: %s.%s -> %s.%s", source_dataset,
                source_view_name, destination_dataset, destination_table_name)
    LOGGER.debug("materialize_view: %s=%s", destination_table_name, [query])

    start = time.perf_counter()
    dataset_ref = client.dataset(destination_dataset)
    destination_table_ref = dataset_ref.table(destination_table_name)

    job_config = QueryJobConfig()
    job_config.destination = destination_table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    query_job = client.query(query, job_config=job_config)
    # getting the result will make sure that the query ran successfully
    result: bigquery.table.RowIterator = query_job.result()
    duration = time.perf_counter() - start
    total_bytes_processed = query_job.total_bytes_processed
    LOGGER.info(
        'materialized view: %s.%s, total rows: %s, %s bytes processed, took: %.3fs',
        source_dataset, source_view_name, result.total_rows,
        total_bytes_processed, duration)
    if LOGGER.isEnabledFor(logging.DEBUG):
        sample_result = list(islice(result, 3))
        LOGGER.debug("sample_result: %s", sample_result)
    return MaterializeViewResult(total_bytes_processed=total_bytes_processed,
                                 total_rows=result.total_rows)
Example #25
    def get_query_results(self, query, use_legacy_sql=False, max_wait_secs=None):
        # type: (str, Optional[bool], Optional[int]) -> List[Tuple[Any]]
        """Returns a list of rows, each of which is a tuple of values.

        Args:
            query: A string with a complete SQL query.
            use_legacy_sql: Whether to use legacy SQL
            max_wait_secs: The maximum number of seconds to wait for the query to complete. If not
                set, the class default will be used.

        Returns:
            A list of tuples of values.
        """
        config = QueryJobConfig()
        if self.maximum_billing_tier:
            config.maximum_billing_tier = self.maximum_billing_tier

        config.use_legacy_sql = use_legacy_sql

        query_job = self.gclient.query(query, job_config=config, retry=self.default_retry)

        rows = query_job.result(retry=self.default_retry,
                                timeout=max_wait_secs or self.max_wait_secs)
        return [x.values() for x in list(rows)]
Example #26
def bq_to_df(query, spark_session=None):
    import time

    temp_table_name = f"bq_to_df__{str(int(time.time()))}"
    temp_dataset = "temp_1d"
    jc = QueryJobConfig(
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        destination=f"sktaic-datahub.{temp_dataset}.{temp_table_name}",
    )
    bq_client = get_bigquery_client()
    job = bq_client.query(query, job_config=jc)
    job.result()

    return _bq_table_to_df(temp_dataset, temp_table_name, "*", spark_session=spark_session)
Example #27
def explore_visits_by_hour(context):
    query_job_config = QueryJobConfig(
        destination='%s.aggregations.explore_visits_per_hour' % PROJECT_ID,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_TRUNCATE',
    )

    sql = '''
   SELECT FORMAT_DATETIME("%F %H:00:00", DATETIME(TIMESTAMP_SECONDS(CAST(timestamp AS INT64)))) AS ts,
          COUNT(1) AS num_visits
     FROM events.events
    WHERE url = '/explore'
 GROUP BY ts
 ORDER BY ts ASC
'''
    context.resources.bigquery.query(sql, job_config=query_job_config)
Example #28
    def create_table_from_query(
        self,
        query,  # type: str
        table_path,  # type: str
        write_disposition='WRITE_EMPTY',  # type: Optional[str]
        use_legacy_sql=False,  # type: Optional[bool]
        max_wait_secs=None,  # type: Optional[int]
        expected_schema=None  # type: Optional[List[SchemaField]]
    ):
        # type: (...) -> None
        """Creates a table in BigQuery from a specified query.

        Args:
          query: The query to run.
          table_path: The path to the table (in the client's project) to write
              the results to.
          write_disposition: Specifies behavior if table already exists. See options here:
              https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs under
              configuration.query.writeDisposition
          use_legacy_sql: Whether the query is written in standard or legacy sql.
          max_wait_secs: Seconds to wait for the query before timing out. If not
                set, the class default will be used.
          expected_schema: The expected schema of the resulting table; unused in this implementation
        """

        if write_disposition not in [
                'WRITE_TRUNCATE', 'WRITE_APPEND', 'WRITE_EMPTY'
        ]:
            raise ValueError(
                'write_disposition must be one of WRITE_TRUNCATE, '
                'WRITE_APPEND, or WRITE_EMPTY')

        config = QueryJobConfig()
        if self.maximum_billing_tier:
            config.maximum_billing_tier = self.maximum_billing_tier
        config.use_legacy_sql = use_legacy_sql
        config.write_disposition = write_disposition
        config.allow_large_results = True

        config.destination = self.get_table_reference_from_path(table_path)

        query_job = self._run_async_query(query, job_config=config)

        return self._wait_for_job(query_job,
                                  query,
                                  max_wait_secs=max_wait_secs
                                  or self.max_wait_secs)
Example #29
def do_extract(fq_pet_vet_dataset, max_tables, fq_cohort_sample_names,
               query_project, fq_temp_table_dataset, fq_destination_dataset,
               destination_table, min_variant_samples,
               fq_sample_mapping_table):
    try:

        global client
        client = bigquery.Client(
            project=query_project,
            default_query_job_config=QueryJobConfig(
                labels={"id": f"test_cohort_export_{output_table_prefix}"},
                priority="INTERACTIVE",
                use_query_cache=False))

        ## TODO -- provide a cmdline arg to override this (so we can simulate smaller datasets)
        global PET_VET_TABLE_COUNT
        PET_VET_TABLE_COUNT = max_tables
        print(
            f"Using {PET_VET_TABLE_COUNT} PET tables in {fq_pet_vet_dataset}..."
        )

        cohort = get_all_samples(fq_cohort_sample_names,
                                 fq_sample_mapping_table)
        print(
            f"Discovered {len(cohort)} samples in {fq_cohort_sample_names}...")

        make_new_vet_union_all(fq_pet_vet_dataset, fq_temp_table_dataset,
                               cohort)

        create_position_table(fq_temp_table_dataset, min_variant_samples)
        make_new_pet_union_all(fq_pet_vet_dataset, fq_temp_table_dataset,
                               cohort)
        populate_final_extract_table(fq_temp_table_dataset,
                                     fq_destination_dataset, destination_table,
                                     fq_sample_mapping_table)
    except Exception as err:
        print(err)

    dump_job_stats()
    print(
        f"\nFinal cohort extract written to {fq_destination_dataset}.{destination_table}\n"
    )
Example #30
 def _execute(self):
     client = self._get_client()
     job_id = self._get_job_id(with_unique_suffix=True)
     destination_table = self._get_full_table_name()
     query = self._params['query'].strip()
     location = self._params['bq_dataset_location']
     try:
         job = client.get_job(job_id)
     except exceptions.NotFound:
         if self._params['overwrite']:
             write_disposition = 'WRITE_TRUNCATE'
         else:
             write_disposition = 'WRITE_APPEND'
         job_config = QueryJobConfig(destination=destination_table,
                                     write_disposition=write_disposition)
         job = client.query(query=query,
                            job_id=job_id,
                            location=location,
                            job_config=job_config)
     self._wait(job)