Example 1
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: reference to source dataset object
    :param dst_dataset_id: reference to destination dataset object
    :return: List of BQ job_ids
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
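A stripped-down version of the core pattern here (writing a query's results into a specific table via QueryJobConfig.destination) might look like the sketch below; the project, dataset, and table IDs are placeholders and the client is assumed to pick up application-default credentials:

from google.cloud import bigquery

client = bigquery.Client()  # assumes application-default credentials and project
job_config = bigquery.QueryJobConfig(
    destination='my-project.my_dataset.my_table',  # placeholder table ID
    write_disposition='WRITE_TRUNCATE',            # overwrite any existing contents
)
job = client.query('SELECT 1 AS x', job_config=job_config)
job.result()  # block until the destination table has been written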
Example 2
def schema_upgrade_cdm52_to_cdm531(project_id,
                                   dataset_id,
                                   snapshot_dataset_id,
                                   hpo_id=None):
    """
   :param project_id:
   :param dataset_id: Dataset to convert
   :param snapshot_dataset_id: Dataset with converted tables. Overwritten if tables already exist
   :param hpo_id: Identifies the hpo_id of the site
   :return:
    """
    # Create dataset if not exists
    client = bq.get_client(project_id)
    client.create_dataset(snapshot_dataset_id, exists_ok=True)

    sq.create_empty_cdm_tables(snapshot_dataset_id, hpo_id)

    copy_table_job_ids = []
    tables = [table.table_id for table in list(client.list_tables(dataset_id))]
    if hpo_id:
        hpo_tables = [
            resources.get_table_id(table, hpo_id)
            for table in resources.CDM_TABLES + PII_TABLES
        ]
        # Filter tables that do not exist
        tables = [table for table in hpo_tables if table in tables]
    for table_id in tables:
        q = get_upgrade_table_query(client, dataset_id, table_id, hpo_id)
        job_config = QueryJobConfig()
        job_config.destination = f'{client.project}.{snapshot_dataset_id}.{table_id}'
        job_config.use_legacy_sql = False
        job = client.query(q, job_config)
        copy_table_job_ids.append(job.job_id)
        job.result()
    return copy_table_job_ids
Example 3
    def test_fill_from_default_conflict(self):
        from google.cloud.bigquery import QueryJobConfig

        basic_job_config = QueryJobConfig()
        conflicting_job_config = self._make_one("conflicting_job_type")
        self.assertNotEqual(basic_job_config._job_type,
                            conflicting_job_config._job_type)

        with self.assertRaises(TypeError):
            basic_job_config._fill_from_default(conflicting_job_config)
Example 4
    def test_create_udf(self, udf_path):
        client = self._client
        bq_test_dataset = utils.get_target_bq_dataset(udf_path)

        job_config = QueryJobConfig()
        job_config.default_dataset = (f'{client.project}.{bq_test_dataset}')
        try:
            udf_sql = utils.replace_with_test_datasets(udf_path,
                                                       client.project)
            udf_creation_result = client.query(udf_sql,
                                               job_config=job_config).result()
            self.assertIsInstance(udf_creation_result, _EmptyRowIterator)
        except GoogleAPICallError as e:
            self.fail(e.message)
Example 5
def bq_create_table_as_select(google_client, dataset_id, table_name, query):

    table_ref = google_client.dataset(dataset_id).table(table_name)
    job_config = QueryJobConfig()

    job_config.destination = table_ref
    job_config.write_disposition = "WRITE_TRUNCATE"
    query_job = google_client.query(query=query, job_config=job_config)
    retry_count = 100
    while retry_count > 0 and query_job.state != 'DONE':
        retry_count -= 1
        time.sleep(3)
        query_job.reload()  # API call
        logging.info("job state : %s " % (query_job.state))
    logging.info("job state : %s at %s" % (query_job.state, query_job.ended))
Example 6
    def test_create_udf(self, udf_path):
        client = bigquery.Client()
        bq_test_dataset = Utils.get_target_bq_dataset(udf_path)
        client.create_dataset(bq_test_dataset, exists_ok=True)

        job_config = QueryJobConfig()
        job_config.default_dataset = (f'{client.project}.{bq_test_dataset}')
        with open(udf_path) as udf_file:
            try:
                udf_sql = Utils.replace_with_test_datasets(
                    udf_path, client.project)
                udf_creation_result = client.query(
                    udf_sql, job_config=job_config).result()
                self.assertIsInstance(udf_creation_result, _EmptyRowIterator)
            except GoogleAPICallError as e:
                self.fail(e.message)
Example 7
    def test_create_udf_signature(self, udf_path):
        client = self._client
        bq_test_dataset = utils.get_target_bq_dataset(udf_path)

        job_config = QueryJobConfig()
        job_config.default_dataset = (f'{client.project}.{bq_test_dataset}')
        udf_signature = utils.extract_udf_signature(udf_path)
        udf_sql = utils.replace_with_test_datasets(
            project_id=client.project,
            udf_sql=f'CREATE OR REPLACE FUNCTION {udf_signature} AS (NULL)')
        try:
            udf_creation_result = client.query(udf_sql,
                                               job_config=job_config).result()
            self.assertIsInstance(udf_creation_result, _EmptyRowIterator)
        except GoogleAPICallError as e:
            self.fail(e.message)
Example 8
 def __init__(self, dataset: str):
     qjc: QueryJobConfig = QueryJobConfig(
         default_dataset=f"{os.environ.get('PROJECT')}.{dataset}")
     self.client = bigquery.Client(default_query_job_config=qjc)
     self.dataset = dataset
     self.dcl = "datasetchangelog"
     self.destination = f"{self.client.project}.{self.dataset}.{self.dcl}"
     self.logger = logging.getLogger(__name__)
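Because default_dataset is set on the client's default job config, queries issued through self.client can name tables without a project or dataset prefix. A small sketch of that effect, with placeholder IDs:

from google.cloud import bigquery

qjc = bigquery.QueryJobConfig(default_dataset='my-project.my_dataset')  # placeholder dataset
client = bigquery.Client(default_query_job_config=qjc)
# 'my_table' resolves to my-project.my_dataset.my_table thanks to default_dataset
rows = client.query('SELECT COUNT(*) AS n FROM my_table').result()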
Example 9
    def test_fill_from_default(self):
        from google.cloud.bigquery import QueryJobConfig

        job_config = QueryJobConfig()
        job_config.dry_run = True
        job_config.maximum_bytes_billed = 1000

        default_job_config = QueryJobConfig()
        default_job_config.use_query_cache = True
        default_job_config.maximum_bytes_billed = 2000

        final_job_config = job_config._fill_from_default(default_job_config)
        self.assertTrue(final_job_config.dry_run)
        self.assertTrue(final_job_config.use_query_cache)
        self.assertEqual(final_job_config.maximum_bytes_billed, 1000)
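_fill_from_default is the private merge behind Client(default_query_job_config=...): values set on the per-query config win, and anything left unset falls back to the default. A sketch of the public-facing behaviour, with a placeholder byte cap:

from google.cloud import bigquery

default_config = bigquery.QueryJobConfig(maximum_bytes_billed=10 * 1024 ** 3)  # 10 GB cap
client = bigquery.Client(default_query_job_config=default_config)
# This job runs with use_query_cache=False *and* inherits the 10 GB cap from the default.
job = client.query('SELECT 1', job_config=bigquery.QueryJobConfig(use_query_cache=False))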
Example 10
def load(project_id,
         bq_client,
         src_dataset_id,
         dst_dataset_id,
         overwrite_ok=False):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id:
    :param bq_client:
    :param src_dataset_id:
    :param dst_dataset_id:
    :param overwrite_ok: if True and the dest dataset already exists the dataset is recreated
    :return:
    """
    if overwrite_ok:
        bq_client.delete_dataset(dst_dataset_id,
                                 delete_contents=True,
                                 not_found_ok=True)
    bq_client.create_dataset(dst_dataset_id)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
    return query_jobs
Example 11
    def process(self, app_numbers):
        sql = """
            SELECT application_number, application_kind, grant_date
            FROM `patents-public-data.patents.publications`
            WHERE
            country_code = @us_country_code
            AND application_number IN UNNEST(@application_numbers)
            AND IF (
                publication_date >= @wipo_kind_codes_from,
                kind_code IN UNNEST(@wipo_patent_publication_codes),
                kind_code = @uspto_patent_publication_code
            );
        """

        job_config = QueryJobConfig(query_parameters=[
            ScalarQueryParameter(
                'us_country_code',
                'STRING',
                US_COUNTRY_CODE,
            ),
            ArrayQueryParameter(
                'application_numbers',
                'STRING',
                app_numbers,
            ),
            ScalarQueryParameter(
                'wipo_kind_codes_from',
                'INT64',
                WIPO_KIND_CODES_FROM,
            ),
            ArrayQueryParameter(
                'wipo_patent_publication_codes',
                'STRING',
                WIPO_PATENT_PUBLICATION_CODES,
            ),
            ScalarQueryParameter(
                'uspto_patent_publication_code',
                'STRING',
                USPTO_PATENT_PUBLICATION_CODE,
            ),
        ])
        query = self.storage_client.query(sql, job_config=job_config)

        logging.info('Executing query for publications')
        iterator = query.result()

        return iterator
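The @-prefixed names in the SQL above are bound entirely through QueryJobConfig.query_parameters; a stripped-down sketch of the same mechanism (parameter values are placeholders):

from google.cloud import bigquery
from google.cloud.bigquery import ArrayQueryParameter, QueryJobConfig, ScalarQueryParameter

client = bigquery.Client()  # assumes application-default credentials and project
job_config = QueryJobConfig(query_parameters=[
    ScalarQueryParameter('country', 'STRING', 'US'),
    ArrayQueryParameter('numbers', 'STRING', ['US-1234-A', 'US-5678-B']),  # placeholder values
])
sql = 'SELECT @country AS country, n FROM UNNEST(@numbers) AS n'
for row in client.query(sql, job_config=job_config).result():
    print(row.country, row.n)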
Example 12
    def run_query(self,
                  query,
                  job_id_prefix=None,
                  wait=False,
                  use_query_cache=True):

        job_config = QueryJobConfig(use_query_cache=use_query_cache)

        query_job = self.bq_client.query(query,
                                         job_id_prefix=job_id_prefix,
                                         job_config=job_config)
        print(f'Job {query_job.job_id} started.')
        query_job.add_done_callback(self.done_cb)
        if wait:
            query_job.result()

        return query_job
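add_done_callback hands the finished job (a future) to the registered function once it completes. done_cb itself is not shown in this snippet, so the following is only an assumed shape for such a callback:

def done_cb(job):
    # Receives the completed QueryJob; state, errors and statistics are available here.
    print(f'Job {job.job_id} finished with state {job.state}')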
Example 13
 def _set_maximum_bytes_billed(
     self, job_config: bigquery.QueryJobConfig, maximum_gbytes_billed: Optional[float]
 ) -> bigquery.QueryJobConfig:
     """クエリの処理容量制限を設定する
     GBからByteに変換して登録する
     Arguments:
         job_config {bigquery.QueryJobConfig} -- ジョブ設定
         maximum_gbytes_billed {Optional[float]} -- GB上限
     Returns:
         bigquery.QueryJobConfig -- 設定ずみジョブ設定
     """
     maximum_gbytes_billed = (
         maximum_gbytes_billed if self.maximum_gbytes_billed is None else self.maximum_gbytes_billed
     )
     if maximum_gbytes_billed is not None:
         maximum_bytes_billed = int(maximum_gbytes_billed * 1024 ** 3)
         job_config.maximum_bytes_billed = maximum_bytes_billed
     return job_config
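Concretely, the GB-to-bytes conversion means a 0.5 GB cap becomes 536870912 bytes, and a query expected to bill more than that fails instead of running. A minimal sketch (the public dataset is just an example table):

from google.cloud import bigquery

client = bigquery.Client()  # assumes application-default credentials and project
job_config = bigquery.QueryJobConfig()
job_config.maximum_bytes_billed = int(0.5 * 1024 ** 3)  # 0.5 GB -> 536870912 bytes
job = client.query('SELECT * FROM `bigquery-public-data.samples.shakespeare`',
                   job_config=job_config)
rows = job.result()  # the job is rejected if it would bill more than the cap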
Example 14
def test_serialization():
    bigquery_task = BigQueryTask(
        name="flytekit.demo.bigquery_task.query",
        inputs=kwtypes(ds=str),
        task_config=BigQueryConfig(
            ProjectID="Flyte", Location="Asia", QueryJobConfig=QueryJobConfig(allow_large_results=True)
        ),
        query_template=query_template,
        output_structured_dataset_type=StructuredDataset,
    )

    @workflow
    def my_wf(ds: str) -> StructuredDataset:
        return bigquery_task(ds=ds)

    default_img = Image(name="default", fqn="test", tag="tag")
    serialization_settings = SerializationSettings(
        project="proj",
        domain="dom",
        version="123",
        image_config=ImageConfig(default_image=default_img, images=[default_img]),
        env={},
    )

    task_spec = get_serializable(OrderedDict(), serialization_settings, bigquery_task)

    assert "SELECT * FROM `bigquery-public-data.crypto_dogecoin.transactions`" in task_spec.template.sql.statement
    assert "@version" in task_spec.template.sql.statement
    assert task_spec.template.sql.dialect == task_spec.template.sql.Dialect.ANSI
    s = Struct()
    s.update({"ProjectID": "Flyte", "Location": "Asia", "allowLargeResults": True})
    assert task_spec.template.custom == json_format.MessageToDict(s)
    assert len(task_spec.template.interface.inputs) == 1
    assert len(task_spec.template.interface.outputs) == 1

    admin_workflow_spec = get_serializable(OrderedDict(), serialization_settings, my_wf)
    assert admin_workflow_spec.template.interface.outputs["o0"].type.structured_dataset_type is not None
    assert admin_workflow_spec.template.outputs[0].var == "o0"
    assert admin_workflow_spec.template.outputs[0].binding.promise.node_id == "n0"
    assert admin_workflow_spec.template.outputs[0].binding.promise.var == "results"
Example 15
 def run_sql(self, query: str) -> BigQueryRows:
     bigquery = self._bigquery(self.credentials.project_id)
     if log.isEnabledFor(logging.DEBUG):
         log.debug('Query (%r characters total): %r', len(query),
                   self._trunc_query(query))
     if config.bigquery_batch_mode:
         job_config = QueryJobConfig(priority=QueryPriority.BATCH)
         job: QueryJob = bigquery.query(query, job_config=job_config)
         result = job.result()
     else:
         delays = (10, 20, 40, 80)
         assert sum(delays) < config.contribution_lambda_timeout(
             retry=False)
         for attempt, delay in enumerate((*delays, None)):
             job: QueryJob = bigquery.query(query)
             try:
                 result = job.result()
             except (Forbidden, InternalServerError,
                     ServiceUnavailable) as e:
                 if delay is None:
                     raise e
                 elif isinstance(
                         e, Forbidden
                 ) and 'Exceeded rate limits' not in e.message:
                     raise e
                 else:
                     log.warning(
                         'BigQuery job error during attempt %i/%i. Retrying in %is.',
                         attempt + 1,
                         len(delays) + 1,
                         delay,
                         exc_info=e)
                     sleep(delay)
             else:
                 break
         else:
             assert False
     if log.isEnabledFor(logging.DEBUG):
         log.debug('Job info: %s', json.dumps(self._job_info(job)))
     return result
Example 16
 def __create_job_config(
         self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
     job_config = QueryJobConfig()
     job_config.priority = ems_query_job_config.priority.value
     job_config.use_legacy_sql = False
     job_config.use_query_cache = ems_query_job_config.use_query_cache
     job_config.labels = ems_query_job_config.labels
     if ems_query_job_config.destination_table is not None:
         job_config.time_partitioning = TimePartitioning("DAY")
         table_reference = TableReference(
             DatasetReference(
                 ems_query_job_config.destination_project_id
                 or self.__project_id,
                 ems_query_job_config.destination_dataset),
             ems_query_job_config.destination_table)
         job_config.destination = table_reference
         job_config.write_disposition = ems_query_job_config.write_disposition.value
         job_config.create_disposition = ems_query_job_config.create_disposition.value
     partitioning = ems_query_job_config.time_partitioning
     if partitioning is not None:
         job_config.time_partitioning = TimePartitioning(
             partitioning.type.value, partitioning.field,
             partitioning.expiration_ms,
             partitioning.require_partition_filter)
     if ems_query_job_config.table_definitions is not None:
         job_config.table_definitions = ems_query_job_config.table_definitions
     return job_config
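For reference, the building blocks used above can also be set directly: a fully qualified destination assembled from DatasetReference/TableReference plus day partitioning on the result table. The IDs and partition column below are placeholders:

from google.cloud.bigquery import (DatasetReference, QueryJobConfig, TableReference,
                                   TimePartitioning, TimePartitioningType)

job_config = QueryJobConfig()
job_config.destination = TableReference(
    DatasetReference('my-project', 'my_dataset'), 'my_table')  # placeholder IDs
job_config.time_partitioning = TimePartitioning(type_=TimePartitioningType.DAY,
                                                field='created_at')  # placeholder column
job_config.write_disposition = 'WRITE_TRUNCATE'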
Example 17
 def _config_query(self, use_legacy_sql):
     job_config = QueryJobConfig()
     job_config.destination = self.temp_table
     job_config.use_legacy_sql = use_legacy_sql
     job_config.allow_large_results = True
     return job_config
Example 18
def load_from_github(repository_name):
    """
    Load repository information from Github such as pull requests and contributors
    :param repository_name:
    :return:
    """
    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Loading from GitHub")
    repo_api = get_repo_api(repository_name)

    try:
        full_name = repo_api.full_name
    except UnknownObjectException:
        return None
    repo = Repository(id=repo_api.id, full_name=full_name, name=repo_api.name)
    repo.language, created = Language.objects.get_or_create(
        name=repo_api.language)
    repo.save()
    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Getting contributors")
    contributor_counter = len(list(repo_api.get_contributors()))
    repo.contributors_count = contributor_counter

    if config.GET_CONTRIBUTORS_DATA:
        for contrib in repo_api.get_contributors():
            contributor_counter += 1
            try:
                contributor_db = Contributor.objects.get(
                    login__exact=contrib.login)
            except ObjectDoesNotExist:
                contributor_db = Contributor()
                contributor_db.login = contrib.login
                contributor_db.followers_count = contrib.followers
                contributor_db.url = contrib.html_url
                contributor_db.save()

            contribution_db = Contribution(repository=repo,
                                           contributor=contributor_db,
                                           amount=contrib.contributions)
            contribution_db.save()

    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Getting pull request data")

    if config.USE_BIGQUERY:
        bigquery_client: bigquery.Client = bigquery.Client.from_service_account_json(
            "socialpatterns-c03d755a739c.json")
        repo_url_bigquery = repo_api.html_url.replace("github.com",
                                                      "api.github.com/repos")
        query_config = QueryJobConfig()
        query_config.use_legacy_sql = False
        query_text = """ SELECT Count(*) AS Pull_Request , (SELECT Count(*) FROM `ghtorrent-bq.ght_2018_04_01.issue_comments`        WHERE  issue_id IN (SELECT id  FROM   `ghtorrent-bq.ght_2018_04_01.issues`  WHERE  pull_request_id IN (SELECT id FROM   `ghtorrent-bq.ght_2018_04_01.pull_requests` WHERE   base_repo_id = (SELECT id FROM   `ghtorrent-bq.ght_2018_04_01.projects` AS pj WHERE  pj.url ="%s" LIMIT 1  ))   )) AS Comments  FROM   `ghtorrent-bq.ght_2018_04_01.pull_requests` WHERE  base_repo_id =   (SELECT id   FROM   `ghtorrent-bq.ght_2018_04_01.projects` AS pj   WHERE  pj.url="%s" LIMIT 1   )  """ % (
            repo_url_bigquery, repo_url_bigquery)
        query_job = bigquery_client.query(query_text, job_config=query_config)
        result_row = list(query_job.result())[0]
        pr_number = result_row[0]
        comments = result_row[1]
    else:
        if config.CHECK_CLOSED_PR:
            pull_requests = repo_api.get_pulls(state="all")
        else:
            pull_requests = repo_api.get_pulls()

        pr_number = len(list(pull_requests))
        comments = 0

        for pr in pull_requests:
            try:
                comments += pr.comments
            except ssl.SSLError:
                logger.error("Read timeout when getting comments")
    repo.comments_count = comments
    repo.pull_request_count = pr_number
    repo.save()
    return repo
Example 19
    def run_query(self, query, **kwargs):
        from google.auth.exceptions import RefreshError
        from google.cloud.bigquery import QueryJobConfig
        from concurrent.futures import TimeoutError

        job_config = {
            'query': {
                'useLegacySql': self.dialect == 'legacy'
                # 'allowLargeResults', 'createDisposition',
                # 'preserveNulls', destinationTable, useQueryCache
            }
        }
        config = kwargs.get('configuration')
        if config is not None:
            if len(config) != 1:
                raise ValueError("Only one job type must be specified, but "
                                 "given {}".format(','.join(config.keys())))
            if 'query' in config:
                if 'query' in config['query']:
                    if query is not None:
                        raise ValueError("Query statement can't be specified "
                                         "inside config while it is specified "
                                         "as parameter")
                    query = config['query']['query']
                    del config['query']['query']

                job_config['query'].update(config['query'])
            else:
                raise ValueError("Only 'query' job type is supported")

        self._start_timer()
        try:
            self._print('Requesting query... ', end="")
            query_reply = self.client.query(
                query,
                job_config=QueryJobConfig.from_api_repr(job_config['query']))
            self._print('ok.')
        except (RefreshError, ValueError):
            if self.private_key:
                raise AccessDenied(
                    "The service account credentials are not valid")
            else:
                raise AccessDenied(
                    "The credentials have been revoked or expired, "
                    "please re-run the application to re-authorize")
        except self.http_error as ex:
            self.process_http_error(ex)

        job_id = query_reply.job_id
        self._print('Job ID: %s\nQuery running...' % job_id)

        while query_reply.state != 'DONE':
            self.print_elapsed_seconds('  Elapsed', 's. Waiting...')

            timeout_ms = job_config['query'].get('timeoutMs')
            if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000:
                raise QueryTimeout('Query timeout: {} ms'.format(timeout_ms))

            timeout_sec = 1.0
            if timeout_ms:
                # Wait at most 1 second so we can show progress bar
                timeout_sec = min(1.0, timeout_ms / 1000.0)

            try:
                query_reply.result(timeout=timeout_sec)
            except TimeoutError:
                # Use our own timeout logic
                pass
            except self.http_error as ex:
                self.process_http_error(ex)

        if self.verbose:
            if query_reply.cache_hit:
                self._print('Query done.\nCache hit.\n')
            else:
                bytes_processed = query_reply.total_bytes_processed or 0
                bytes_billed = query_reply.total_bytes_billed or 0
                self._print('Query done.\nProcessed: {} Billed: {}'.format(
                    self.sizeof_fmt(bytes_processed),
                    self.sizeof_fmt(bytes_billed)))
                self._print('Standard price: ${:,.2f} USD\n'.format(
                    bytes_billed * self.query_price_for_TB))

            self._print('Retrieving results...')

        try:
            rows_iter = query_reply.result()
        except self.http_error as ex:
            self.process_http_error(ex)
        result_rows = list(rows_iter)
        total_rows = rows_iter.total_rows
        schema = {
            'fields': [field.to_api_repr() for field in rows_iter.schema],
        }

        # print basic query stats
        self._print('Got {} rows.\n'.format(total_rows))

        return schema, result_rows
Example 20
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            table.time_partitioning = TimePartitioning(
                field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'

                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge_{task}.sql'.format(
                        task=task))
                merge_sql_template = read_file(merge_sql_path)
                template_context['params']['source_table'] = temp_table_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example 21
 def dry_run_query(self, query):
     job_config = QueryJobConfig()
     job_config.dry_run = True
     job_config.use_query_cache = False
     return self.client.query(query=(query), job_config=job_config)
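A dry-run job never executes the query, but the returned job carries a cost estimate in total_bytes_processed, which is the usual reason to pair dry_run=True with use_query_cache=False. A short sketch against a public sample table:

from google.cloud import bigquery

client = bigquery.Client()  # assumes application-default credentials and project
job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
job = client.query('SELECT * FROM `bigquery-public-data.samples.shakespeare`',
                   job_config=job_config)
print(f'This query would process {job.total_bytes_processed} bytes.')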
Example 22
def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " +
                             str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition,
                                                    create_disposition)
        except AttributeError:
            raise ValueError("invalid create_disposition in url query: " +
                             create_disposition)

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None

        try:
            dest_project, dest_dataset, dest_table = query[
                "destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )

        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table)

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"])

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError("invalid boolean in url query for dry_run: " +
                             query["dry_run"])

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " +
                                 label_group)
            labels[key] = value

        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(
                query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError(
                "invalid int in url query maximum_bytes_billed: " +
                query["maximum_bytes_billed"])

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " +
                             query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(
            query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError("invalid schema_update_options in url query: " +
                             query["schema_update_options"])

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError(
            "url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError(
            "url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(
                query["use_query_cache"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for use_query_cache: " +
                query["use_query_cache"])

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(WriteDisposition,
                                                   query["write_disposition"])
        except AttributeError:
            raise ValueError("invalid write_disposition in url query: " +
                             query["write_disposition"])

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
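parse_url expects a SQLAlchemy URL object; a quick illustrative call follows (the connection string is a placeholder and make_url comes from SQLAlchemy):

from sqlalchemy.engine.url import make_url

url = make_url('bigquery://my-project/my_dataset'
               '?maximum_bytes_billed=1000000000&priority=BATCH')
(project_id, location, dataset_id, arraysize,
 credentials_path, job_config) = parse_url(url)
# job_config comes back as a QueryJobConfig with maximum_bytes_billed and priority set;
# project_id is 'my-project' and dataset_id is 'my_dataset'.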
Example 23
from datetime import datetime, timedelta, date
from google.cloud import bigquery
from google.cloud.bigquery.table import RowIterator
from google.cloud.bigquery import QueryJobConfig

billing_project = "momovn-dev"
conf = QueryJobConfig()
conf.use_query_cache = True
conf.use_legacy_sql = False
checkpointDate = None
start_date = datetime.strptime('20201002', '%Y%m%d').date()
end_date = datetime.strptime('20201002', '%Y%m%d').date()
day_count = (end_date - start_date).days + 1

for checkpointDate in (start_date + timedelta(n) for n in range(day_count)):
    try:
        # checkpointDate = datetime.strptime(single_date, '%Y%m%d').date()
        checkpointDateWithoutDash = checkpointDate.strftime("%Y%m%d")
        checkpointDateWithDash = checkpointDate.strftime("%Y-%m-%d")

        query = f"""WITH A AS( SELECT GPS.reference PHONE FROM `momovn-prod.HERMES.HERMES_LOCATIONS` GPS WHERE DATE(GPS.event_timestamp,'Asia/Bangkok') = {checkpointDateWithDash})
        SELECT COUNT(DISTINCT T1.USER_ID), 'HERMES LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1 
        LEFT JOIN A T2 
        ON T1.USER_ID = T2.PHONE 
        WHERE T2.PHONE IS NULL
        UNION ALL
        SELECT COUNT(DISTINCT T1.USER_ID), 'USER_LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1 
        LEFT JOIN `momovn-prod.HERMES.USER_LOCATIONS_{checkpointDateWithoutDash}` T2 
        ON T1.USER_ID = T2.USER_ID 
Example 24
        def enrich_task():
            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            copy_job = client.copy_table(temp_table_ref,
                                         dest_table_ref,
                                         location='US',
                                         job_config=copy_job_config)
            submit_bigquery_job(copy_job, copy_job_config)
            assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)