Ejemplo n.º 1
0
 def __create_job_config(
         self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
     job_config = QueryJobConfig()
     job_config.priority = ems_query_job_config.priority.value
     job_config.use_legacy_sql = False
     job_config.use_query_cache = ems_query_job_config.use_query_cache
     job_config.labels = ems_query_job_config.labels
     if ems_query_job_config.destination_table is not None:
         job_config.time_partitioning = TimePartitioning("DAY")
         table_reference = TableReference(
             DatasetReference(
                 ems_query_job_config.destination_project_id
                 or self.__project_id,
                 ems_query_job_config.destination_dataset),
             ems_query_job_config.destination_table)
         job_config.destination = table_reference
         job_config.write_disposition = ems_query_job_config.write_disposition.value
         job_config.create_disposition = ems_query_job_config.create_disposition.value
     partitioning = ems_query_job_config.time_partitioning
     if partitioning is not None:
         job_config.time_partitioning = TimePartitioning(
             partitioning.type.value, partitioning.field,
             partitioning.expiration_ms,
             partitioning.require_partition_filter)
     if ems_query_job_config.table_definitions is not None:
         job_config.table_definitions = ems_query_job_config.table_definitions
     return job_config
Ejemplo n.º 2
0
        def enrich_task():
            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            copy_job = client.copy_table(temp_table_ref,
                                         dest_table_ref,
                                         location='US',
                                         job_config=copy_job_config)
            submit_bigquery_job(copy_job, copy_job_config)
            assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " +
                             str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition,
                                                    create_disposition)
        except AttributeError:
            raise ValueError("invalid create_disposition in url query: " +
                             create_disposition)

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None

        try:
            dest_project, dest_dataset, dest_table = query[
                "destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )

        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table)

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"])

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError("invalid boolean in url query for dry_run: " +
                             query["dry_run"])

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " +
                                 label_group)
            labels[key] = value

        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(
                query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError(
                "invalid int in url query maximum_bytes_billed: " +
                query["maximum_bytes_billed"])

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " +
                             query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(
            query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError("invalid schema_update_options in url query: " +
                             query["schema_update_options"])

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError(
            "url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError(
            "url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(
                query["use_query_cache"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for use_query_cache: " +
                query["use_query_cache"])

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(WriteDisposition,
                                                   query["write_disposition"])
        except AttributeError:
            raise ValueError("invalid write_disposition in url query: " +
                             query["write_disposition"])

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
Ejemplo n.º 4
0
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            table.time_partitioning = TimePartitioning(
                field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'

                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge_{task}.sql'.format(
                        task=task))
                merge_sql_template = read_file(merge_sql_path)
                template_context['params']['source_table'] = temp_table_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)