Example #1
def __create_job_config(
        self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels
    if ems_query_job_config.destination_table is not None:
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id
                or self.__project_id,
                ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value
    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value, partitioning.field,
            partitioning.expiration_ms,
            partitioning.require_partition_filter)
    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions
    return job_config
Example #2
def create_table(client, dataset_ref, table_name, is_partitioned=False):
    table_ref = dataset_ref.table(table_name)
    table_obj = Table(table_ref, schema=TABLE_SCHEMA)

    if is_partitioned:
        time_partitioning = TimePartitioning()
        time_partitioning.field = 'partition_value'
        table_obj.time_partitioning = time_partitioning

    return client.create_table(table_obj)
Example #3
    def process_response_rows_for_bigquery(self, rows: list,
                                           table_reference: TableReference):
        rows_dataframe = DataFrame.from_records(rows)

        rows_dataframe = concat(
            [rows_dataframe, rows_dataframe['dimensions'].apply(Series)],
            axis=1,
            join='inner')
        rows_dataframe = rows_dataframe.drop(['dimensions'], axis=1)
        rows_dataframe['date'] = rows_dataframe['date'].apply(
            lambda x: x.date())

        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')
        job_config.schema = [
            self._get_schema_for_field(column)
            for column in list(rows_dataframe.columns.values)
        ]

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                rows_dataframe, table_reference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example #4
def setUp(self):
    self.data = [
        Observation(**dict(zip(TEST_DATA_FIELDS, row))).__dict__
        for row in TEST_DATA_ROWS
    ]
    self.client.delete_table(f'{self.dataset_id}.observation',
                             not_found_ok=True)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    # TODO figure out how to handle if clustering does NOT exist
    #      CREATE OR REPLACE fails if partitioning specs differ
    job_config.clustering_fields = ['person_id']
    job_config.time_partitioning = TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY)
    job_config.schema = Observation.SCHEMA
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    load_job = self.client.load_table_from_json(
        self.data,
        destination=f'{self.dataset_id}.{ppi_branching.OBSERVATION}',
        job_config=job_config)
    try:
        load_job.result()
    except google.api_core.exceptions.BadRequest:
        self.assertEqual(0, len(load_job.errors),
                         f'job errors={load_job.errors}')
Example #5
        def load_task():
            client = bigquery.Client()
            job_config = bigquery.LoadJobConfig()
            schema_path = os.path.join(
                dags_folder,
                'resources/stages/load/schemas/{task}.json'.format(task=task))
            job_config.schema = read_bigquery_schema_from_file(schema_path)
            job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
            job_config.write_disposition = 'WRITE_TRUNCATE'
            job_config.ignore_unknown_values = True
            job_config.time_partitioning = TimePartitioning(
                field=time_partitioning_field)

            export_location_uri = 'gs://{bucket}/export'.format(
                bucket=output_bucket)
            uri = '{export_location_uri}/{task}/*.json'.format(
                export_location_uri=export_location_uri, task=task)
            table_ref = create_dataset(
                client, dataset_name,
                destination_dataset_project_id).table(task)
            load_job = client.load_table_from_uri(uri,
                                                  table_ref,
                                                  job_config=job_config)
            submit_bigquery_job(load_job, job_config)
            assert load_job.state == 'DONE'
Example #6
def load_query_result_to_table(dest_table,
                               query,
                               part_col_name=None,
                               clustering_fields=None):
    bq_client = get_bigquery_client()
    qjc = None
    print(query)
    if bq_table_exists(dest_table):
        table = bq_client.get_table(dest_table)
        qjc = QueryJobConfig(
            destination=dest_table,
            write_disposition="WRITE_TRUNCATE",
            create_disposition="CREATE_IF_NEEDED",
            time_partitioning=table.time_partitioning,
            range_partitioning=table.range_partitioning,
            clustering_fields=table.clustering_fields,
        )
        job = bq_client.query(query, job_config=qjc)
        job.result()

    else:
        import time

        temp_table_name = f"load_query_result_to_table__{str(int(time.time()))}"
        bq_client.query(
            f"CREATE OR REPLACE TABLE temp_1d.{temp_table_name} AS {query}"
        ).result()
        if part_col_name:
            schema = bq_client.get_table(f"temp_1d.{temp_table_name}").schema
            partition_type = [
                f for f in schema if f.name.lower() == part_col_name.lower()
            ][0].field_type
            if partition_type == "DATE":
                qjc = QueryJobConfig(
                    destination=dest_table,
                    write_disposition="WRITE_TRUNCATE",
                    create_disposition="CREATE_IF_NEEDED",
                    time_partitioning=TimePartitioning(field=part_col_name),
                    clustering_fields=clustering_fields,
                )
            elif partition_type == "INTEGER":
                qjc = QueryJobConfig(
                    destination=dest_table,
                    write_disposition="WRITE_TRUNCATE",
                    create_disposition="CREATE_IF_NEEDED",
                    range_partitioning=RangePartitioning(PartitionRange(
                        start=200001, end=209912, interval=1),
                                                         field=part_col_name),
                    clustering_fields=clustering_fields,
                )
            else:
                print(partition_type)
                raise Exception(
                    f"Partition column[{part_col_name}] is neither DATE or INTEGER type."
                )
        job = bq_client.query(f"SELECT * FROM temp_1d.{temp_table_name}",
                              job_config=qjc)
        job.result()
Example #7
def __create_test_table(self, table_name, dataset_id):
    table_schema = [
        SchemaField("int_data", "INT64"),
        SchemaField("str_data", "STRING")
    ]
    table_reference = TableReference(dataset_id, table_name)
    test_table = Table(table_reference, table_schema)
    test_table.time_partitioning = TimePartitioning("DAY")
    self.__delete_if_exists(test_table)
    self.GCP_BIGQUERY_CLIENT.create_table(test_table)
    return test_table
Example #8
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does exist,
    it will create a partition in the designated table or append to the same partition.
    This is necessary for storing data that has "RECORD" type fields, which do not conform to a dataframe.
    The data is stored using a JSON file object since that is one of the formats BigQuery accepts.
    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            f'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
Example #9
    def _process_data_for_bigquery(self, data: DataFrame,
                                   output_tablereference: TableReference):
        job_config = LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_APPEND
        job_config.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY, field='date')

        try:
            load_job = self.bigquery.client.load_table_from_dataframe(
                data, output_tablereference, job_config=job_config)

            load_job.result()
        except BadRequest as error:
            print(error.errors)
Example #10
    def setUp(self):
        self.client_mock = Mock()
        self.query_job_mock = Mock(QueryJob)
        self.query_job_mock.priority = "INTERACTIVE"
        self.query_job_mock.create_disposition = None
        self.query_job_mock.write_disposition = None
        self.query_job_mock.time_partitioning = TimePartitioning(
            "DAY", "a", None, None)

        self.query_config = EmsQueryJobConfig(
            destination_project_id="some_destination_project_id",
            destination_dataset="some_dataset",
            destination_table="some_table",
            labels={"label1": "label1_value"})
Example #11
    def setUp(self):
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.schema = [
            SchemaField("person_id", "INT64"),
            SchemaField("first_name", "STRING"),
            SchemaField("last_name", "STRING"),
            SchemaField("algorithm", "STRING")
        ]

        self.ps_api_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable')
        ]

        self.id_match_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable'),
            dict(name='algorithm', type='string', mode='nullable')
        ]

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

        # Create and populate the ps_values site table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = self.ps_values_table_id

        table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                      schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()
Example #12
def create_table(client, table_name, schema, project=None, dataset=None,
                 partitioning_type=None, partitioned_field=None,
                 clustering_fields=None):
    """
    Create Table with Schema
    :param client: BQ Client
    :param table_name: Table name
    :param schema: Table Schema
    :param project: defaults to client.project
    :param dataset: defaults to client.dataset
    :param partitioning_type: either `time` or `range` partitioning
    :param partitioned_field: field name to use for partitioning
    :param clustering_fields: fields to use for clustering
    :return: created table
    """
    partitioning_types = {
        "time" : TimePartitioning(type_= TimePartitioningType.HOUR, field=partitioned_field, require_partition_filter=True),
        "range" : RangePartitioning(range_= PartitionRange(start=0, end=100, interval=10), field=partitioned_field)
    }
    try:
        if project is None:
            project = client.project
        if dataset is None:
            dataset = client.dataset
        logging.info("Project: {}\tDataset: {}\tTable: {}\t\tPartitioning Type:{}".format(project, dataset.dataset_id, table_name, partitioning_type))
        table_id = "{}.{}.{}".format(project, dataset.dataset_id, table_name)
        table = bigquery.Table(table_id, schema=schema)
        if partitioning_type is not None:
            partitioning_type = partitioning_type.lower()
            if partitioning_type == "time":
                logging.info("Table Partitioning: {}".format(partitioning_type))
                schema.append(bigquery.SchemaField("ZONE_PARTITIONTIME","TIMESTAMP"))
                table.schema = schema
                table.time_partitioning = partitioning_types.get(partitioning_type)
            elif partitioning_type == "range":
                table.range_partitioning = partitioning_types.get(partitioning_type)
        if clustering_fields is not None:
            table.clustering_fields = clustering_fields
        client.create_table(table, exists_ok=True)
        table = client.get_table(table)
        logging.info("Table {} created successfully.".format(table_id))
        return table
    except Exception as error:
        raise error
Example #13
    def test_should_create_table_from_table_object(self):
        # given
        table_id = f'{self.dataset_manager.project_id}.{self.dataset_manager.dataset_name}.example_test_table'
        table = Table(table_id,
                      schema=[
                          {
                              "mode": "NULLABLE",
                              "name": "example_field",
                              "type": "STRING"
                          },
                      ])
        table.time_partitioning = TimePartitioning()

        # when
        self.dataset_manager.create_table_from_schema('example_test_table',
                                                      schema=None,
                                                      table=table)

        # then
        self.table_should_exists()
Example #14
    def __create_query_job_mock(self,
                                job_id: str,
                                has_error: bool,
                                created: datetime = datetime.now()):
        error_result = {
            'reason': 'someReason',
            'location': 'query',
            'message': 'error occurred'
        }
        query_job_mock = Mock(QueryJob)
        query_job_mock.job_id = job_id
        query_job_mock.priority = "INTERACTIVE"
        query_job_mock.destination = None
        query_job_mock.query = "SIMPLE QUERY"
        query_job_mock.labels = {"label1": "label1_value"}
        query_job_mock.state = "DONE"
        query_job_mock.create_disposition = None
        query_job_mock.write_disposition = None
        query_job_mock.error_result = error_result if has_error else None
        query_job_mock.created = created
        query_job_mock.time_partitioning = TimePartitioning(
            "DAY", "a", None, None)

        return query_job_mock
Example #15
    def create_table_from_schema(self,
                                 table_id: str,
                                 schema: typing.Union[typing.List[dict], Path,
                                                      None] = None,
                                 table=None):
        from google.cloud.bigquery import Table, TimePartitioning

        if schema and table:
            raise ValueError(
                "You can't provide both schema and table, because the table you provide"
                "should already contain the schema.")
        if not schema and not table:
            raise ValueError("You must provide either schema or table.")

        if isinstance(schema, Path):
            schema = json.loads(schema.read_text())

        if table is None:
            table = Table(table_id, schema=schema)
            table.time_partitioning = TimePartitioning()

        self.logger.info(f'CREATING TABLE FROM SCHEMA: {table.schema}')

        self.bigquery_client.create_table(table)
Example #16
        def parse_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment
            template_context['params']['table_name'] = table_name
            template_context['params']['columns'] = columns
            template_context['params']['parser'] = parser
            template_context['params']['abi'] = abi
            if parser_type == 'log':
                template_context['params']['event_topic'] = abi_to_event_topic(parser['abi'])
            elif parser_type == 'trace':
                template_context['params']['method_selector'] = abi_to_method_selector(parser['abi'])
            template_context['params']['struct_fields'] = create_struct_string_from_schema(schema)
            template_context['params']['parse_all_partitions'] = parse_all_partitions
            client = bigquery.Client()

            # # # Create a temporary table

            dataset_name_temp = 'parse_temp'
            create_dataset(client, dataset_name_temp)
            temp_table_name = 'temp_{table_name}_{milliseconds}'\
                .format(table_name=table_name, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

            temp_table = bigquery.Table(temp_table_ref, schema=read_bigquery_schema_from_dict(schema, parser_type))

            temp_table.description = table_description
            temp_table.time_partitioning = TimePartitioning(field='block_timestamp')
            logging.info('Creating table: ' + json.dumps(temp_table.to_api_repr()))
            temp_table = client.create_table(temp_table)
            assert temp_table.table_id == temp_table_name

            # # # Query to temporary table

            job_config = bigquery.QueryJobConfig()
            job_config.priority = bigquery.QueryPriority.INTERACTIVE
            job_config.destination = temp_table_ref
            sql_template = get_parse_sql_template(parser_type)
            sql = kwargs['task'].render_template('', sql_template, template_context)
            logging.info(sql)
            query_job = client.query(sql, location='US', job_config=job_config)
            submit_bigquery_job(query_job, job_config)
            assert query_job.state == 'DONE'

            # # # Copy / merge to destination

            if parse_all_partitions:
                # Copy temporary table to destination
                copy_job_config = bigquery.CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'
                dest_table_ref = client.dataset(dataset_name, project=parse_destination_dataset_project_id).table(table_name)
                copy_job = client.copy_table(temp_table_ref, dest_table_ref, location='US', job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
                # Need to update the description because the copy above won't respect
                # the description if the destination table already exists
                table = client.get_table(dest_table_ref)
                table.description = table_description
                table = client.update_table(table, ["description"])
                assert table.description == table_description
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = bigquery.QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

                merge_sql_template = get_merge_table_sql_template()
                merge_template_context = template_context.copy()
                merge_template_context['params']['source_table'] = temp_table_name
                merge_template_context['params']['destination_dataset_project_id'] = parse_destination_dataset_project_id
                merge_template_context['params']['destination_dataset_name'] = dataset_name
                merge_template_context['params']['dataset_name_temp'] = dataset_name_temp
                merge_template_context['params']['columns'] = columns
                merge_sql = kwargs['task'].render_template('', merge_sql_template, merge_template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #17
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions or always_load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = bigquery.CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'
                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = bigquery.QueryJobConfig()
                # Finishes faster, query limit for concurrent interactive queries is 50
                merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge/merge_{task}.sql'.
                    format(task=task))
                merge_sql_template = read_file(merge_sql_path)

                merge_template_context = template_context.copy()
                merge_template_context['params'][
                    'source_table'] = temp_table_name
                merge_template_context['params'][
                    'destination_dataset_project_id'] = destination_dataset_project_id
                merge_template_context['params'][
                    'destination_dataset_name'] = dataset_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, merge_template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #18
def bq_time_partitioning(self):
    return TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field=self.field,
        expiration_ms=int(self.expire.milli),
    )
Example #19
        def enrich_task():
            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = bigquery.CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            all_destination_projects = [(destination_dataset_project_id,
                                         dataset_name)]
            if copy_dataset_project_id is not None and len(copy_dataset_project_id) > 0 \
                    and copy_dataset_name is not None and len(copy_dataset_name) > 0:
                all_destination_projects.append(
                    (copy_dataset_project_id, copy_dataset_name))

            for dest_project, dest_dataset_name in all_destination_projects:
                dest_table_name = '{task}'.format(task=task)
                dest_table_ref = client.dataset(
                    dest_dataset_name,
                    project=dest_project).table(dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #20
    def setUp(self):
        self.maxDiff = None
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'{PS_API_VALUES}_{self.hpo_id}'
        self.pii_address_table_id = f'{self.hpo_id}_pii_address'
        self.pii_email_table_id = f'{self.hpo_id}_pii_email'
        self.pii_phone_number_table_id = f'{self.hpo_id}_pii_phone_number'
        self.pii_name_table_id = f'{self.hpo_id}_pii_name'
        self.person_table_id = f'{self.hpo_id}_person'
        self.location_table_id = f'{self.hpo_id}_location'
        self.fq_concept_table = f'{self.project_id}.{self.dataset_id}.concept'

        # Create and populate the ps_values site table

        schema = resources.fields_for(f'{PS_API_VALUES}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.ps_values_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()

        # Create and populate the drc_id_match_table

        schema = resources.fields_for(f'{IDENTITY_MATCH_TABLE}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.id_match_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        populate_query = POPULATE_ID_MATCH.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            id_match_table_id=self.id_match_table_id)
        job = self.client.query(populate_query)
        job.result()

        # Create and populate pii_name, pii_email, pii_phone_number, and pii_address table

        schema = resources.fields_for(f'{PII_NAME}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_name_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        schema = resources.fields_for(f'{PII_EMAIL}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_email_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        schema = resources.fields_for(f'{PII_PHONE_NUMBER}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_phone_number_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        schema = resources.fields_for(f'{PII_ADDRESS}')
        table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.pii_address_table_id}',
            schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table, exists_ok=True)

        person_table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.person_table_id}',
            schema=person_schema)
        person_table = self.client.create_table(person_table, exists_ok=True)

        location_table = Table(
            f'{self.project_id}.{self.dataset_id}.{self.location_table_id}',
            schema=location_schema)
        location_table = self.client.create_table(location_table,
                                                  exists_ok=True)

        concept_table = Table(f'{self.project_id}.{self.dataset_id}.concept',
                              schema=concept_schema)
        concept_table = self.client.create_table(concept_table, exists_ok=True)