def _do_temp_table_based_load(self, rows):
        """Load each stream into a fresh temp table, then copy the temp
        table onto the production table in a single copy job."""
        assert isinstance(rows, dict)

        loaded_tmp_tables = []
        try:
            for stream in rows.keys():
                tmp_table_name = "t_{}_{}".format(
                    self.tables[stream],
                    str(uuid.uuid4()).replace("-", ""))

                job = self._load_to_bq(
                    client=self.client,
                    dataset=self.dataset,
                    table_name=tmp_table_name,
                    table_schema=self.schemas[stream],
                    table_config=self.table_configs.get(stream, {}),
                    key_props=self.key_properties[stream],
                    metadata_columns=self.add_metadata_columns,
                    truncate=True,
                    rows=self.rows[stream])

                loaded_tmp_tables.append((stream, tmp_table_name))

            # Copy each temp table into its production table. Streams already
            # partially loaded in this run must be appended to, not truncated
            # a second time.
            for stream, tmp_table_name in loaded_tmp_tables:
                truncate = (self.truncate
                            and stream not in self.partially_loaded_streams)

                copy_config = CopyJobConfig()
                if truncate:
                    copy_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
                    self.logger.info(
                        f"Copy {tmp_table_name} to {self.tables[stream]} by FULL_TABLE"
                    )
                else:
                    copy_config.write_disposition = WriteDisposition.WRITE_APPEND
                    self.logger.info(
                        f"Copy {tmp_table_name} to {self.tables[stream]} by APPEND"
                    )

                self.client.copy_table(
                    sources=self.dataset.table(tmp_table_name),
                    destination=self.dataset.table(self.tables[stream]),
                    job_config=copy_config).result()

                self.partially_loaded_streams.add(stream)
                self.rows[stream].close()  # erase the file
                self.rows[stream] = TemporaryFile(mode="w+b")

        finally:  # delete temp tables
            for stream, tmp_table_name in loaded_tmp_tables:
                self.client.delete_table(
                    table=self.dataset.table(tmp_table_name),
                    not_found_ok=True)
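
The _load_to_bq helper called above is defined elsewhere in the target. A minimal sketch of what such a loader might look like, assuming the buffered rows are a file object of newline-delimited JSON and glossing over the table_config, key_props, and metadata_columns handling (the body below is an assumption, not the original implementation):

from google.cloud.bigquery import LoadJobConfig, SourceFormat, WriteDisposition

def _load_to_bq(client, dataset, table_name, table_schema, table_config,
                key_props, metadata_columns, truncate, rows):
    # Hypothetical sketch: stream newline-delimited JSON into the table.
    load_config = LoadJobConfig()
    load_config.schema = table_schema
    load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON
    load_config.write_disposition = (WriteDisposition.WRITE_TRUNCATE if truncate
                                     else WriteDisposition.WRITE_APPEND)
    rows.seek(0)  # rewind the temp file before uploading it
    job = client.load_table_from_file(
        rows, dataset.table(table_name), job_config=load_config)
    job.result()  # wait for the load job to finish
    return job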
Example #2
    def copy_table(self):
        # Override the parent method with an alternative approach:
        # if the target table was created by a previous run, drop it first
        self.drop_table_if_exists(self.table_name,
                                  self.schema_name + self.schema_suffix,
                                  self.database_name)
        # Create target table if source table exists
        if self.test_if_table_exists(
                table_name=self.table_name,
                schema_name=self.schema_name,
                project_id=self.database_name,
        ):
            self.log.info("Copying table into temporary dataset!")
            conn = self.dwh_hook.dbconn
            ds_old = conn.get_dataset(self.schema_name)
            ds_new = conn.get_dataset(self.schema_name + self.schema_suffix)
            table_old = conn.get_table(table=TableReference(
                dataset_ref=ds_old, table_id=self.table_name))
            table_new = ds_new.table(self.table_name)
            copy_job = conn.copy_table(
                table_old,
                table_new,
                job_config=CopyJobConfig(write_disposition="WRITE_TRUNCATE"),
            )

            copy_job.result()  # Waits until the job is done
            assert copy_job.state == "DONE", "Unexpected job state: {0}".format(
                copy_job.state)
            self.log.info("Successfully copied {0}!".format(
                copy_job.__dict__["_properties"]["configuration"]["copy"]
                ["destinationTable"]["tableId"]))
Example #3
        def enrich_task():
            client = Client()

            # A temporary table is needed because a query run with
            # writeDisposition=WRITE_TRUNCATE resets field modes to NULLABLE
            # and clears column descriptions on the destination table

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # INTERACTIVE priority finishes faster; the quota for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref
            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            # Copy temporary table to destination
            copy_job_config = CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            dest_table_name = task
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            copy_job = client.copy_table(temp_table_ref,
                                         dest_table_ref,
                                         location='US',
                                         job_config=copy_job_config)
            submit_bigquery_job(copy_job, copy_job_config)
            assert copy_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
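
submit_bigquery_job is a helper defined elsewhere in this DAG. Judging from the call sites, a minimal sketch might simply block on the job and surface failures; the original may also add retries or labels, so treat this as an assumption:

import logging

def submit_bigquery_job(job, job_config):
    # Hypothetical wrapper; job_config is accepted to mirror the call sites.
    try:
        job.result()  # blocks until done, raises on job failure
    except Exception:
        logging.error('Job failed: %s', job.errors)
        raise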
Example #4
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            # A temporary table is needed because a query run with
            # writeDisposition=WRITE_TRUNCATE resets field modes to NULLABLE
            # and clears column descriptions on the destination table

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)
            table = Table(temp_table_ref)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            table.time_partitioning = TimePartitioning(
                field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table.schema = schema

            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = QueryJobConfig()
            # INTERACTIVE priority finishes faster; the quota for concurrent interactive queries is 50
            query_job_config.priority = QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            if load_all_partitions:
                # Copy temporary table to destination
                copy_job_config = CopyJobConfig()
                copy_job_config.write_disposition = 'WRITE_TRUNCATE'

                dest_table_name = task
                dest_table_ref = client.dataset(
                    dataset_name,
                    project=destination_dataset_project_id).table(
                        dest_table_name)
                copy_job = client.copy_table(temp_table_ref,
                                             dest_table_ref,
                                             location='US',
                                             job_config=copy_job_config)
                submit_bigquery_job(copy_job, copy_job_config)
                assert copy_job.state == 'DONE'
            else:
                # Merge
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                merge_job_config = QueryJobConfig()
                # INTERACTIVE priority finishes faster; the quota for concurrent interactive queries is 50
                merge_job_config.priority = QueryPriority.INTERACTIVE

                merge_sql_path = os.path.join(
                    dags_folder,
                    'resources/stages/enrich/sqls/merge_{task}.sql'.format(
                        task=task))
                merge_sql_template = read_file(merge_sql_path)
                template_context['params']['source_table'] = temp_table_name
                merge_sql = kwargs['task'].render_template(
                    '', merge_sql_template, template_context)
                print('Merge sql:')
                print(merge_sql)
                merge_job = client.query(merge_sql,
                                         location='US',
                                         job_config=merge_job_config)
                submit_bigquery_job(merge_job, merge_job_config)
                assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
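
Both enrich_task variants are Airflow callables (the ds argument, **kwargs, and render_template are the Airflow 1.x task API). For context, a sketch of how such a callable might be registered in the surrounding DAG factory; the operator arguments are assumptions, and task, enrich_task, and dag come from the enclosing scope:

from airflow.operators.python_operator import PythonOperator

enrich_operator = PythonOperator(
    task_id='enrich_{task}'.format(task=task),
    python_callable=enrich_task,
    provide_context=True,  # passes ds and **kwargs into the callable
    dag=dag,
)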