Example #1
    def test_descriptions(self):
        """
        Test that differences in description are ignored
        when ignore_descriptions=True.
        """
        schema1 = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(
                name="a",
                mode="REQUIRED",
                type="FLOAT64",
                description="Field A",
            ),
            bigquery.TableFieldSchema(
                name="b",
                mode="REQUIRED",
                type="INT64",
            ),
        ])

        schema2 = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(name="a",
                                      mode="REQUIRED",
                                      type="FLOAT64",
                                      description="Field A is for Apple"),
            bigquery.TableFieldSchema(
                name="b",
                mode="REQUIRED",
                type="INT64",
                description="Field B",
            ),
        ])

        self.assertFalse(check_schema_equal(schema1, schema2))
        self.assertTrue(
            check_schema_equal(schema1, schema2, ignore_descriptions=True))
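
The description check that this test exercises can be pictured with a small standalone sketch. This is not Beam's check_schema_equal implementation; the helper name schemas_match is hypothetical and only mirrors the behavior the assertions expect, with the same bigquery message classes used by the tests assumed to be in scope (typically imported as from apache_beam.io.gcp.internal.clients import bigquery):

# Minimal sketch, not Beam's check_schema_equal: walks two schemas in order
# and compares name/mode/type, optionally skipping the description attribute.
# Works for both TableSchema and TableFieldSchema because both expose a
# repeated `fields` attribute.
def schemas_match(left, right, ignore_descriptions=False):
    if len(left.fields) != len(right.fields):
        return False
    for lf, rf in zip(left.fields, right.fields):
        if (lf.name, lf.mode, lf.type) != (rf.name, rf.mode, rf.type):
            return False
        if not ignore_descriptions and lf.description != rf.description:
            return False
        # Nested RECORD fields are compared recursively.
        if not schemas_match(lf, rf, ignore_descriptions):
            return False
    return True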
Example #2
    def test_field_order(self):
        """Test that field order is ignored when ignore_field_order=True."""
        schema1 = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(
                name="a", mode="REQUIRED", type="FLOAT64"),
            bigquery.TableFieldSchema(name="b", mode="REQUIRED", type="INT64"),
        ])

        schema2 = bigquery.TableSchema(fields=list(reversed(schema1.fields)))

        self.assertFalse(check_schema_equal(schema1, schema2))
        self.assertTrue(
            check_schema_equal(schema1, schema2, ignore_field_order=True))
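
Similarly, ignore_field_order=True can be thought of as normalizing field order before comparing. A sketch building on the hypothetical schemas_match helper above (it only normalizes the top-level order, which is all this test needs):

# Sketch only: rebuild both schemas with name-sorted top-level fields, then
# reuse the order-sensitive comparison. `schemas_match` is the hypothetical
# helper from the previous sketch, not Beam's check_schema_equal.
def schemas_match_any_order(left, right, ignore_descriptions=False):
    left_sorted = bigquery.TableSchema(
        fields=sorted(left.fields, key=lambda f: f.name))
    right_sorted = bigquery.TableSchema(
        fields=sorted(right.fields, key=lambda f: f.name))
    return schemas_match(left_sorted, right_sorted, ignore_descriptions)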
Example #3
    def test_simple_schemas(self):
        schema1 = bigquery.TableSchema(fields=[])
        self.assertTrue(check_schema_equal(schema1, schema1))

        schema2 = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(name="a", mode="NULLABLE", type="INT64")
        ])
        self.assertTrue(check_schema_equal(schema2, schema2))
        self.assertFalse(check_schema_equal(schema1, schema2))

        schema3 = bigquery.TableSchema(fields=[
            bigquery.TableFieldSchema(
                name="b",
                mode="REPEATED",
                type="RECORD",
                fields=[
                    bigquery.TableFieldSchema(
                        name="c", mode="REQUIRED", type="BOOL")
                ])
        ])
        self.assertTrue(check_schema_equal(schema3, schema3))
        self.assertFalse(check_schema_equal(schema2, schema3))

    def process(self, element, schema_mod_job_name_prefix):
        destination = element[0]
        temp_table_load_job_reference = element[1]

        if callable(self._additional_bq_parameters):
            additional_parameters = self._additional_bq_parameters(destination)
        elif isinstance(self._additional_bq_parameters, vp.ValueProvider):
            additional_parameters = self._additional_bq_parameters.get()
        else:
            additional_parameters = self._additional_bq_parameters

        # When writing to normal tables, WRITE_TRUNCATE will overwrite the
        # schema, but when writing to a partition, care needs to be taken to
        # update the schema even on WRITE_TRUNCATE.
        if (self._write_disposition not in ('WRITE_TRUNCATE', 'WRITE_APPEND')
                or not additional_parameters
                or not additional_parameters.get("schemaUpdateOptions")):
            # No need to modify schema of destination table
            return

        table_reference = bigquery_tools.parse_table_reference(destination)
        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        try:
            # Check if destination table exists
            destination_table = self._bq_wrapper.get_table(
                project_id=table_reference.projectId,
                dataset_id=table_reference.datasetId,
                table_id=table_reference.tableId)
        except HttpError as exn:
            if exn.status_code == 404:
                # Destination table does not exist, so no need to modify its schema
                # ahead of the copy jobs.
                return
            else:
                raise

        temp_table_load_job = self._bq_wrapper.get_job(
            project=temp_table_load_job_reference.projectId,
            job_id=temp_table_load_job_reference.jobId,
            location=temp_table_load_job_reference.location)
        temp_table_schema = temp_table_load_job.configuration.load.schema

        if bigquery_tools.check_schema_equal(temp_table_schema,
                                             destination_table.schema,
                                             ignore_descriptions=True,
                                             ignore_field_order=True):
            # Destination table schema is already the same as the temp table schema,
            # so no need to run a job to update the destination table schema.
            return

        destination_hash = _bq_uuid(
            '%s:%s.%s' % (table_reference.projectId, table_reference.datasetId,
                          table_reference.tableId))
        uid = _bq_uuid()
        job_name = '%s_%s_%s' % (schema_mod_job_name_prefix, destination_hash,
                                 uid)

        _LOGGER.debug('Triggering schema modification job %s on %s', job_name,
                      table_reference)
        # Trigger potential schema modification by loading zero rows into the
        # destination table with the temporary table schema.
        schema_update_job_reference = self._bq_wrapper.perform_load_job(
            destination=table_reference,
            source_stream=io.BytesIO(),  # file with zero rows
            job_id=job_name,
            schema=temp_table_schema,
            write_disposition='WRITE_APPEND',
            create_disposition='CREATE_NEVER',
            additional_load_parameters=additional_parameters,
            job_labels=self._bq_io_metadata.add_additional_bq_job_labels())
        yield (destination, schema_update_job_reference)
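
Taken together, process only triggers a schema-modification job when the write disposition is WRITE_TRUNCATE or WRITE_APPEND, schemaUpdateOptions were requested, the destination table already exists, and its schema differs from the temp table schema (ignoring descriptions and field order). A condensed restatement of that gate as a standalone predicate; the helper name needs_schema_update_job is hypothetical, and bigquery_tools is assumed to be imported from apache_beam.io.gcp:

# Condensed restatement of the gating logic in process() above. The helper
# name is hypothetical and exists only for illustration; the real checks
# live inline in the method.
def needs_schema_update_job(write_disposition, additional_parameters,
                            destination_exists, temp_table_schema,
                            destination_schema):
    # Schema update options are only honored for WRITE_TRUNCATE/WRITE_APPEND
    # loads, and only when schemaUpdateOptions were actually requested.
    if write_disposition not in ('WRITE_TRUNCATE', 'WRITE_APPEND'):
        return False
    if not additional_parameters or not additional_parameters.get(
            'schemaUpdateOptions'):
        return False
    # A destination table that does not exist yet needs no schema change
    # ahead of the copy jobs.
    if not destination_exists:
        return False
    # If the schemas already match (ignoring descriptions and field order),
    # the zero-row load job would be a no-op.
    return not bigquery_tools.check_schema_equal(
        temp_table_schema,
        destination_schema,
        ignore_descriptions=True,
        ignore_field_order=True)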