class BigQueryWriteIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_write_to_table_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) _LOGGER.info("Created dataset %s in project %s", self.dataset_id, self.project) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: _LOGGER.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project) def create_table(self, table_name): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table( tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=table_name), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) @attr('IT') def test_big_query_write(self): table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'number': 1, 'str': 'abc'}, {'number': 2, 'str': 'def'}, {'number': 3, 'str': u'你好'}, {'number': 4, 'str': u'привет'}, ] table_schema = {"fields": [ {"name": "number", "type": "INTEGER"}, {"name": "str", "type": "STRING"}]} pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT number, str FROM %s" % table_id, data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_schema_autodetect(self): if self.runner_name == 'TestDataflowRunner': self.skipTest('DataflowRunner does not support schema autodetection') table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'number': 1, 'str': 'abc'}, {'number': 2, 'str': 'def'}, ] pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT number, str FROM %s" % table_id, data=[(1, 'abc',), (2, 'def',)])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, 
method=beam.io.WriteToBigQuery.Method.FILE_LOADS, schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_new_types(self): table_name = 'python_new_types_table' table_id = '{}.{}'.format(self.dataset_id, table_name) row_data = { 'float': 0.33, 'numeric': Decimal('10'), 'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'), 'date': '3000-12-31', 'time': '23:59:59', 'datetime': '2018-12-31T12:44:31', 'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)' } input_data = [row_data] # add rows with only one key value pair and None values for all other keys for key, value in iteritems(row_data): input_data.append({key: value}) table_schema = {"fields": [ {"name": "float", "type": "FLOAT"}, {"name": "numeric", "type": "NUMERIC"}, {"name": "bytes", "type": "BYTES"}, {"name": "date", "type": "DATE"}, {"name": "time", "type": "TIME"}, {"name": "datetime", "type": "DATETIME"}, {"name": "timestamp", "type": "TIMESTAMP"}, {"name": "geo", "type": "GEOGRAPHY"} ]} expected_row = (0.33, Decimal('10'), b'\xab\xac', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), datetime.datetime(2018, 12, 31, 12, 44, 31), datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc), 'POINT(30 10)', ) expected_data = [expected_row] # add rows with only one key value pair and None values for all other keys for i, value in enumerate(expected_row): row = [None]*len(expected_row) row[i] = value expected_data.append(tuple(row)) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query='SELECT float, numeric, bytes, date, time, datetime,' 'timestamp, geo FROM %s' % table_id, data=expected_data)] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_without_schema(self): table_name = 'python_no_schema_table' self.create_table(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'}, {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'}, {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31', 'time': '23:59:59'}, {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'} ] # bigquery io expects bytes to be base64 encoded values for row in input_data: row['bytes'] = base64.b64encode(row['bytes']) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT bytes, date, time FROM %s" % table_id, data=[(b'xyw', datetime.date(2011, 1, 1), datetime.time(23, 59, 59, 999999), ), (b'abc', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), ), (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), ), (b'\xab\xac\xad', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), )])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, 
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
class BigQueryQueryToTableIT(unittest.TestCase): def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) self.output_table = "%s.output_table" % (self.dataset_id) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: self.bigquery_client.client.datasets.Delete(request) except HttpError: logging.debug('Failed to clean up dataset %s' % self.dataset_id) def _setup_new_types_env(self): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table( tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=NEW_TYPES_INPUT_TABLE), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) table_data = [ {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'}, {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'}, {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'} ] self.bigquery_client.insert_rows( self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data) @attr('IT') def test_big_query_legacy_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher( project=self.project, query=verify_query, checksum=expected_checksum)] extra_opts = {'query': LEGACY_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': False, 'on_success_matcher': all_of(*pipeline_verifiers)} options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_standard_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher( project=self.project, query=verify_query, checksum=expected_checksum)] extra_opts = {'query': STANDARD_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': True, 'on_success_matcher': all_of(*pipeline_verifiers)} options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) # TODO(BEAM-6660): Enable this test when ready. 
@unittest.skip('This test requires BQ Dataflow native source support for ' + 'KMS, which is not available yet.') @attr('IT') def test_big_query_standard_sql_kms_key(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher( project=self.project, query=verify_query, checksum=expected_checksum)] extra_opts = {'query': STANDARD_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': True, 'on_success_matcher': all_of(*pipeline_verifiers), 'kms_key': KMS_KEY } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) table = self.bigquery_client.get_table( self.project, self.dataset_id, 'output_table') self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName) @unittest.skipIf(sys.version_info[0] == 3 and os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', 'This test still needs to be fixed on Python 3' 'TODO: BEAM-6769') @attr('IT') def test_big_query_new_types(self): expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED) verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher( project=self.project, query=verify_query, checksum=expected_checksum)] self._setup_new_types_env() extra_opts = { 'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE), 'output': self.output_table, 'output_schema': NEW_TYPES_OUTPUT_SCHEMA, 'use_standard_sql': False, 'on_success_matcher': all_of(*pipeline_verifiers)} options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options)
def setUp(self):
  self.test_pipeline = TestPipeline(is_integration_test=True)
  self.project = self.test_pipeline.get_option('project')

  # Set up PubSub environment.
  from google.cloud import pubsub
  self.pub_client = pubsub.PublisherClient()

  self.pubsub_setup_client = PubSubSetupClient(project=self.project)

  self.input_topic = self.pubsub_setup_client.create_topic(INPUT_TOPIC)
  self.output_topic = self.pubsub_setup_client.create_topic(OUTPUT_TOPIC)

  self.input_sub = self.pubsub_setup_client.create_subscription(
      self.input_topic, INPUT_SUB)
  self.output_sub = self.pubsub_setup_client.create_subscription(
      self.output_topic, OUTPUT_SUB)

  # Set up BigQuery tables
  self.dataset_ref = utils.create_bq_dataset(self.project, OUTPUT_DATASET)
  self.bq_wrapper = BigQueryWrapper()
  table_schema = parse_table_schema_from_json(schemas.get_test_schema())

  def _create_table(table_id, schema):
    return self.bq_wrapper.get_or_create_table(
        project_id=self.project,
        dataset_id=self.dataset_ref.dataset_id,
        table_id=table_id,
        schema=schema,
        create_disposition='CREATE_IF_NEEDED',
        write_disposition='WRITE_APPEND')

  self.table_ref = _create_table(OUTPUT_TABLE, table_schema)
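# A matching tearDown is not shown above. A minimal sketch, assuming the
# utils.delete_bq_dataset helper used elsewhere in this section, and assuming
# PubSubSetupClient owns cleanup of the PubSub topics and subscriptions it
# created:
def tearDown(self):
  # Hypothetical cleanup: drop the BigQuery dataset created in setUp.
  utils.delete_bq_dataset(self.project, self.dataset_ref)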
def _setup_temporary_dataset(
    self,
    bq: bigquery_tools.BigQueryWrapper,
    element: 'ReadFromBigQueryRequest'):
  location = bq.get_query_location(
      self._get_project(), element.query, not element.use_standard_sql)
  bq.create_temporary_dataset(self._get_project(), location)
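# For context: the temporary dataset created above is normally dropped once the
# query results have been consumed. A minimal sketch of that pairing, assuming
# BigQueryWrapper.clean_up_temporary_dataset is available (project, query and
# use_legacy_sql are placeholder names, not from the original code):
bq = bigquery_tools.BigQueryWrapper()
location = bq.get_query_location(project, query, use_legacy_sql)
bq.create_temporary_dataset(project, location)
try:
  pass  # run the query and export/read its results
finally:
  bq.clean_up_temporary_dataset(project)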
def _check_for_input_data(self):
  """Checks if a BQ table with input data exists and creates it if not."""
  wrapper = BigQueryWrapper()
  try:
    wrapper.get_table(self.project_id, self.input_dataset, self.input_table)
  except HttpError as exn:
    if exn.status_code == 404:
      self._create_input_data()
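# Hedged usage sketch (the dataset and table names below are placeholders, not
# from the original test): the guard is intended to run once before the
# pipeline is launched.
def setUp(self):
  self.test_pipeline = TestPipeline(is_integration_test=True)
  self.project_id = self.test_pipeline.get_option('project')
  self.input_dataset = 'example_dataset'  # placeholder
  self.input_table = 'example_table'  # placeholder
  self._check_for_input_data()  # creates the input table only if it is missing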
def _export_files(
    self,
    bq: bigquery_tools.BigQueryWrapper,
    element: 'ReadFromBigQueryRequest',
    table_reference: TableReference):
  """Runs a BigQuery export job.

  Returns:
    bigquery.TableSchema instance, a list of FileMetadata instances
  """
  job_labels = self._get_bq_metadata().add_additional_bq_job_labels(
      self.bigquery_job_labels)
  export_job_name = bigquery_tools.generate_bq_job_name(
      self._job_name,
      self._source_uuid,
      bigquery_tools.BigQueryJobTypes.EXPORT,
      element.obj_id)
  temp_location = self.options.view_as(GoogleCloudOptions).temp_location
  gcs_location = bigquery_export_destination_uri(
      self.gcs_location,
      temp_location,
      '%s%s' % (self._source_uuid, element.obj_id))
  if self.use_json_exports:
    job_ref = bq.perform_extract_job([gcs_location],
                                     export_job_name,
                                     table_reference,
                                     bigquery_tools.FileFormat.JSON,
                                     project=self._get_project(),
                                     job_labels=job_labels,
                                     include_header=False)
  else:
    job_ref = bq.perform_extract_job([gcs_location],
                                     export_job_name,
                                     table_reference,
                                     bigquery_tools.FileFormat.AVRO,
                                     project=self._get_project(),
                                     include_header=False,
                                     job_labels=job_labels,
                                     use_avro_logical_types=True)
  bq.wait_for_bq_job(job_ref)
  metadata_list = FileSystems.match([gcs_location])[0].metadata_list

  if isinstance(table_reference, ValueProvider):
    table_ref = bigquery_tools.parse_table_reference(
        element.table, project=self._get_project())
  else:
    table_ref = table_reference

  table = bq.get_table(
      table_ref.projectId, table_ref.datasetId, table_ref.tableId)

  return table.schema, metadata_list
def _execute_query(
    self,
    bq: bigquery_tools.BigQueryWrapper,
    element: 'ReadFromBigQueryRequest'):
  query_job_name = bigquery_tools.generate_bq_job_name(
      self._job_name,
      self._source_uuid,
      bigquery_tools.BigQueryJobTypes.QUERY,
      random.randint(0, 1000))
  job = bq._start_query_job(
      self._get_project(),
      element.query,
      not element.use_standard_sql,
      element.flatten_results,
      job_id=query_job_name,
      kms_key=self.kms_key,
      job_labels=self._get_bq_metadata().add_additional_bq_job_labels(
          self.bigquery_job_labels))
  job_ref = job.jobReference
  bq.wait_for_bq_job(job_ref, max_retries=0)
  return bq._get_temp_table(self._get_project())
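# Taken together, the helpers above sketch the query path of an export-based
# read: create a temporary dataset, materialize the query into a temp table,
# then export that table to files. Illustrative composition only; the real
# caller also handles table-only requests and cleans up the temporary dataset:
bq = bigquery_tools.BigQueryWrapper()
if element.query is not None:
  self._setup_temporary_dataset(bq, element)  # choose location, create dataset
  table_reference = self._execute_query(bq, element)  # query -> temp table
else:
  table_reference = bigquery_tools.parse_table_reference(
      element.table, project=self._get_project())
schema, metadata_list = self._export_files(bq, element, table_reference)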
@classmethod
def setUpClass(cls):
  cls.test_pipeline = TestPipeline(is_integration_test=True)
  cls.args = cls.test_pipeline.get_full_options_as_args()
  cls.runner_name = type(cls.test_pipeline.runner).__name__
  cls.project = cls.test_pipeline.get_option('project')

  cls.bigquery_client = BigQueryWrapper()
  cls.dataset_id = '%s%s%d' % (
      cls.BIG_QUERY_DATASET_ID,
      str(int(time.time())),
      random.randint(0, 10000))
  cls.bigquery_client.get_or_create_dataset(cls.project, cls.dataset_id)
  _LOGGER.info(
      "Created dataset %s in project %s", cls.dataset_id, cls.project)
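# The class-level dataset created above needs a matching tearDownClass. A
# sketch modeled on the per-test tearDown methods shown elsewhere in this
# section:
@classmethod
def tearDownClass(cls):
  request = bigquery.BigqueryDatasetsDeleteRequest(
      projectId=cls.project, datasetId=cls.dataset_id, deleteContents=True)
  try:
    _LOGGER.info(
        "Deleting dataset %s in project %s", cls.dataset_id, cls.project)
    cls.bigquery_client.client.datasets.Delete(request)
  except HttpError:
    _LOGGER.debug(
        'Failed to clean up dataset %s in project %s',
        cls.dataset_id,
        cls.project)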
class BigQueryQueryToTableIT(unittest.TestCase): def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) self.output_table = "%s.output_table" % (self.dataset_id) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: self.bigquery_client.client.datasets.Delete(request) except HttpError: logging.debug('Failed to clean up dataset %s' % self.dataset_id) def _setup_new_types_env(self): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table(tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=NEW_TYPES_INPUT_TABLE), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) table_data = [{ 'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999' }, { 'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00' }, { 'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000' }] self.bigquery_client.insert_rows(self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data) @attr('IT') def test_big_query_legacy_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table extra_opts = { 'query': LEGACY_QUERY, 'output': self.output_table, 'bq_temp_location': gs_location, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': False, 'on_success_matcher': all_of(*pipeline_verifiers) } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_standard_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table extra_opts = { 'query': STANDARD_QUERY, 'output': self.output_table, 'bq_temp_location': gs_location, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': True, 'on_success_matcher': all_of(*pipeline_verifiers) } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) # TODO(BEAM-6660): Enable this test when ready. 
@unittest.skip( 'This test requires BQ Dataflow native source support for ' + 'KMS, which is not available yet.') @attr('IT') def test_big_query_standard_sql_kms_key(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] extra_opts = { 'query': STANDARD_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': True, 'on_success_matcher': all_of(*pipeline_verifiers), 'kms_key': KMS_KEY } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) table = self.bigquery_client.get_table(self.project, self.dataset_id, 'output_table') self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName) @unittest.skipIf(sys.version_info[0] == 3 and os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1', 'This test still needs to be fixed on Python 3' 'TODO: BEAM-6769') @attr('IT') def test_big_query_new_types(self): expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED) verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] self._setup_new_types_env() gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table extra_opts = { 'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE), 'output': self.output_table, 'bq_temp_location': gs_location, 'output_schema': NEW_TYPES_OUTPUT_SCHEMA, 'use_standard_sql': False, 'on_success_matcher': all_of(*pipeline_verifiers) } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options)
class BigQueryQueryToTableIT(unittest.TestCase): def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) self.output_table = "%s.output_table" % (self.dataset_id) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: self.bigquery_client.client.datasets.Delete(request) except HttpError: logging.debug('Failed to clean up dataset %s' % self.dataset_id) def _setup_new_types_env(self): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table( tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=NEW_TYPES_INPUT_TABLE), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) table_data = [ {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'}, {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'}, {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'} ] self.bigquery_client.insert_rows( self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data) @attr('IT') def test_big_query_legacy_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher( project=self.project, query=verify_query, checksum=expected_checksum)] extra_opts = {'query': LEGACY_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': False, 'on_success_matcher': all_of(*pipeline_verifiers)} options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_standard_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher( project=self.project, query=verify_query, checksum=expected_checksum)] extra_opts = {'query': STANDARD_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': True, 'on_success_matcher': all_of(*pipeline_verifiers)} options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_new_types(self): expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED) verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher( project=self.project, query=verify_query, checksum=expected_checksum)] self._setup_new_types_env() extra_opts = { 'query': NEW_TYPES_QUERY % 
(self.dataset_id, NEW_TYPES_INPUT_TABLE), 'output': self.output_table, 'output_schema': NEW_TYPES_OUTPUT_SCHEMA, 'use_standard_sql': False, 'on_success_matcher': all_of(*pipeline_verifiers)} options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options)
class BigQueryWriteIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_write_to_table_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) _LOGGER.info("Created dataset %s in project %s", self.dataset_id, self.project) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: _LOGGER.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project) def create_table(self, table_name): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'int64' table_field.type = 'INT64' table_field.mode = 'REQUIRED' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table(tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=table_name), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) @pytest.mark.it_postcommit def test_big_query_write(self): table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ { 'number': 1, 'str': 'abc' }, { 'number': 2, 'str': 'def' }, { 'number': 3, 'str': u'你好' }, { 'number': 4, 'str': u'привет' }, ] table_schema = { "fields": [{ "name": "number", "type": "INTEGER" }, { "name": "str", "type": "STRING" }] } pipeline_verifiers = [ BigqueryFullResultMatcher(project=self.project, query="SELECT number, str FROM %s" % table_id, data=[( 1, 'abc', ), ( 2, 'def', ), ( 3, u'你好', ), ( 4, u'привет', )]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition. 
CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @pytest.mark.it_postcommit def test_big_query_write_schema_autodetect(self): if self.runner_name == 'TestDataflowRunner': self.skipTest( 'DataflowRunner does not support schema autodetection') table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ { 'number': 1, 'str': 'abc' }, { 'number': 2, 'str': 'def' }, ] pipeline_verifiers = [ BigqueryFullResultMatcher(project=self.project, query="SELECT number, str FROM %s" % table_id, data=[( 1, 'abc', ), ( 2, 'def', )]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, method=beam.io.WriteToBigQuery.Method.FILE_LOADS, schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT, create_disposition=beam.io.BigQueryDisposition. CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY, temp_file_format=FileFormat.JSON)) @pytest.mark.it_postcommit def test_big_query_write_new_types(self): table_name = 'python_new_types_table' table_id = '{}.{}'.format(self.dataset_id, table_name) row_data = { 'float': 0.33, 'numeric': Decimal('10'), 'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'), 'date': '3000-12-31', 'time': '23:59:59', 'datetime': '2018-12-31T12:44:31', 'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)' } input_data = [row_data] # add rows with only one key value pair and None values for all other keys for key, value in row_data.items(): input_data.append({key: value}) table_schema = { "fields": [{ "name": "float", "type": "FLOAT" }, { "name": "numeric", "type": "NUMERIC" }, { "name": "bytes", "type": "BYTES" }, { "name": "date", "type": "DATE" }, { "name": "time", "type": "TIME" }, { "name": "datetime", "type": "DATETIME" }, { "name": "timestamp", "type": "TIMESTAMP" }, { "name": "geo", "type": "GEOGRAPHY" }] } expected_row = ( 0.33, Decimal('10'), b'\xab\xac', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), datetime.datetime(2018, 12, 31, 12, 44, 31), datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc), 'POINT(30 10)', ) expected_data = [expected_row] # add rows with only one key value pair and None values for all other keys for i, value in enumerate(expected_row): row = [None] * len(expected_row) row[i] = value expected_data.append(tuple(row)) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query='SELECT float, numeric, bytes, date, time, datetime,' 'timestamp, geo FROM %s' % table_id, data=expected_data) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition. 
CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @pytest.mark.it_postcommit def test_big_query_write_without_schema(self): table_name = 'python_no_schema_table' self.create_table(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [{ 'int64': 1, 'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999' }, { 'int64': 2, 'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00' }, { 'int64': 3, 'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31', 'time': '23:59:59' }, { 'int64': 4, 'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00' }] # bigquery io expects bytes to be base64 encoded values for row in input_data: row['bytes'] = base64.b64encode(row['bytes']) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT int64, bytes, date, time FROM %s" % table_id, data=[( 1, b'xyw', datetime.date(2011, 1, 1), datetime.time(23, 59, 59, 999999), ), ( 2, b'abc', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), ), ( 3, b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), ), ( 4, b'\xab\xac\xad', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), )]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, temp_file_format=FileFormat.JSON)) @pytest.mark.it_postcommit def test_big_query_write_insert_errors_reporting(self): """ Test that errors returned by beam.io.WriteToBigQuery contain both the failed rows amd the reason for it failing. """ table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [{ 'number': 1, 'str': 'some_string', }, { 'number': 2 }, { 'number': 3, 'str': 'some_string', 'additional_field_str': 'some_string', }] table_schema = { "fields": [{ "name": "number", "type": "INTEGER", 'mode': 'REQUIRED' }, { "name": "str", "type": "STRING", 'mode': 'REQUIRED' }] } bq_result_errors = [( { "number": 2 }, [{ "reason": "invalid", "location": "", "debugInfo": "", "message": "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str." }], ), ({ "number": 3, "str": "some_string", "additional_field_str": "some_string" }, [{ "reason": "invalid", "location": "additional_field_str", "debugInfo": "", "message": "no such field: additional_field_str." }])] pipeline_verifiers = [ BigqueryFullResultMatcher(project=self.project, query="SELECT number, str FROM %s" % table_id, data=[(1, 'some_string')]), ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned errors = ( p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, method='STREAMING_INSERTS', insert_retry_strategy='RETRY_NEVER', create_disposition=beam.io.BigQueryDisposition. 
CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) ) assert_that( errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS] | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])), equal_to(bq_result_errors)) @pytest.mark.it_postcommit @parameterized.expand([ param(file_format=FileFormat.AVRO), param(file_format=FileFormat.JSON), param(file_format=None), ]) @mock.patch("apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS", new=1) def test_big_query_write_temp_table_append_schema_update( self, file_format): """ Test that nested schema update options and schema relaxation are respected when appending to an existing table via temporary tables. _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple load jobs and usage of temporary tables. """ table_name = 'python_append_schema_update' self.create_table(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) # bytes, date, time fields are optional and omitted in the test # only required and new columns are specified table_schema = { "fields": [{ "name": "int64", "type": "INT64", "mode": "NULLABLE", }, { "name": "bool", "type": "BOOL", }, { "name": "nested_field", "type": "RECORD", "mode": "REPEATED", "fields": [ { "name": "fruit", "type": "STRING", "mode": "NULLABLE" }, ] }] } input_data = [{ "int64": 1, "bool": True, "nested_field": [{ "fruit": "Apple" }] }, { "bool": False, "nested_field": [{ "fruit": "Mango" }] }, { "int64": None, "bool": True, "nested_field": [{ "fruit": "Banana" }] }] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=BigqueryFullResultMatcher( project=self.project, query=""" SELECT bytes, date, time, int64, bool, fruit FROM {}, UNNEST(nested_field) as nested_field ORDER BY fruit """.format(table_id), data=[(None, None, None, 1, True, "Apple"), ( None, None, None, None, True, "Banana"), (None, None, None, None, False, "Mango")])) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, max_file_size=1, # bytes method=beam.io.WriteToBigQuery.Method.FILE_LOADS, additional_bq_parameters={ 'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'] }, temp_file_format=file_format))
class BigQueryQueryToTableIT(unittest.TestCase): def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) self.output_table = "%s.output_table" % (self.dataset_id) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: self.bigquery_client.client.datasets.Delete(request) except HttpError: _LOGGER.debug('Failed to clean up dataset %s' % self.dataset_id) def _setup_new_types_env(self): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table(tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=NEW_TYPES_INPUT_TABLE), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) table_data = [{ 'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999' }, { 'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00' }, { 'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31', 'time': '23:59:59.990000' }, { 'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00' }] # the API Tools bigquery client expects byte values to be base-64 encoded # TODO BEAM-4850: upgrade to google-cloud-bigquery which does not require # handling the encoding in beam for row in table_data: row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8') self.bigquery_client.insert_rows(self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data) @attr('IT') def test_big_query_legacy_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] extra_opts = { 'query': LEGACY_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': False, 'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS, 'on_success_matcher': all_of(*pipeline_verifiers), } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_standard_sql(self): verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] extra_opts = { 'query': STANDARD_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': True, 'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS, 'on_success_matcher': all_of(*pipeline_verifiers), } options = 
self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_standard_sql_kms_key_native(self): if isinstance(self.test_pipeline.runner, TestDirectRunner): self.skipTest("This test doesn't work on DirectRunner.") verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED) pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] kms_key = self.test_pipeline.get_option('kms_key_name') self.assertTrue(kms_key) extra_opts = { 'query': STANDARD_QUERY, 'output': self.output_table, 'output_schema': DIALECT_OUTPUT_SCHEMA, 'use_standard_sql': True, 'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS, 'on_success_matcher': all_of(*pipeline_verifiers), 'kms_key': kms_key, 'native': True, 'experiments': 'use_legacy_bq_sink', } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) table = self.bigquery_client.get_table(self.project, self.dataset_id, 'output_table') self.assertIsNotNone(table.encryptionConfiguration, 'No encryption configuration found: %s' % table) self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName) @attr('IT') def test_big_query_new_types(self): expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED) verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] self._setup_new_types_env() extra_opts = { 'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE), 'output': self.output_table, 'output_schema': NEW_TYPES_OUTPUT_SCHEMA, 'use_standard_sql': False, 'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS, 'use_json_exports': True, 'on_success_matcher': all_of(*pipeline_verifiers) } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_new_types_avro(self): expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED) verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] self._setup_new_types_env() extra_opts = { 'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE), 'output': self.output_table, 'output_schema': NEW_TYPES_OUTPUT_SCHEMA, 'use_standard_sql': False, 'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS, 'on_success_matcher': all_of(*pipeline_verifiers), } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options) @attr('IT') def test_big_query_new_types_native(self): expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED) verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table pipeline_verifiers = [ PipelineStateMatcher(), BigqueryMatcher(project=self.project, query=verify_query, checksum=expected_checksum) ] self._setup_new_types_env() extra_opts = { 'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE), 'output': self.output_table, 'output_schema': NEW_TYPES_OUTPUT_SCHEMA, 'use_standard_sql': False, 'native': True, 'use_json_exports': True, 'wait_until_finish_duration': 
WAIT_UNTIL_FINISH_DURATION_MS, 'on_success_matcher': all_of(*pipeline_verifiers), 'experiments': 'use_legacy_bq_sink', } options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options)
class BigQueryReadIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_read_table_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) logging.info("Created dataset %s in project %s", self.dataset_id, self.project) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: logging.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: logging.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project) def create_table(self, tablename): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'number' table_field.type = 'INTEGER' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'str' table_field.type = 'STRING' table_schema.fields.append(table_field) table = bigquery.Table(tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=tablename), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) table_data = [{ 'number': 1, 'str': 'abc' }, { 'number': 2, 'str': 'def' }, { 'number': 3, 'str': u'你好' }, { 'number': 4, 'str': u'привет' }] self.bigquery_client.insert_rows(self.project, self.dataset_id, tablename, table_data) def create_table_new_types(self, table_name): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'float' table_field.type = 'FLOAT' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'numeric' table_field.type = 'NUMERIC' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'datetime' table_field.type = 'DATETIME' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'timestamp' table_field.type = 'TIMESTAMP' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'geo' table_field.type = 'GEOGRAPHY' table_schema.fields.append(table_field) table = bigquery.Table(tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=table_name), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) row_data = { 'float': 0.33, 'numeric': Decimal('10'), 'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'), 'date': '3000-12-31', 'time': '23:59:59', 'datetime': 
'2018-12-31T12:44:31', 'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)' } table_data = [row_data] # add rows with only one key value pair and None values for all other keys for key, value in iteritems(row_data): table_data.append({key: value}) self.bigquery_client.insert_rows(self.project, self.dataset_id, table_name, table_data) @attr('IT') def test_big_query_read(self): table_name = 'python_write_table' self.create_table(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) args = self.test_pipeline.get_full_options_as_args() with beam.Pipeline(argv=args) as p: result = (p | 'read' >> beam.io.Read( beam.io.BigQuerySource( query='SELECT number, str FROM `%s`' % table_id, use_standard_sql=True))) assert_that( result, equal_to([{ 'number': 1, 'str': 'abc' }, { 'number': 2, 'str': 'def' }, { 'number': 3, 'str': u'你好' }, { 'number': 4, 'str': u'привет' }])) @attr('IT') def test_big_query_read_new_types(self): table_name = 'python_new_types' self.create_table_new_types(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) args = self.test_pipeline.get_full_options_as_args() expected_row = { 'float': 0.33, 'numeric': Decimal('10'), 'bytes': base64.b64encode(b'\xab\xac'), 'date': '3000-12-31', 'time': '23:59:59', 'datetime': '2018-12-31T12:44:31', 'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)' } expected_data = [expected_row] # add rows with only one key value pair and None values for all other keys for key, value in iteritems(expected_row): row = {k: None for k in expected_row} row[key] = value expected_data.append(row) with beam.Pipeline(argv=args) as p: result = (p | 'read' >> beam.io.Read( beam.io.BigQuerySource( query='SELECT float, numeric, bytes, date, time, datetime,' 'timestamp, geo FROM `%s`' % table_id, use_standard_sql=True))) assert_that(result, equal_to(expected_data))
class BigQueryWriteIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_write_to_table_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) logging.info("Created dataset %s in project %s", self.dataset_id, self.project) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: logging.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: logging.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project) def create_table(self, table_name): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table( tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=table_name), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) @attr('IT') def test_big_query_write(self): table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'number': 1, 'str': 'abc'}, {'number': 2, 'str': 'def'}, {'number': 3, 'str': u'你好'}, {'number': 4, 'str': u'привет'}, ] table_schema = {"fields": [ {"name": "number", "type": "INTEGER"}, {"name": "str", "type": "STRING"}]} pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT number, str FROM %s" % table_id, data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_schema_autodetect(self): if self.runner_name == 'TestDataflowRunner': self.skipTest('DataflowRunner does not support schema autodetection') table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'number': 1, 'str': 'abc'}, {'number': 2, 'str': 'def'}, ] pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT number, str FROM %s" % table_id, data=[(1, 'abc',), (2, 'def',)])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, 
method=beam.io.WriteToBigQuery.Method.FILE_LOADS, schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_new_types(self): table_name = 'python_new_types_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'}, {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'}, {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31', 'time': '23:59:59'}, {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'} ] # bigquery io expects bytes to be base64 encoded values for row in input_data: row['bytes'] = base64.b64encode(row['bytes']) table_schema = {"fields": [ {"name": "bytes", "type": "BYTES"}, {"name": "date", "type": "DATE"}, {"name": "time", "type": "TIME"}]} pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT bytes, date, time FROM %s" % table_id, data=[(b'xyw', datetime.date(2011, 1, 1), datetime.time(23, 59, 59, 999999), ), (b'abc', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), ), (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), ), (b'\xab\xac\xad', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), )])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_without_schema(self): table_name = 'python_no_schema_table' self.create_table(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'}, {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'}, {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31', 'time': '23:59:59'}, {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'} ] # bigquery io expects bytes to be base64 encoded values for row in input_data: row['bytes'] = base64.b64encode(row['bytes']) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT bytes, date, time FROM %s" % table_id, data=[(b'xyw', datetime.date(2011, 1, 1), datetime.time(23, 59, 59, 999999), ), (b'abc', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), ), (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), ), (b'\xab\xac\xad', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), )])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
class TestIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, INPUT_TOPIC + self.uuid))
    self.output_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, OUTPUT_TOPIC + self.uuid))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, INPUT_SUB + self.uuid),
        self.input_topic.name)
    self.output_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, OUTPUT_SUB + self.uuid),
        self.output_topic.name,
        ack_deadline_seconds=60)

    # Set up BigQuery tables.
    self.dataset_ref = utils.create_bq_dataset(self.project, OUTPUT_DATASET)
    self.bq_wrapper = BigQueryWrapper()
    table_schema = parse_table_schema_from_json(schemas.get_test_schema())

    def _create_table(table_id, schema):
      return self.bq_wrapper.get_or_create_table(
          project_id=self.project,
          dataset_id=self.dataset_ref.dataset_id,
          table_id=table_id,
          schema=schema,
          create_disposition='CREATE_IF_NEEDED',
          write_disposition='WRITE_APPEND')

    self.table_ref = _create_table(OUTPUT_TABLE, table_schema)

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    for n in range(num_messages):
      user = {'name': f'conall_{n}'}
      user_str = json.dumps(user)
      # logging.info(f'Injecting {user_str} to topic {topic.name}')
      # NOTE: the payload published here is a fixed message (b'conall_0' with a
      # fixed timestamp attribute), not user_str; test_pubsub_pipe_it expects
      # exactly this payload on the output subscription.
      msg = PubsubMessage(b'conall_0', {'timestamp': '1608051184000'})
      self.pub_client.publish(self.input_topic.name, msg.data, **msg.attributes)

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions(
        self.sub_client, [self.input_sub, self.output_sub])
    test_utils.cleanup_topics(
        self.pub_client, [self.input_topic, self.output_topic])

  @attr('IT')
  def test_pubsub_pipe_it(self):
    # Build expected dataset.
    expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]

    # Set extra options on the pipeline for test purposes.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project, self.output_sub.name, expected_msg,
        timeout=60 * 7)  # in seconds

    # SELECT SHA1(text) FROM `<project>.<dataset>.<table>`
    EXPECTED_BQ_CHECKSUM = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
    validation_query = (
        f'SELECT text FROM '
        f'`{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`')
    bq_sessions_verifier = BigqueryMatcher(
        self.project, validation_query, EXPECTED_BQ_CHECKSUM)
    # bq_sessions_verifier is built but not currently included in
    # on_success_matcher below.

    extra_opts = {
        'bigquery_dataset': self.dataset_ref.dataset_id,
        'bigquery_table': OUTPUT_TABLE,
        'input_subscription': self.input_sub.name,
        'output_topic': self.output_topic.name,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
    }

    # Generate input data and inject to PubSub.
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start the pipeline job by calling the pipeline main function.
    pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))

    # Clean up PubSub and BigQuery resources.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)
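# --- Illustrative sketch (not part of the original tests) ---
# The core of _inject_numbers, reduced to its essentials: PublisherClient
# publish() takes the payload bytes plus message attributes as keyword
# arguments and returns a future. Project and topic names are placeholders.
def _example_publish_with_attributes(project='my-project', topic='my-topic'):
  from google.cloud import pubsub

  publisher = pubsub.PublisherClient()
  topic_path = publisher.topic_path(project, topic)
  future = publisher.publish(
      topic_path, b'conall_0', timestamp='1608051184000')  # attrs as kwargs
  # result() blocks until the publish completes and returns the message id.
  return future.result()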
class BigQueryWriteIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_write_to_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (
        self.BIG_QUERY_DATASET_ID,
        str(int(time.time())),
        random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    _LOGGER.info(
        "Created dataset %s in project %s", self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id, deleteContents=True)
    try:
      _LOGGER.info(
          "Deleting dataset %s in project %s", self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug(
          'Failed to clean up dataset %s in project %s',
          self.dataset_id, self.project)

  def create_table(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)

  @pytest.mark.it_postcommit
  def test_big_query_write(self):
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'},
    ]
    table_schema = {"fields": [
        {"name": "number", "type": "INTEGER"},
        {"name": "str", "type": "STRING"}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc'), (2, 'def'), (3, u'你好'), (4, u'привет')])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @pytest.mark.it_postcommit
  def test_big_query_write_schema_autodetect(self):
    if self.runner_name == 'TestDataflowRunner':
      self.skipTest('DataflowRunner does not support schema autodetection')

    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc'), (2, 'def')])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
           schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
           temp_file_format=FileFormat.JSON))

  @pytest.mark.it_postcommit
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    row_data = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }

    input_data = [row_data]
    # add rows with only one key value pair and None values for all other keys
    for key, value in row_data.items():
      input_data.append({key: value})

    table_schema = {"fields": [
        {"name": "float", "type": "FLOAT"},
        {"name": "numeric", "type": "NUMERIC"},
        {"name": "bytes", "type": "BYTES"},
        {"name": "date", "type": "DATE"},
        {"name": "time", "type": "TIME"},
        {"name": "datetime", "type": "DATETIME"},
        {"name": "timestamp", "type": "TIMESTAMP"},
        {"name": "geo", "type": "GEOGRAPHY"}]}

    expected_row = (
        0.33,
        Decimal('10'),
        b'\xab\xac',
        datetime.date(3000, 12, 31),
        datetime.time(23, 59, 59),
        datetime.datetime(2018, 12, 31, 12, 44, 31),
        datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc),
        'POINT(30 10)',
    )

    expected_data = [expected_row]
    # add rows with only one key value pair and None values for all other keys
    for i, value in enumerate(expected_row):
      row = [None] * len(expected_row)
      row[i] = value
      expected_data.append(tuple(row))

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query='SELECT float, numeric, bytes, date, time, datetime,'
                  'timestamp, geo FROM %s' % table_id,
            data=expected_data)]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @pytest.mark.it_postcommit
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=[
                (b'xyw', datetime.date(2011, 1, 1),
                 datetime.time(23, 59, 59, 999999)),
                (b'abc', datetime.date(2000, 1, 1), datetime.time(0, 0, 0)),
                (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                 datetime.time(23, 59, 59)),
                (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                 datetime.time(0, 0, 0))])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
           temp_file_format=FileFormat.JSON))

  @pytest.mark.it_postcommit
  @mock.patch(
      "apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS", new=1)
  def test_big_query_write_temp_table_append_schema_update(self):
    """
    Test that schema update options are respected when appending to an
    existing table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and usage of temporary tables.
    """
    table_name = 'python_append_schema_update'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [{
        "int64": num,
        "bool": True,
        "nested_field": {"fruit": "Apple"}
    } for num in range(1, 3)]

    table_schema = {"fields": [
        {"name": "int64", "type": "INT64"},
        {"name": "bool", "type": "BOOL"},
        {
            "name": "nested_field",
            "type": "RECORD",
            "mode": "REPEATED",
            "fields": [
                {"name": "fruit", "type": "STRING", "mode": "NULLABLE"},
            ]
        }]}

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time, int64, bool, nested_field.fruit "
                  "FROM %s" % table_id,
            data=[(None, None, None, num, True, "Apple")
                  for num in range(1, 3)]))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
           max_file_size=1,  # bytes
           method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
           additional_bq_parameters={
               'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION']}))
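# --- Illustrative sketch (not part of the original tests) ---
# additional_bq_parameters is passed through to the BigQuery load job
# configuration, so other job options can be set the same way the test above
# sets schemaUpdateOptions; timePartitioning below is one such example option,
# not something this suite uses. The table spec, rows and schema string are
# placeholders.
def _example_additional_bq_parameters_sketch(rows, table_spec):
  import apache_beam as beam

  with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(rows)
        | beam.io.WriteToBigQuery(
            table_spec,
            schema='number:INTEGER,str:STRING',  # placeholder schema string
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            additional_bq_parameters={'timePartitioning': {'type': 'DAY'}}))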
class BigQueryReadIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_read_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (
        self.BIG_QUERY_DATASET_ID,
        str(int(time.time())),
        random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    logging.info(
        "Created dataset %s in project %s", self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id, deleteContents=True)
    try:
      logging.info(
          "Deleting dataset %s in project %s", self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug(
          'Failed to clean up dataset %s in project %s',
          self.dataset_id, self.project)

  def create_table(self, tablename):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'number'
    table_field.type = 'INTEGER'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'str'
    table_field.type = 'STRING'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=tablename),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, tablename, table_data)

  def create_table_new_types(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery client expects base64 encoded bytes
    for row in table_data:
      row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, table_name, table_data)

  @attr('IT')
  def test_big_query_read(self):
    table_name = 'python_write_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      result = (
          p | 'read' >> beam.io.Read(
              beam.io.BigQuerySource(
                  query='SELECT number, str FROM `%s`' % table_id,
                  use_standard_sql=True)))
      assert_that(
          result,
          equal_to([
              {'number': 1, 'str': 'abc'},
              {'number': 2, 'str': 'def'},
              {'number': 3, 'str': u'你好'},
              {'number': 4, 'str': u'привет'}]))

  @attr('IT')
  def test_big_query_read_new_types(self):
    table_name = 'python_new_types_table'
    self.create_table_new_types(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    args = self.test_pipeline.get_full_options_as_args()

    expected_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io returns bytes as base64 encoded values
    for row in expected_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    with beam.Pipeline(argv=args) as p:
      result = (
          p | 'read' >> beam.io.Read(
              beam.io.BigQuerySource(
                  query='SELECT bytes, date, time FROM `%s`' % table_id,
                  use_standard_sql=True)))
      assert_that(result, equal_to(expected_data))
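# --- Illustrative sketch (not part of the original tests) ---
# The same kind of read expressed as a standalone pipeline. The table id is a
# placeholder; BigQuerySource is used here to mirror the tests above (newer
# Beam releases also provide beam.io.ReadFromBigQuery with the same query and
# use_standard_sql arguments).
def _example_read_from_bigquery_sketch(table_id='my_dataset.my_table'):
  import apache_beam as beam

  with beam.Pipeline() as p:
    rows = (
        p
        | 'read' >> beam.io.Read(
            beam.io.BigQuerySource(
                query='SELECT number, str FROM `%s`' % table_id,
                use_standard_sql=True)))
    # Each element is a dict keyed by column name.
    rows | beam.Map(print)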
def _clean_temporary_dataset(
    self,
    bq: bigquery_tools.BigQueryWrapper,
    element: 'ReadFromBigQueryRequest'):
  bq.clean_up_temporary_dataset(self._get_project())
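# --- Illustrative sketch (not part of the original code) ---
# The fragment above is a cleanup callback: after a ReadFromBigQueryRequest has
# been served via an export, the temporary dataset created for that export is
# deleted through BigQueryWrapper. A direct call looks roughly like this; the
# project id is a placeholder, and in the fragment it is resolved from the
# pipeline options via _get_project().
def _example_cleanup_temp_dataset(project_id='my-project'):
  from apache_beam.io.gcp import bigquery_tools

  bq = bigquery_tools.BigQueryWrapper()
  # Deletes the wrapper's temporary dataset in the given project, if present.
  bq.clean_up_temporary_dataset(project_id)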