Example 1
class BigQueryReadIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_read_table_'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        logging.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            logging.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            logging.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)

    def create_table(self, tablename):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'number'
        table_field.type = 'INTEGER'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'str'
        table_field.type = 'STRING'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=tablename),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'number': 1,
            'str': 'abc'
        }, {
            'number': 2,
            'str': 'def'
        }, {
            'number': 3,
            'str': u'你好'
        }, {
            'number': 4,
            'str': u'привет'
        }]
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         tablename, table_data)

    def create_table_new_types(self, table_name):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'float'
        table_field.type = 'FLOAT'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'numeric'
        table_field.type = 'NUMERIC'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'datetime'
        table_field.type = 'DATETIME'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'timestamp'
        table_field.type = 'TIMESTAMP'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'geo'
        table_field.type = 'GEOGRAPHY'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        row_data = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        table_data = [row_data]
        # add rows containing a single key-value pair each; the omitted columns become NULL in BigQuery
        for key, value in iteritems(row_data):
            table_data.append({key: value})

        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         table_name, table_data)

    @attr('IT')
    def test_big_query_read(self):
        table_name = 'python_write_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT number, str FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(
                result,
                equal_to([{
                    'number': 1,
                    'str': 'abc'
                }, {
                    'number': 2,
                    'str': 'def'
                }, {
                    'number': 3,
                    'str': u'你好'
                }, {
                    'number': 4,
                    'str': u'привет'
                }]))

    @attr('IT')
    def test_big_query_read_new_types(self):
        table_name = 'python_new_types'
        self.create_table_new_types(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        expected_row = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        expected_data = [expected_row]

        # add rows with only one key value pair and None values for all other keys
        for key, value in iteritems(expected_row):
            row = {k: None for k in expected_row}
            row[key] = value
            expected_data.append(row)

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT float, numeric, bytes, date, time, datetime, '
                    'timestamp, geo FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(result, equal_to(expected_data))
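
For context, the excerpt above is not self-contained: it assumes module-level imports along the lines of the sketch below. This is a hedged guess rather than the file's actual header; exact module paths vary across Apache Beam releases (for example, BigQueryWrapper has lived in both apache_beam.io.gcp.bigquery and apache_beam.io.gcp.bigquery_tools), and the nose/future dependencies reflect the Python 2/3 era this test targets.

import base64
import logging
import random
import time
import unittest
from decimal import Decimal

import apache_beam as beam
from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper  # older releases: apache_beam.io.gcp.bigquery
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apitools.base.py.exceptions import HttpError
from future.utils import iteritems  # Python 2/3 compatibility helper
from nose.plugins.attrib import attr  # marks the tests as integration tests ('IT')
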
Example 2
class BigQueryQueryToTableIT(unittest.TestCase):
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        self.output_table = "%s.output_table" % (self.dataset_id)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            _LOGGER.debug('Failed to clean up dataset %s', self.dataset_id)

    def _setup_new_types_env(self):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59.990000'
        }, {
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # the API Tools bigquery client expects byte values to be base-64 encoded
        # TODO BEAM-4850: upgrade to google-cloud-bigquery which does not require
        # handling the encoding in beam
        for row in table_data:
            row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         NEW_TYPES_INPUT_TABLE, table_data)

    @attr('IT')
    def test_big_query_legacy_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]

        extra_opts = {
            'query': LEGACY_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_standard_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]

        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_standard_sql_kms_key_native(self):
        if isinstance(self.test_pipeline.runner, TestDirectRunner):
            self.skipTest("This test doesn't work on DirectRunner.")
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        kms_key = self.test_pipeline.get_option('kms_key_name')
        self.assertTrue(kms_key)
        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'kms_key': kms_key,
            'native': True,
            'experiments': 'use_legacy_bq_sink',
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

        table = self.bigquery_client.get_table(self.project, self.dataset_id,
                                               'output_table')
        self.assertIsNotNone(table.encryptionConfiguration,
                             'No encryption configuration found: %s' % table)
        self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName)

    @attr('IT')
    def test_big_query_new_types(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'use_json_exports': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_new_types_avro(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_new_types_native(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'native': True,
            'use_json_exports': True,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'experiments': 'use_legacy_bq_sink',
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)
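
The BigQueryQueryToTableIT class above also relies on module-level names that the excerpt omits: the queries under test, the output schemas, the expected results checked by BigqueryMatcher, and the big_query_query_to_table_pipeline module it drives. The sketch below only illustrates the shape of those constants; every value is a hypothetical placeholder, not the actual Apache Beam definition.

import logging

_LOGGER = logging.getLogger(__name__)

# Hypothetical placeholder values; the real definitions live alongside the test.
BIG_QUERY_DATASET_ID = 'python_query_to_table_'
NEW_TYPES_INPUT_TABLE = 'python_new_types_table'
WAIT_UNTIL_FINISH_DURATION_MS = 15 * 60 * 1000  # allow the job up to 15 minutes

# Legacy SQL references tables as [dataset.table]; standard SQL uses `dataset.table`.
LEGACY_QUERY = 'SELECT * FROM (SELECT "apple" AS fruit), (SELECT "orange" AS fruit)'
STANDARD_QUERY = ('SELECT * FROM (SELECT "apple" AS fruit) '
                  'UNION ALL (SELECT "orange" AS fruit)')
NEW_TYPES_QUERY = 'SELECT bytes, date, time FROM [%s.%s]'

# Output schemas plus the verification queries and rows used by the matchers.
DIALECT_OUTPUT_SCHEMA = '{"fields": [{"name": "fruit", "type": "STRING"}]}'
DIALECT_OUTPUT_VERIFY_QUERY = 'SELECT fruit FROM `%s`'
DIALECT_OUTPUT_EXPECTED = [(u'apple',), (u'orange',)]
NEW_TYPES_OUTPUT_SCHEMA = (
    '{"fields": [{"name": "bytes", "type": "BYTES"}, '
    '{"name": "date", "type": "DATE"}, {"name": "time", "type": "TIME"}]}')
NEW_TYPES_OUTPUT_VERIFY_QUERY = 'SELECT bytes, date, time FROM `%s`'
NEW_TYPES_OUTPUT_EXPECTED = []  # the rows the verify query is expected to return
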
Example 3
class BigQueryQueryToTableIT(unittest.TestCase):
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        self.output_table = "%s.output_table" % (self.dataset_id)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            logging.debug('Failed to clean up dataset %s', self.dataset_id)

    def _setup_new_types_env(self):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'bytes': b'xyw=',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc=',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'dec=',
            'date': '3000-12-31',
            'time': '23:59:59.990000'
        }]
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         NEW_TYPES_INPUT_TABLE, table_data)

    @attr('IT')
    def test_big_query_legacy_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]

        gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
        extra_opts = {
            'query': LEGACY_QUERY,
            'output': self.output_table,
            'bq_temp_location': gs_location,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_standard_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'bq_temp_location': gs_location,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    # TODO(BEAM-6660): Enable this test when ready.
    @unittest.skip(
        'This test requires BQ Dataflow native source support for ' +
        'KMS, which is not available yet.')
    @attr('IT')
    def test_big_query_standard_sql_kms_key(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'kms_key': KMS_KEY
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

        table = self.bigquery_client.get_table(self.project, self.dataset_id,
                                               'output_table')
        self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

    @unittest.skipIf(sys.version_info[0] == 3
                     and os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                     'This test still needs to be fixed on Python 3. '
                     'TODO: BEAM-6769')
    @attr('IT')
    def test_big_query_new_types(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'bq_temp_location': gs_location,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)
Example 4
class BigQueryReadIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_read_table_'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        logging.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            logging.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            logging.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)

    def create_table(self, tablename):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'number'
        table_field.type = 'INTEGER'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'str'
        table_field.type = 'STRING'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=tablename),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'number': 1,
            'str': 'abc'
        }, {
            'number': 2,
            'str': 'def'
        }, {
            'number': 3,
            'str': u'你好'
        }, {
            'number': 4,
            'str': u'привет'
        }]
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         tablename, table_data)

    def create_table_new_types(self, table_name):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        }, {
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # bigquery client expects base64 encoded bytes
        for row in table_data:
            row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         table_name, table_data)

    @attr('IT')
    def test_big_query_read(self):
        table_name = 'python_write_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT number, str FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(
                result,
                equal_to([{
                    'number': 1,
                    'str': 'abc'
                }, {
                    'number': 2,
                    'str': 'def'
                }, {
                    'number': 3,
                    'str': u'你好'
                }, {
                    'number': 4,
                    'str': u'привет'
                }]))

    @attr('IT')
    def test_big_query_read_new_types(self):
        table_name = 'python_new_types_table'
        self.create_table_new_types(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        expected_data = [{
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        }, {
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # bigquery io returns bytes as base64 encoded values
        for row in expected_data:
            row['bytes'] = base64.b64encode(row['bytes'])

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT bytes, date, time FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(result, equal_to(expected_data))
Example 5
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'},
        {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'},
        {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
Example 6
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'},
        {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'},
        {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip('This test requires BQ Dataflow native source support for ' +
                 'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers),
                  'kms_key': KMS_KEY
                 }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(sys.version_info[0] == 3 and
                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                   'This test still needs to be fixed on Python 3. '
                   'TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)