class BigQueryWriteIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_write_to_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    _LOGGER.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      _LOGGER.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)

  def create_table(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)

  @attr('IT')
  def test_big_query_write(self):
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'},
    ]
    table_schema = {"fields": [
        {"name": "number", "type": "INTEGER"},
        {"name": "str", "type": "STRING"}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_schema_autodetect(self):
    if self.runner_name == 'TestDataflowRunner':
      self.skipTest('DataflowRunner does not support schema autodetection')

    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
           schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    row_data = {
        'float': 0.33, 'numeric': Decimal('10'), 'bytes':
        base64.b64encode(b'\xab\xac').decode('utf-8'), 'date': '3000-12-31',
        'time': '23:59:59', 'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)'
    }

    input_data = [row_data]
    # add rows with only one key value pair and None values for all other keys
    for key, value in iteritems(row_data):
      input_data.append({key: value})

    table_schema = {"fields": [
        {"name": "float", "type": "FLOAT"},
        {"name": "numeric", "type": "NUMERIC"},
        {"name": "bytes", "type": "BYTES"},
        {"name": "date", "type": "DATE"},
        {"name": "time", "type": "TIME"},
        {"name": "datetime", "type": "DATETIME"},
        {"name": "timestamp", "type": "TIMESTAMP"},
        {"name": "geo", "type": "GEOGRAPHY"}
    ]}

    expected_row = (0.33, Decimal('10'), b'\xab\xac',
                    datetime.date(3000, 12, 31), datetime.time(23, 59, 59),
                    datetime.datetime(2018, 12, 31, 12, 44, 31),
                    datetime.datetime(2018, 12, 31, 12, 44, 31, 744957,
                                      tzinfo=pytz.utc), 'POINT(30 10)',
                   )

    expected_data = [expected_row]

    # add rows with only one key value pair and None values for all other keys
    for i, value in enumerate(expected_row):
      row = [None]*len(expected_row)
      row[i] = value
      expected_data.append(tuple(row))

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query='SELECT float, numeric, bytes, date, time, datetime,'
                  'timestamp, geo FROM %s' % table_id,
            data=expected_data)]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=[(b'xyw', datetime.date(2011, 1, 1),
                   datetime.time(23, 59, 59, 999999), ),
                  (b'abc', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), ),
                  (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                   datetime.time(23, 59, 59), ),
                  (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), )])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'},
        {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'},
        {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip('This test requires BQ Dataflow native source support for ' +
                 'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers),
                  'kms_key': KMS_KEY
                 }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(sys.version_info[0] == 3 and
                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                   'This test still needs to be fixed on Python 3'
                   'TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
Example #3
0
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')

        # Set up PubSub environment.
        from google.cloud import pubsub
        self.pub_client = pubsub.PublisherClient()
        self.pubsub_setup_client = PubSubSetupClient(project=self.project)

        self.input_topic = self.pubsub_setup_client.create_topic(INPUT_TOPIC)
        self.output_topic = self.pubsub_setup_client.create_topic(OUTPUT_TOPIC)

        self.input_sub = self.pubsub_setup_client.create_subscription(
            self.input_topic, INPUT_SUB)
        self.output_sub = self.pubsub_setup_client.create_subscription(
            self.output_topic, OUTPUT_SUB)

        # Set up BigQuery tables
        self.dataset_ref = utils.create_bq_dataset(self.project,
                                                   OUTPUT_DATASET)
        self.bq_wrapper = BigQueryWrapper()
        table_schema = parse_table_schema_from_json(schemas.get_test_schema())

        def _create_table(table_id, schema):
            return self.bq_wrapper.get_or_create_table(
                project_id=self.project,
                dataset_id=self.dataset_ref.dataset_id,
                table_id=table_id,
                schema=schema,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_APPEND')

        self.table_ref = _create_table(OUTPUT_TABLE, table_schema)
 def _setup_temporary_dataset(
     self,
     bq: bigquery_tools.BigQueryWrapper,
     element: 'ReadFromBigQueryRequest'):
   location = bq.get_query_location(
       self._get_project(), element.query, not element.use_standard_sql)
   bq.create_temporary_dataset(self._get_project(), location)
 def _check_for_input_data(self):
   """Checks if a BQ table with input data exists and creates it if not."""
   wrapper = BigQueryWrapper()
   try:
     wrapper.get_table(self.project_id, self.input_dataset, self.input_table)
   except HttpError as exn:
     if exn.status_code == 404:
       self._create_input_data()
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
  def _export_files(
      self,
      bq: bigquery_tools.BigQueryWrapper,
      element: 'ReadFromBigQueryRequest',
      table_reference: TableReference):
    """Runs a BigQuery export job.

    Returns:
      bigquery.TableSchema instance, a list of FileMetadata instances
    """
    job_labels = self._get_bq_metadata().add_additional_bq_job_labels(
        self.bigquery_job_labels)
    export_job_name = bigquery_tools.generate_bq_job_name(
        self._job_name,
        self._source_uuid,
        bigquery_tools.BigQueryJobTypes.EXPORT,
        element.obj_id)
    temp_location = self.options.view_as(GoogleCloudOptions).temp_location
    gcs_location = bigquery_export_destination_uri(
        self.gcs_location,
        temp_location,
        '%s%s' % (self._source_uuid, element.obj_id))
    if self.use_json_exports:
      job_ref = bq.perform_extract_job([gcs_location],
                                       export_job_name,
                                       table_reference,
                                       bigquery_tools.FileFormat.JSON,
                                       project=self._get_project(),
                                       job_labels=job_labels,
                                       include_header=False)
    else:
      job_ref = bq.perform_extract_job([gcs_location],
                                       export_job_name,
                                       table_reference,
                                       bigquery_tools.FileFormat.AVRO,
                                       project=self._get_project(),
                                       include_header=False,
                                       job_labels=job_labels,
                                       use_avro_logical_types=True)
    bq.wait_for_bq_job(job_ref)
    metadata_list = FileSystems.match([gcs_location])[0].metadata_list

    if isinstance(table_reference, ValueProvider):
      table_ref = bigquery_tools.parse_table_reference(
          element.table, project=self._get_project())
    else:
      table_ref = table_reference
    table = bq.get_table(
        table_ref.projectId, table_ref.datasetId, table_ref.tableId)

    return table.schema, metadata_list
Example #8
0
 def _execute_query(self, bq: bigquery_tools.BigQueryWrapper,
                    element: 'ReadFromBigQueryRequest'):
     query_job_name = bigquery_tools.generate_bq_job_name(
         self._job_name, self._source_uuid,
         bigquery_tools.BigQueryJobTypes.QUERY, random.randint(0, 1000))
     job = bq._start_query_job(
         self._get_project(),
         element.query,
         not element.use_standard_sql,
         element.flatten_results,
         job_id=query_job_name,
         kms_key=self.kms_key,
         job_labels=self._get_bq_metadata().add_additional_bq_job_labels(
             self.bigquery_job_labels))
     job_ref = job.jobReference
     bq.wait_for_bq_job(job_ref, max_retries=0)
     return bq._get_temp_table(self._get_project())
Example #9
0
    def setUpClass(cls):
        cls.test_pipeline = TestPipeline(is_integration_test=True)
        cls.args = cls.test_pipeline.get_full_options_as_args()
        cls.runner_name = type(cls.test_pipeline.runner).__name__
        cls.project = cls.test_pipeline.get_option('project')

        cls.bigquery_client = BigQueryWrapper()
        cls.dataset_id = '%s%s%d' % (cls.BIG_QUERY_DATASET_ID,
                                     str(int(time.time())),
                                     random.randint(0, 10000))
        cls.bigquery_client.get_or_create_dataset(cls.project, cls.dataset_id)
        _LOGGER.info("Created dataset %s in project %s", cls.dataset_id,
                     cls.project)
class BigQueryQueryToTableIT(unittest.TestCase):
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        self.output_table = "%s.output_table" % (self.dataset_id)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            logging.debug('Failed to clean up dataset %s' % self.dataset_id)

    def _setup_new_types_env(self):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'bytes': b'xyw=',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc=',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'dec=',
            'date': '3000-12-31',
            'time': '23:59:59.990000'
        }]
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         NEW_TYPES_INPUT_TABLE, table_data)

    @attr('IT')
    def test_big_query_legacy_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]

        gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
        extra_opts = {
            'query': LEGACY_QUERY,
            'output': self.output_table,
            'bq_temp_location': gs_location,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_standard_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'bq_temp_location': gs_location,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    # TODO(BEAM-6660): Enable this test when ready.
    @unittest.skip(
        'This test requires BQ Dataflow native source support for ' +
        'KMS, which is not available yet.')
    @attr('IT')
    def test_big_query_standard_sql_kms_key(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'kms_key': KMS_KEY
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

        table = self.bigquery_client.get_table(self.project, self.dataset_id,
                                               'output_table')
        self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

    @unittest.skipIf(sys.version_info[0] == 3
                     and os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                     'This test still needs to be fixed on Python 3'
                     'TODO: BEAM-6769')
    @attr('IT')
    def test_big_query_new_types(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'bq_temp_location': gs_location,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'},
        {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'},
        {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
Example #12
0
class BigQueryWriteIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_write_to_table_'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        _LOGGER.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            _LOGGER.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)

    def create_table(self, table_name):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'int64'
        table_field.type = 'INT64'
        table_field.mode = 'REQUIRED'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)

    @pytest.mark.it_postcommit
    def test_big_query_write(self):
        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [
            {
                'number': 1,
                'str': 'abc'
            },
            {
                'number': 2,
                'str': 'def'
            },
            {
                'number': 3,
                'str': u'你好'
            },
            {
                'number': 4,
                'str': u'привет'
            },
        ]
        table_schema = {
            "fields": [{
                "name": "number",
                "type": "INTEGER"
            }, {
                "name": "str",
                "type": "STRING"
            }]
        }

        pipeline_verifiers = [
            BigqueryFullResultMatcher(project=self.project,
                                      query="SELECT number, str FROM %s" %
                                      table_id,
                                      data=[(
                                          1,
                                          'abc',
                                      ), (
                                          2,
                                          'def',
                                      ), (
                                          3,
                                          u'你好',
                                      ), (
                                          4,
                                          u'привет',
                                      )])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    @pytest.mark.it_postcommit
    def test_big_query_write_schema_autodetect(self):
        if self.runner_name == 'TestDataflowRunner':
            self.skipTest(
                'DataflowRunner does not support schema autodetection')

        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [
            {
                'number': 1,
                'str': 'abc'
            },
            {
                'number': 2,
                'str': 'def'
            },
        ]

        pipeline_verifiers = [
            BigqueryFullResultMatcher(project=self.project,
                                      query="SELECT number, str FROM %s" %
                                      table_id,
                                      data=[(
                                          1,
                                          'abc',
                                      ), (
                                          2,
                                          'def',
                                      )])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                 temp_file_format=FileFormat.JSON))

    @pytest.mark.it_postcommit
    def test_big_query_write_new_types(self):
        table_name = 'python_new_types_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        row_data = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        input_data = [row_data]
        # add rows with only one key value pair and None values for all other keys
        for key, value in row_data.items():
            input_data.append({key: value})

        table_schema = {
            "fields": [{
                "name": "float",
                "type": "FLOAT"
            }, {
                "name": "numeric",
                "type": "NUMERIC"
            }, {
                "name": "bytes",
                "type": "BYTES"
            }, {
                "name": "date",
                "type": "DATE"
            }, {
                "name": "time",
                "type": "TIME"
            }, {
                "name": "datetime",
                "type": "DATETIME"
            }, {
                "name": "timestamp",
                "type": "TIMESTAMP"
            }, {
                "name": "geo",
                "type": "GEOGRAPHY"
            }]
        }

        expected_row = (
            0.33,
            Decimal('10'),
            b'\xab\xac',
            datetime.date(3000, 12, 31),
            datetime.time(23, 59, 59),
            datetime.datetime(2018, 12, 31, 12, 44, 31),
            datetime.datetime(2018,
                              12,
                              31,
                              12,
                              44,
                              31,
                              744957,
                              tzinfo=pytz.utc),
            'POINT(30 10)',
        )

        expected_data = [expected_row]

        # add rows with only one key value pair and None values for all other keys
        for i, value in enumerate(expected_row):
            row = [None] * len(expected_row)
            row[i] = value
            expected_data.append(tuple(row))

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query='SELECT float, numeric, bytes, date, time, datetime,'
                'timestamp, geo FROM %s' % table_id,
                data=expected_data)
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    @pytest.mark.it_postcommit
    def test_big_query_write_without_schema(self):
        table_name = 'python_no_schema_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{
            'int64': 1,
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'int64': 2,
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'int64': 3,
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        }, {
            'int64': 4,
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # bigquery io expects bytes to be base64 encoded values
        for row in input_data:
            row['bytes'] = base64.b64encode(row['bytes'])

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT int64, bytes, date, time FROM %s" % table_id,
                data=[(
                    1,
                    b'xyw',
                    datetime.date(2011, 1, 1),
                    datetime.time(23, 59, 59, 999999),
                ),
                      (
                          2,
                          b'abc',
                          datetime.date(2000, 1, 1),
                          datetime.time(0, 0, 0),
                      ),
                      (
                          3,
                          b'\xe4\xbd\xa0\xe5\xa5\xbd',
                          datetime.date(3000, 12, 31),
                          datetime.time(23, 59, 59),
                      ),
                      (
                          4,
                          b'\xab\xac\xad',
                          datetime.date(2000, 1, 1),
                          datetime.time(0, 0, 0),
                      )])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 temp_file_format=FileFormat.JSON))

    @pytest.mark.it_postcommit
    def test_big_query_write_insert_errors_reporting(self):
        """
    Test that errors returned by beam.io.WriteToBigQuery
    contain both the failed rows amd the reason for it failing.
    """
        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{
            'number': 1,
            'str': 'some_string',
        }, {
            'number': 2
        }, {
            'number': 3,
            'str': 'some_string',
            'additional_field_str': 'some_string',
        }]

        table_schema = {
            "fields": [{
                "name": "number",
                "type": "INTEGER",
                'mode': 'REQUIRED'
            }, {
                "name": "str",
                "type": "STRING",
                'mode': 'REQUIRED'
            }]
        }

        bq_result_errors = [(
            {
                "number": 2
            },
            [{
                "reason": "invalid",
                "location": "",
                "debugInfo": "",
                "message":
                "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str."
            }],
        ),
                            ({
                                "number": 3,
                                "str": "some_string",
                                "additional_field_str": "some_string"
                            }, [{
                                "reason":
                                "invalid",
                                "location":
                                "additional_field_str",
                                "debugInfo":
                                "",
                                "message":
                                "no such field: additional_field_str."
                            }])]

        pipeline_verifiers = [
            BigqueryFullResultMatcher(project=self.project,
                                      query="SELECT number, str FROM %s" %
                                      table_id,
                                      data=[(1, 'some_string')]),
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            errors = (
                p | 'create' >> beam.Create(input_data)
                | 'write' >> beam.io.WriteToBigQuery(
                    table_id,
                    schema=table_schema,
                    method='STREAMING_INSERTS',
                    insert_retry_strategy='RETRY_NEVER',
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
            )

            assert_that(
                errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS]
                | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])),
                equal_to(bq_result_errors))

    @pytest.mark.it_postcommit
    @parameterized.expand([
        param(file_format=FileFormat.AVRO),
        param(file_format=FileFormat.JSON),
        param(file_format=None),
    ])
    @mock.patch("apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS",
                new=1)
    def test_big_query_write_temp_table_append_schema_update(
            self, file_format):
        """
    Test that nested schema update options and schema relaxation
    are respected when appending to an existing table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and usage of temporary tables.
    """
        table_name = 'python_append_schema_update'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        # bytes, date, time fields are optional and omitted in the test
        # only required and new columns are specified
        table_schema = {
            "fields": [{
                "name": "int64",
                "type": "INT64",
                "mode": "NULLABLE",
            }, {
                "name": "bool",
                "type": "BOOL",
            }, {
                "name":
                "nested_field",
                "type":
                "RECORD",
                "mode":
                "REPEATED",
                "fields": [
                    {
                        "name": "fruit",
                        "type": "STRING",
                        "mode": "NULLABLE"
                    },
                ]
            }]
        }
        input_data = [{
            "int64": 1,
            "bool": True,
            "nested_field": [{
                "fruit": "Apple"
            }]
        }, {
            "bool": False,
            "nested_field": [{
                "fruit": "Mango"
            }]
        }, {
            "int64": None,
            "bool": True,
            "nested_field": [{
                "fruit": "Banana"
            }]
        }]
        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=BigqueryFullResultMatcher(
                project=self.project,
                query="""
            SELECT bytes, date, time, int64, bool, fruit
            FROM {},
            UNNEST(nested_field) as nested_field
            ORDER BY fruit
            """.format(table_id),
                data=[(None, None, None, 1, True,
                       "Apple"), (
                           None, None, None, None, True,
                           "Banana"), (None, None, None, None, False,
                                       "Mango")]))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 max_file_size=1,  # bytes
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 additional_bq_parameters={
                     'schemaUpdateOptions':
                     ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']
                 },
                 temp_file_format=file_format))
Example #13
0
class BigQueryQueryToTableIT(unittest.TestCase):
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        self.output_table = "%s.output_table" % (self.dataset_id)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            _LOGGER.debug('Failed to clean up dataset %s' % self.dataset_id)

    def _setup_new_types_env(self):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59.990000'
        }, {
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # the API Tools bigquery client expects byte values to be base-64 encoded
        # TODO BEAM-4850: upgrade to google-cloud-bigquery which does not require
        # handling the encoding in beam
        for row in table_data:
            row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         NEW_TYPES_INPUT_TABLE, table_data)

    @attr('IT')
    def test_big_query_legacy_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]

        extra_opts = {
            'query': LEGACY_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_standard_sql(self):
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]

        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_standard_sql_kms_key_native(self):
        if isinstance(self.test_pipeline.runner, TestDirectRunner):
            self.skipTest("This test doesn't work on DirectRunner.")
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        kms_key = self.test_pipeline.get_option('kms_key_name')
        self.assertTrue(kms_key)
        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'kms_key': kms_key,
            'native': True,
            'experiments': 'use_legacy_bq_sink',
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

        table = self.bigquery_client.get_table(self.project, self.dataset_id,
                                               'output_table')
        self.assertIsNotNone(table.encryptionConfiguration,
                             'No encryption configuration found: %s' % table)
        self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName)

    @attr('IT')
    def test_big_query_new_types(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'use_json_exports': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_new_types_avro(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

    @attr('IT')
    def test_big_query_new_types_native(self):
        expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
        verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        self._setup_new_types_env()
        extra_opts = {
            'query':
            NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
            'output': self.output_table,
            'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
            'use_standard_sql': False,
            'native': True,
            'use_json_exports': True,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'experiments': 'use_legacy_bq_sink',
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)
Example #14
0
class BigQueryReadIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_read_table_'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        logging.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            logging.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            logging.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)

    def create_table(self, tablename):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'number'
        table_field.type = 'INTEGER'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'str'
        table_field.type = 'STRING'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=tablename),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'number': 1,
            'str': 'abc'
        }, {
            'number': 2,
            'str': 'def'
        }, {
            'number': 3,
            'str': u'你好'
        }, {
            'number': 4,
            'str': u'привет'
        }]
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         tablename, table_data)

    def create_table_new_types(self, table_name):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'float'
        table_field.type = 'FLOAT'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'numeric'
        table_field.type = 'NUMERIC'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'datetime'
        table_field.type = 'DATETIME'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'timestamp'
        table_field.type = 'TIMESTAMP'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'geo'
        table_field.type = 'GEOGRAPHY'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        row_data = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        table_data = [row_data]
        # add rows with only one key value pair and None values for all other keys
        for key, value in iteritems(row_data):
            table_data.append({key: value})

        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         table_name, table_data)

    @attr('IT')
    def test_big_query_read(self):
        table_name = 'python_write_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT number, str FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(
                result,
                equal_to([{
                    'number': 1,
                    'str': 'abc'
                }, {
                    'number': 2,
                    'str': 'def'
                }, {
                    'number': 3,
                    'str': u'你好'
                }, {
                    'number': 4,
                    'str': u'привет'
                }]))

    @attr('IT')
    def test_big_query_read_new_types(self):
        table_name = 'python_new_types'
        self.create_table_new_types(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        expected_row = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        expected_data = [expected_row]

        # add rows with only one key value pair and None values for all other keys
        for key, value in iteritems(expected_row):
            row = {k: None for k in expected_row}
            row[key] = value
            expected_data.append(row)

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT float, numeric, bytes, date, time, datetime,'
                    'timestamp, geo FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(result, equal_to(expected_data))
Example #15
0
class BigQueryWriteIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_write_to_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)

  def create_table(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)

  @attr('IT')
  def test_big_query_write(self):
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'},
    ]
    table_schema = {"fields": [
        {"name": "number", "type": "INTEGER"},
        {"name": "str", "type": "STRING"}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_schema_autodetect(self):
    if self.runner_name == 'TestDataflowRunner':
      self.skipTest('DataflowRunner does not support schema autodetection')

    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
           schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    table_schema = {"fields": [
        {"name": "bytes", "type": "BYTES"},
        {"name": "date", "type": "DATE"},
        {"name": "time", "type": "TIME"}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=[(b'xyw', datetime.date(2011, 1, 1),
                   datetime.time(23, 59, 59, 999999), ),
                  (b'abc', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), ),
                  (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                   datetime.time(23, 59, 59), ),
                  (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), )])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=[(b'xyw', datetime.date(2011, 1, 1),
                   datetime.time(23, 59, 59, 999999), ),
                  (b'abc', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), ),
                  (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                   datetime.time(23, 59, 59), ),
                  (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), )])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
class TestIT(unittest.TestCase):
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')
        self.uuid = str(uuid.uuid4())

        # Set up PubSub environment.
        from google.cloud import pubsub
        self.pub_client = pubsub.PublisherClient()
        self.input_topic = self.pub_client.create_topic(
            self.pub_client.topic_path(self.project, INPUT_TOPIC + self.uuid))
        self.output_topic = self.pub_client.create_topic(
            self.pub_client.topic_path(self.project, OUTPUT_TOPIC + self.uuid))

        self.sub_client = pubsub.SubscriberClient()
        self.input_sub = self.sub_client.create_subscription(
            self.sub_client.subscription_path(self.project,
                                              INPUT_SUB + self.uuid),
            self.input_topic.name)
        self.output_sub = self.sub_client.create_subscription(
            self.sub_client.subscription_path(self.project,
                                              OUTPUT_SUB + self.uuid),
            self.output_topic.name,
            ack_deadline_seconds=60)

        # Set up BigQuery tables
        self.dataset_ref = utils.create_bq_dataset(self.project,
                                                   OUTPUT_DATASET)
        self.bq_wrapper = BigQueryWrapper()
        table_schema = parse_table_schema_from_json(schemas.get_test_schema())

        def _create_table(table_id, schema):
            return self.bq_wrapper.get_or_create_table(
                project_id=self.project,
                dataset_id=self.dataset_ref.dataset_id,
                table_id=table_id,
                schema=schema,
                create_disposition='CREATE_IF_NEEDED',
                write_disposition='WRITE_APPEND')

        self.table_ref = _create_table(OUTPUT_TABLE, table_schema)

    def _inject_numbers(self, topic, num_messages):
        """Inject numbers as test data to PubSub."""

        for n in range(num_messages):
            user = {'name': f'conall_{n}'}
            user_str = json.dumps(user)

            # logging.info(f'Injecting {user_str} to topic {topic.name}')

            msg = PubsubMessage(b'conall_0', {'timestamp': '1608051184000'})
            self.pub_client.publish(self.input_topic.name, msg.data,
                                    **msg.attributes)

    def _cleanup_pubsub(self):
        test_utils.cleanup_subscriptions(self.sub_client,
                                         [self.input_sub, self.output_sub])
        test_utils.cleanup_topics(self.pub_client,
                                  [self.input_topic, self.output_topic])

    @attr('IT')
    def test_pubsub_pipe_it(self):
        # Build expected dataset.
        expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]

        # Set extra options to the pipeline for test purpose
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        pubsub_msg_verifier = PubSubMessageMatcher(self.project,
                                                   self.output_sub.name,
                                                   expected_msg,
                                                   timeout=60 *
                                                   7)  # in seconds

        EXPECTED_BQ_CHECKSUM = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'  # SELECT SHA1(text) FROM `<project>.<dataset>.<table>`
        validation_query = f'SELECT text FROM `{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`'
        bq_sessions_verifier = BigqueryMatcher(self.project, validation_query,
                                               EXPECTED_BQ_CHECKSUM)
        # bq_sessions_verifier

        extra_opts = {
            'bigquery_dataset': self.dataset_ref.dataset_id,
            'bigquery_table': OUTPUT_TABLE,
            'input_subscription': self.input_sub.name,
            'output_topic': self.output_topic.name,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
        }

        # Generate input data and inject to PubSub.
        self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))

        # Cleanup PubSub
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)
class BigQueryWriteIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_write_to_table_'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        _LOGGER.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            _LOGGER.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)

    def create_table(self, table_name):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)

    @pytest.mark.it_postcommit
    def test_big_query_write(self):
        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [
            {
                'number': 1,
                'str': 'abc'
            },
            {
                'number': 2,
                'str': 'def'
            },
            {
                'number': 3,
                'str': u'你好'
            },
            {
                'number': 4,
                'str': u'привет'
            },
        ]
        table_schema = {
            "fields": [{
                "name": "number",
                "type": "INTEGER"
            }, {
                "name": "str",
                "type": "STRING"
            }]
        }

        pipeline_verifiers = [
            BigqueryFullResultMatcher(project=self.project,
                                      query="SELECT number, str FROM %s" %
                                      table_id,
                                      data=[(
                                          1,
                                          'abc',
                                      ), (
                                          2,
                                          'def',
                                      ), (
                                          3,
                                          u'你好',
                                      ), (
                                          4,
                                          u'привет',
                                      )])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    @pytest.mark.it_postcommit
    def test_big_query_write_schema_autodetect(self):
        if self.runner_name == 'TestDataflowRunner':
            self.skipTest(
                'DataflowRunner does not support schema autodetection')

        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [
            {
                'number': 1,
                'str': 'abc'
            },
            {
                'number': 2,
                'str': 'def'
            },
        ]

        pipeline_verifiers = [
            BigqueryFullResultMatcher(project=self.project,
                                      query="SELECT number, str FROM %s" %
                                      table_id,
                                      data=[(
                                          1,
                                          'abc',
                                      ), (
                                          2,
                                          'def',
                                      )])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                 temp_file_format=FileFormat.JSON))

    @pytest.mark.it_postcommit
    def test_big_query_write_new_types(self):
        table_name = 'python_new_types_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        row_data = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        input_data = [row_data]
        # add rows with only one key value pair and None values for all other keys
        for key, value in row_data.items():
            input_data.append({key: value})

        table_schema = {
            "fields": [{
                "name": "float",
                "type": "FLOAT"
            }, {
                "name": "numeric",
                "type": "NUMERIC"
            }, {
                "name": "bytes",
                "type": "BYTES"
            }, {
                "name": "date",
                "type": "DATE"
            }, {
                "name": "time",
                "type": "TIME"
            }, {
                "name": "datetime",
                "type": "DATETIME"
            }, {
                "name": "timestamp",
                "type": "TIMESTAMP"
            }, {
                "name": "geo",
                "type": "GEOGRAPHY"
            }]
        }

        expected_row = (
            0.33,
            Decimal('10'),
            b'\xab\xac',
            datetime.date(3000, 12, 31),
            datetime.time(23, 59, 59),
            datetime.datetime(2018, 12, 31, 12, 44, 31),
            datetime.datetime(2018,
                              12,
                              31,
                              12,
                              44,
                              31,
                              744957,
                              tzinfo=pytz.utc),
            'POINT(30 10)',
        )

        expected_data = [expected_row]

        # add rows with only one key value pair and None values for all other keys
        for i, value in enumerate(expected_row):
            row = [None] * len(expected_row)
            row[i] = value
            expected_data.append(tuple(row))

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query='SELECT float, numeric, bytes, date, time, datetime,'
                'timestamp, geo FROM %s' % table_id,
                data=expected_data)
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    @pytest.mark.it_postcommit
    def test_big_query_write_without_schema(self):
        table_name = 'python_no_schema_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        }, {
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # bigquery io expects bytes to be base64 encoded values
        for row in input_data:
            row['bytes'] = base64.b64encode(row['bytes'])

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT bytes, date, time FROM %s" % table_id,
                data=[(
                    b'xyw',
                    datetime.date(2011, 1, 1),
                    datetime.time(23, 59, 59, 999999),
                ), (
                    b'abc',
                    datetime.date(2000, 1, 1),
                    datetime.time(0, 0, 0),
                ),
                      (
                          b'\xe4\xbd\xa0\xe5\xa5\xbd',
                          datetime.date(3000, 12, 31),
                          datetime.time(23, 59, 59),
                      ),
                      (
                          b'\xab\xac\xad',
                          datetime.date(2000, 1, 1),
                          datetime.time(0, 0, 0),
                      )])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 temp_file_format=FileFormat.JSON))

    @pytest.mark.it_postcommit
    @mock.patch("apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS",
                new=1)
    def test_big_query_write_temp_table_append_schema_update(self):
        """
    Test that schema update options are respected when appending to an existing
    table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and usage of temporary tables.
    """
        table_name = 'python_append_schema_update'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{
            "int64": num,
            "bool": True,
            "nested_field": {
                "fruit": "Apple"
            }
        } for num in range(1, 3)]

        table_schema = {
            "fields": [{
                "name": "int64",
                "type": "INT64"
            }, {
                "name": "bool",
                "type": "BOOL"
            }, {
                "name":
                "nested_field",
                "type":
                "RECORD",
                "mode":
                "REPEATED",
                "fields": [
                    {
                        "name": "fruit",
                        "type": "STRING",
                        "mode": "NULLABLE"
                    },
                ]
            }]
        }

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=BigqueryFullResultMatcher(
                project=self.project,
                query=
                "SELECT bytes, date, time, int64, bool, nested_field.fruit FROM %s"
                % table_id,
                data=[(None, None, None, num, True, "Apple")
                      for num in range(1, 3)]))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 max_file_size=1,  # bytes
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 additional_bq_parameters={
                     'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION']
                 }))
class BigQueryReadIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_read_table_'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        logging.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            logging.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            logging.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)

    def create_table(self, tablename):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'number'
        table_field.type = 'INTEGER'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'str'
        table_field.type = 'STRING'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=tablename),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'number': 1,
            'str': 'abc'
        }, {
            'number': 2,
            'str': 'def'
        }, {
            'number': 3,
            'str': u'你好'
        }, {
            'number': 4,
            'str': u'привет'
        }]
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         tablename, table_data)

    def create_table_new_types(self, table_name):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)
        table_data = [{
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        }, {
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # bigquery client expects base64 encoded bytes
        for row in table_data:
            row['bytes'] = base64.b64encode(row['bytes']).decode('utf-8')
        self.bigquery_client.insert_rows(self.project, self.dataset_id,
                                         table_name, table_data)

    @attr('IT')
    def test_big_query_read(self):
        table_name = 'python_write_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT number, str FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(
                result,
                equal_to([{
                    'number': 1,
                    'str': 'abc'
                }, {
                    'number': 2,
                    'str': 'def'
                }, {
                    'number': 3,
                    'str': u'你好'
                }, {
                    'number': 4,
                    'str': u'привет'
                }]))

    @attr('IT')
    def test_big_query_read_new_types(self):
        table_name = 'python_new_types_table'
        self.create_table_new_types(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        args = self.test_pipeline.get_full_options_as_args()

        expected_data = [{
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        }, {
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # bigquery io returns bytes as base64 encoded values
        for row in expected_data:
            row['bytes'] = base64.b64encode(row['bytes'])

        with beam.Pipeline(argv=args) as p:
            result = (p | 'read' >> beam.io.Read(
                beam.io.BigQuerySource(
                    query='SELECT bytes, date, time FROM `%s`' % table_id,
                    use_standard_sql=True)))
            assert_that(result, equal_to(expected_data))
 def _clean_temporary_dataset(self, bq: bigquery_tools.BigQueryWrapper,
                              element: 'ReadFromBigQueryRequest'):
     bq.clean_up_temporary_dataset(self._get_project())