Code Example #1
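Writes the same input to four tables, choosing the destination per element with a callable `table` argument. The second write sets max_file_size=20 and max_files_per_bundle=-1 to force many small load files per destination.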
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    output_table_3 = '%s%s' % (self.output_table, 3)
    output_table_4 = '%s%s' % (self.output_table, 4)
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_3,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_4,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      # Gather all input on the same machine
      input = (input
               | beam.Map(lambda x: (None, x))
               | beam.GroupByKey()
               | beam.FlatMap(lambda elm: elm[1]))

      _ = (input |
           "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_1
                                if 'language' in x
                                else output_table_2),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

      _ = (input |
           "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_3
                                if 'language' in x
                                else output_table_4),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
               max_file_size=20,
               max_files_per_bundle=-1))
Code Example #2
File: bigquery_test.py Project: team-kaiware/beam
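Passes the destination table (and, in the first write, the schema) as StaticValueProvider objects. BigQueryTableMatcher checks that the additional_bq_parameters (day partitioning and clustering) were applied for both the STREAMING_INSERTS and FILE_LOADS paths.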
  def test_value_provider_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    schema = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    additional_bq_parameters = {
        'timePartitioning': {'type': 'DAY'},
        'clustering': {'fields': ['language']}}

    table_ref = bigquery_tools.parse_table_reference(output_table_1)
    table_ref2 = bigquery_tools.parse_table_reference(output_table_2)

    pipeline_verifiers = [
        BigQueryTableMatcher(
            project=self.project,
            dataset=table_ref.datasetId,
            table=table_ref.tableId,
            expected_properties=additional_bq_parameters),
        BigQueryTableMatcher(
            project=self.project,
            dataset=table_ref2.datasetId,
            table=table_ref2.tableId,
            expected_properties=additional_bq_parameters),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_2,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

      _ = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_1)),
               schema=value_provider.StaticValueProvider(dict, schema),
               additional_bq_parameters=additional_bq_parameters,
               method='STREAMING_INSERTS'))
      _ = (input
           | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_2)),
               schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
               additional_bq_parameters=lambda _: additional_bq_parameters,
               method='FILE_LOADS'))
Code Example #3
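Checks the all-or-nothing behavior of load jobs: the pipeline is expected to fail (no schema and no SCHEMA_AUTODETECT is given), and the matchers then verify that neither destination table received any rows, so a rerun cannot re-insert records.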
  def test_one_job_fails_all_jobs_fail(self):

    # If one of the import jobs fails, then other jobs must not be performed.
    # This is to avoid reinsertion of some records when a pipeline fails and
    # is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project,
        self.dataset_id,
        output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None,
        None)
    self.bigquery_client.get_or_create_table(
        self.project,
        self.dataset_id,
        output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None,
        None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      # The pipeline below fails because neither a schema nor SCHEMA_AUTODETECT
      # is specified.
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (
            input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                table=lambda x:
                (output_table_1 if 'language' in x else output_table_2),
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                temp_file_format=bigquery_tools.FileFormat.JSON))

    hamcrest_assert(p, all_of(*pipeline_verifiers))
Code Example #4
File: bigquery_test.py Project: xsm110/beam
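Streams to two destinations with per-destination schemas chosen by callables, and asserts that the deliberately malformed bad_record shows up on the FAILED_ROWS output rather than aborting the pipeline.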
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    full_output_table_1 = '%s:%s' % (self.project, output_table_1)
    full_output_table_2 = '%s:%s' % (self.project, output_table_2)

    schema1 = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}
    schema2 = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    bad_record = {'language': 1, 'manguage': 2}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      input2 = p | "Broken record" >> beam.Create([bad_record])

      input = (input, input2) | beam.Flatten()

      r = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=lambda x: (full_output_table_1
                                if 'language' in x
                                else full_output_table_2),
               schema=lambda dest: (schema1
                                    if dest == full_output_table_1
                                    else schema2),
               method='STREAMING_INSERTS'))

      assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                  equal_to([(full_output_table_1, bad_record)]))
Code Example #5
File: bigquery_test.py Project: l2pg/beam_moremmr
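A simpler ValueProvider variant: both writes read their table name from a StaticValueProvider; the first also wraps its schema in one and streams, while the second uses FILE_LOADS without an explicit schema.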
    def test_value_provider_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)
        schema = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_2,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(
                [row for row in _ELEMENTS if 'language' in row])

            _ = (input
                 | "WriteWithMultipleDests" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=value_provider.StaticValueProvider(
                         str, output_table_1),
                     schema=value_provider.StaticValueProvider(dict, schema),
                     method='STREAMING_INSERTS'))
            _ = (input
                 | "WriteWithMultipleDests2" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=value_provider.StaticValueProvider(
                         str, output_table_2),
                     method='FILE_LOADS'))
Code Example #6
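Round-trips the newer BigQuery column types (NUMERIC, BYTES, DATE, TIME, DATETIME, TIMESTAMP, GEOGRAPHY) alongside FLOAT, including rows where every field but one is NULL, and compares against the typed Python values expected on read.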
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    row_data = {
        'float': 0.33,
        'numeric': Decimal('10'),
        'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
        'date': '3000-12-31',
        'time': '23:59:59',
        'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC',
        'geo': 'POINT(30 10)'
    }

    input_data = [row_data]
    # add rows with only one key value pair and None values for all other keys
    for key, value in iteritems(row_data):
      input_data.append({key: value})

    table_schema = {"fields": [
        {"name": "float", "type": "FLOAT"},
        {"name": "numeric", "type": "NUMERIC"},
        {"name": "bytes", "type": "BYTES"},
        {"name": "date", "type": "DATE"},
        {"name": "time", "type": "TIME"},
        {"name": "datetime", "type": "DATETIME"},
        {"name": "timestamp", "type": "TIMESTAMP"},
        {"name": "geo", "type": "GEOGRAPHY"}
    ]}

    expected_row = (0.33, Decimal('10'), b'\xab\xac',
                    datetime.date(3000, 12, 31), datetime.time(23, 59, 59),
                    datetime.datetime(2018, 12, 31, 12, 44, 31),
                    datetime.datetime(2018, 12, 31, 12, 44, 31, 744957,
                                      tzinfo=pytz.utc), 'POINT(30 10)',
                   )

    expected_data = [expected_row]

    # add rows with only one key value pair and None values for all other keys
    for i, value in enumerate(expected_row):
      row = [None]*len(expected_row)
      row[i] = value
      expected_data.append(tuple(row))

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query='SELECT float, numeric, bytes, date, time, datetime,'
                  'timestamp, geo FROM %s' % table_id,
            data=expected_data)]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
Code Example #7
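Writes via FILE_LOADS with schema=SCHEMA_AUTODETECT and no explicit schema. The test is skipped on TestDataflowRunner, which does not support schema autodetection.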
  def test_big_query_write_schema_autodetect(self):
    if self.runner_name == 'TestDataflowRunner':
      self.skipTest('DataflowRunner does not support schema autodetection')

    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
           schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
Code Example #8
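The basic write path: fixed schema, CREATE_IF_NEEDED and WRITE_EMPTY dispositions, and non-ASCII strings to confirm that Unicode survives the round trip.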
  def test_big_query_write(self):
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'},
    ]
    table_schema = {"fields": [
        {"name": "number", "type": "INTEGER"},
        {"name": "str", "type": "STRING"}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
Code Example #9
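Appends to a pre-created table without supplying a schema, using JSON temp files; bytes values are base64-encoded up front, since the BigQuery sink expects them that way.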
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'},
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=[(b'xyw', datetime.date(2011, 1, 1),
                   datetime.time(23, 59, 59, 999999)),
                  (b'abc', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0)),
                  (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                   datetime.time(23, 59, 59)),
                  (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0))])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
              temp_file_format=FileFormat.JSON))
Code Example #10
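Like the previous example, but with an explicit BYTES/DATE/TIME schema, CREATE_IF_NEEDED, and WRITE_EMPTY against a fresh table.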
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    table_schema = {"fields": [
        {"name": "bytes", "type": "BYTES"},
        {"name": "date", "type": "DATE"},
        {"name": "time", "type": "TIME"}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=[(b'xyw', datetime.date(2011, 1, 1),
                   datetime.time(23, 59, 59, 999999), ),
                  (b'abc', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), ),
                  (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                   datetime.time(23, 59, 59), ),
                  (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), )])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
Code Example #11
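Appends rows whose schema adds columns to an existing table, passing schemaUpdateOptions=['ALLOW_FIELD_ADDITION']. As the docstring notes, max_file_size=1 forces multiple load jobs and therefore the temp-table copy path.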
    def test_big_query_write_temp_table_append_schema_update(self):
        """
    Test that schema update options are respected when appending to an existing
    table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and usage of temporary tables.
    """
        table_name = 'python_append_schema_update'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{"int64": 1, "bool": True}, {"int64": 2, "bool": False}]

        table_schema = {
            "fields": [{
                "name": "int64",
                "type": "INT64"
            }, {
                "name": "bool",
                "type": "BOOL"
            }]
        }

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT bytes, date, time, int64, bool FROM %s" %
                table_id,
                data=[(None, None, None, 1, True),
                      (None, None, None, 2, False)]))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 max_file_size=1,  # bytes
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 additional_bq_parameters={
                     'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION']
                 }))
Code Example #12
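The most involved variant: it runs in batch or streaming (via TestStream), resolves both the destination table and its schema from side-input dictionaries, retries transient insert errors, and still routes the bad record to FAILED_ROWS.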
    def test_multiple_destinations_transform(self):
        streaming = self.test_pipeline.options.view_as(
            StandardOptions).streaming
        if streaming and isinstance(self.test_pipeline.runner,
                                    TestDataflowRunner):
            self.skipTest("TestStream is not supported on TestDataflowRunner")

        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)

        full_output_table_1 = '%s:%s' % (self.project, output_table_1)
        full_output_table_2 = '%s:%s' % (self.project, output_table_2)

        schema1 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }
        schema2 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'foundation',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        bad_record = {'language': 1, 'manguage': 2}

        if streaming:
            pipeline_verifiers = [
                PipelineStateMatcher(PipelineState.RUNNING),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]
        else:
            pipeline_verifiers = [
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            if streaming:
                _SIZE = len(_ELEMENTS)
                test_stream = (
                    TestStream()
                    .advance_watermark_to(0)
                    .add_elements(_ELEMENTS[:_SIZE // 2])
                    .advance_watermark_to(100)
                    .add_elements(_ELEMENTS[_SIZE // 2:])
                    .advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)

            schema_table_pcv = beam.pvalue.AsDict(
                p | "MakeSchemas" >> beam.Create([
                    (full_output_table_1, schema1),
                    (full_output_table_2, schema2)]))

            table_record_pcv = beam.pvalue.AsDict(
                p | "MakeTables" >> beam.Create([
                    ('table1', full_output_table_1),
                    ('table2', full_output_table_2)]))

            input2 = p | "Broken record" >> beam.Create([bad_record])

            input = (input, input2) | beam.Flatten()

            r = (input
                 | "WriteWithMultipleDests" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=lambda x, tables: (tables['table1']
                                              if 'language' in x
                                              else tables['table2']),
                     table_side_inputs=(table_record_pcv, ),
                     schema=lambda dest, table_map: table_map.get(dest, None),
                     schema_side_inputs=(schema_table_pcv, ),
                     insert_retry_strategy=(
                         RetryStrategy.RETRY_ON_TRANSIENT_ERROR),
                     method='STREAMING_INSERTS'))

            assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                        equal_to([(full_output_table_1, bad_record)]))
Code Example #13
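Four destinations driven by side inputs: the first write looks its table up in a side-input dict and the second uses a plain callable, while both resolve schemas through the same side-input schema map.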
    def test_multiple_destinations_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)
        output_table_3 = '%s%s' % (self.output_table, 3)
        output_table_4 = '%s%s' % (self.output_table, 4)
        schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
        schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(
                self.BIG_QUERY_SCHEMA_2))

        schema_kv_pairs = [
            (output_table_1, schema1), (output_table_2, schema2),
            (output_table_3, schema1), (output_table_4, schema2)
        ]
        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, foundation FROM %s" % output_table_2,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_3,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, foundation FROM %s" % output_table_4,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(_ELEMENTS)

            schema_map_pcv = beam.pvalue.AsDict(
                p | "MakeSchemas" >> beam.Create(schema_kv_pairs))

            table_record_pcv = beam.pvalue.AsDict(
                p | "MakeTables" >> beam.Create([('table1', output_table_1),
                                                 ('table2', output_table_2)]))

            # Gather all input on the same machine
            input = (input
                     | beam.Map(lambda x: (None, x))
                     | beam.GroupByKey()
                     | beam.FlatMap(lambda elm: elm[1]))

            _ = (
                input
                | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
                    table=lambda x, tables: (tables['table1']
                                             if 'language' in x
                                             else tables['table2']),
                    table_side_inputs=(table_record_pcv, ),
                    schema=lambda dest, schema_map: schema_map.get(dest, None),
                    schema_side_inputs=(schema_map_pcv, ),
                    create_disposition=(
                        beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                    write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

            _ = (input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                table=lambda x: (output_table_3
                                 if 'language' in x
                                 else output_table_4),
                schema=lambda dest, schema_map: schema_map.get(dest, None),
                schema_side_inputs=(schema_map_pcv, ),
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                max_file_size=20,
                max_files_per_bundle=-1))
Code Example #14
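Parameterized over the temp file format. The appended rows add a repeated RECORD column and relax a previously required column, with schemaUpdateOptions allowing both ALLOW_FIELD_ADDITION and ALLOW_FIELD_RELAXATION.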
    def test_big_query_write_temp_table_append_schema_update(
            self, file_format):
        """
    Test that nested schema update options and schema relaxation
    are respected when appending to an existing table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and usage of temporary tables.
    """
        table_name = 'python_append_schema_update'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        # bytes, date, time fields are optional and omitted in the test
        # only required and new columns are specified
        table_schema = {
            "fields": [{
                "name": "int64",
                "type": "INT64",
                "mode": "NULLABLE",
            }, {
                "name": "bool",
                "type": "BOOL",
            }, {
                "name":
                "nested_field",
                "type":
                "RECORD",
                "mode":
                "REPEATED",
                "fields": [
                    {
                        "name": "fruit",
                        "type": "STRING",
                        "mode": "NULLABLE"
                    },
                ]
            }]
        }
        input_data = [{
            "int64": 1,
            "bool": True,
            "nested_field": [{
                "fruit": "Apple"
            }]
        }, {
            "bool": False,
            "nested_field": [{
                "fruit": "Mango"
            }]
        }, {
            "int64": None,
            "bool": True,
            "nested_field": [{
                "fruit": "Banana"
            }]
        }]
        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=BigqueryFullResultMatcher(
                project=self.project,
                query="""
            SELECT bytes, date, time, int64, bool, fruit
            FROM {},
            UNNEST(nested_field) as nested_field
            ORDER BY fruit
            """.format(table_id),
                data=[(None, None, None, 1, True, "Apple"),
                      (None, None, None, None, True, "Banana"),
                      (None, None, None, None, False, "Mango")]))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 max_file_size=1,  # bytes
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 additional_bq_parameters={
                     'schemaUpdateOptions':
                     ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']
                 },
                 temp_file_format=file_format))
Code Example #15
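Uses STREAMING_INSERTS with the RETRY_NEVER strategy and asserts on FAILED_ROWS_WITH_ERRORS: each failed row arrives paired with BigQuery's reason, which is compared against the expected insert errors.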
    def test_big_query_write_insert_errors_reporting(self):
        """
    Test that errors returned by beam.io.WriteToBigQuery
    contain both the failed rows and the reason for their failure.
    """
        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{
            'number': 1,
            'str': 'some_string',
        }, {
            'number': 2
        }, {
            'number': 3,
            'str': 'some_string',
            'additional_field_str': 'some_string',
        }]

        table_schema = {
            "fields": [{
                "name": "number",
                "type": "INTEGER",
                'mode': 'REQUIRED'
            }, {
                "name": "str",
                "type": "STRING",
                'mode': 'REQUIRED'
            }]
        }

        bq_result_errors = [
            ({"number": 2},
             [{
                 "reason": "invalid",
                 "location": "",
                 "debugInfo": "",
                 "message": (
                     "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str.")
             }]),
            ({"number": 3,
              "str": "some_string",
              "additional_field_str": "some_string"},
             [{
                 "reason": "invalid",
                 "location": "additional_field_str",
                 "debugInfo": "",
                 "message": "no such field: additional_field_str."
             }]),
        ]

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT number, str FROM %s" % table_id,
                data=[(1, 'some_string')]),
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            errors = (
                p | 'create' >> beam.Create(input_data)
                | 'write' >> beam.io.WriteToBigQuery(
                    table_id,
                    schema=table_schema,
                    method='STREAMING_INSERTS',
                    insert_retry_strategy='RETRY_NEVER',
                    create_disposition=(
                        beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
            )

            assert_that(
                errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS]
                | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])),
                equal_to(bq_result_errors))
Code Example #16
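Monkey-patches _MAXIMUM_SOURCE_URIS and _DEFAULT_MAX_FILE_SIZE to force multiple tiny load files, then writes NaN and -inf FLOAT values that JSON temp files cannot represent (BEAM-8841), supplying the schema as a singleton side input to an Avro-format FILE_LOADS write.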
  def test_avro_file_load(self):
    # Construct elements such that they can be written via Avro but not via
    # JSON. See BEAM-8841.
    from apache_beam.io.gcp import bigquery_file_loads
    old_max_files = bigquery_file_loads._MAXIMUM_SOURCE_URIS
    old_max_file_size = bigquery_file_loads._DEFAULT_MAX_FILE_SIZE
    bigquery_file_loads._MAXIMUM_SOURCE_URIS = 1
    bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = 100
    elements = [
        {
            'name': u'Negative infinity',
            'value': -float('inf'),
            'timestamp': datetime.datetime(1970, 1, 1, tzinfo=pytz.utc),
        },
        {
            'name': u'Not a number',
            'value': float('nan'),
            'timestamp': datetime.datetime(2930, 12, 9, tzinfo=pytz.utc),
        },
    ]

    schema = beam.io.gcp.bigquery.WriteToBigQuery.get_dict_table_schema(
        bigquery.TableSchema(
            fields=[
                bigquery.TableFieldSchema(
                    name='name', type='STRING', mode='REQUIRED'),
                bigquery.TableFieldSchema(
                    name='value', type='FLOAT', mode='REQUIRED'),
                bigquery.TableFieldSchema(
                    name='timestamp', type='TIMESTAMP', mode='REQUIRED'),
            ]))

    pipeline_verifiers = [
        # Some gymnastics here to avoid comparing NaN since NaN is not equal to
        # anything, including itself.
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, value, timestamp FROM {} WHERE value<0".format(
                self.output_table),
            data=[(d['name'], d['value'], d['timestamp'])
                  for d in elements[:1]],
        ),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, timestamp FROM {}".format(self.output_table),
            data=[(d['name'], d['timestamp']) for d in elements],
        ),
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
    )

    with beam.Pipeline(argv=args) as p:
      input = p | 'CreateInput' >> beam.Create(elements)
      schema_pc = p | 'CreateSchema' >> beam.Create([schema])

      _ = (
          input
          | 'WriteToBigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
              table='%s:%s' % (self.project, self.output_table),
              schema=lambda _, schema: schema,
              schema_side_inputs=(beam.pvalue.AsSingleton(schema_pc), ),
              method='FILE_LOADS',
              temp_file_format=bigquery_tools.FileFormat.AVRO,
          ))
    bigquery_file_loads._MAXIMUM_SOURCE_URIS = old_max_files
    bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = old_max_file_size
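
All sixteen examples specialize the same core transform. As a minimal sketch of the pattern they build on (not taken from the test suite: the project, dataset, and table names here are hypothetical, and the compact 'field:TYPE,...' schema string is one of the forms WriteToBigQuery accepts):

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      # Rows are plain dicts; keys must match the schema field names.
      | beam.Create([{'name': 'beam', 'language': 'py'}])
      | beam.io.WriteToBigQuery(
          'my_project:my_dataset.my_table',  # hypothetical destination
          schema='name:STRING,language:STRING',
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

Running this requires GCP credentials and a reachable BigQuery dataset; the integration tests above additionally pass on_success_matcher through get_full_options_as_args so the written data is verified after the pipeline finishes.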