Example #1
  def test_value_provider_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    schema = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    additional_bq_parameters = {
        'timePartitioning': {'type': 'DAY'},
        'clustering': {'fields': ['language']}}

    table_ref = bigquery_tools.parse_table_reference(output_table_1)
    table_ref2 = bigquery_tools.parse_table_reference(output_table_2)

    pipeline_verifiers = [
        BigQueryTableMatcher(
            project=self.project,
            dataset=table_ref.datasetId,
            table=table_ref.tableId,
            expected_properties=additional_bq_parameters),
        BigQueryTableMatcher(
            project=self.project,
            dataset=table_ref2.datasetId,
            table=table_ref2.tableId,
            expected_properties=additional_bq_parameters),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_2,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

      _ = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_1)),
               schema=value_provider.StaticValueProvider(dict, schema),
               additional_bq_parameters=additional_bq_parameters,
               method='STREAMING_INSERTS'))
      _ = (input
           | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_2)),
               schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
               additional_bq_parameters=lambda _: additional_bq_parameters,
               method='FILE_LOADS'))
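
Stripped of the test harness, the pattern in Example #1 reduces to wrapping the table name (and optionally the schema) in a StaticValueProvider before handing it to WriteToBigQuery. The following is a minimal sketch, not taken from the examples above: the project, dataset, and table names are hypothetical placeholders, and the pipeline assumes BigQuery credentials are available at runtime.

import apache_beam as beam
from apache_beam.options import value_provider

# Hypothetical destination and schema; substitute a table you can write to.
table_vp = value_provider.StaticValueProvider(
    str, 'my-project:my_dataset.my_table')
schema = {'fields': [{'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'}]}

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([{'name': 'beam'}])
      | beam.io.WriteToBigQuery(
          table=table_vp,
          schema=value_provider.StaticValueProvider(dict, schema),
          method='STREAMING_INSERTS'))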
Example #2
    def test_value_provider_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)
        schema = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_2,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(
                [row for row in _ELEMENTS if 'language' in row])

            _ = (input
                 | "WriteWithMultipleDests" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=value_provider.StaticValueProvider(
                         str, output_table_1),
                     schema=value_provider.StaticValueProvider(dict, schema),
                     method='STREAMING_INSERTS'))
            _ = (input
                 | "WriteWithMultipleDests2" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=value_provider.StaticValueProvider(
                         str, output_table_2),
                     method='FILE_LOADS'))
Example #3
def _value_provider_or_static_val(elm):
    if isinstance(elm, value_provider.ValueProvider):
        return elm
    else:
        # The type argument is a NoOp, because we assume the argument already has
        # the proper formatting.
        return value_provider.StaticValueProvider(lambda x: x, value=elm)
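
To make the helper's behavior concrete, here is a small, self-contained sketch; the helper is restated verbatim so the snippet runs on its own, and the table names are hypothetical. Plain values are wrapped in a StaticValueProvider whose "type" argument is the identity function, while values that are already ValueProviders are returned untouched.

from apache_beam.options import value_provider

def _value_provider_or_static_val(elm):
    if isinstance(elm, value_provider.ValueProvider):
        return elm
    else:
        # Identity "type" conversion: the value is assumed to be formatted already.
        return value_provider.StaticValueProvider(lambda x: x, value=elm)

# A plain string gets wrapped; .get() returns it unchanged.
wrapped = _value_provider_or_static_val('my-project:my_dataset.my_table')
print(wrapped.get())  # my-project:my_dataset.my_table

# An existing ValueProvider passes through as the same object.
already_vp = value_provider.StaticValueProvider(str, 'another_table')
print(_value_provider_or_static_val(already_vp) is already_vp)  # True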
Example #4
  def __init__(
      self,
      destination,
      schema=None,
      custom_gcs_temp_location=None,
      create_disposition=None,
      write_disposition=None,
      triggering_frequency=None,
      with_auto_sharding=False,
      temp_file_format=None,
      max_file_size=None,
      max_files_per_bundle=None,
      max_partition_size=None,
      max_files_per_partition=None,
      additional_bq_parameters=None,
      table_side_inputs=None,
      schema_side_inputs=None,
      test_client=None,
      validate=True,
      is_streaming_pipeline=False,
      load_job_project_id=None):
    self.destination = destination
    self.create_disposition = create_disposition
    self.write_disposition = write_disposition
    self.triggering_frequency = triggering_frequency
    self.with_auto_sharding = with_auto_sharding
    self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
    self.max_files_per_bundle = (
        max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
    self.max_partition_size = max_partition_size or _MAXIMUM_LOAD_SIZE
    self.max_files_per_partition = (
        max_files_per_partition or _MAXIMUM_SOURCE_URIS)
    if (isinstance(custom_gcs_temp_location, str) or
        custom_gcs_temp_location is None):
      self._custom_gcs_temp_location = vp.StaticValueProvider(
          str, custom_gcs_temp_location or '')
    elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
      self._custom_gcs_temp_location = custom_gcs_temp_location
    else:
      raise ValueError('custom_gcs_temp_location must be str or ValueProvider')

    self.test_client = test_client
    self.schema = schema
    self._temp_file_format = temp_file_format or bigquery_tools.FileFormat.JSON

    # If we have multiple destinations, then we will have multiple load jobs,
    # thus we will need temporary tables for atomicity.
    self.dynamic_destinations = bool(callable(destination))

    self.additional_bq_parameters = additional_bq_parameters or {}
    self.table_side_inputs = table_side_inputs or ()
    self.schema_side_inputs = schema_side_inputs or ()

    self.is_streaming_pipeline = is_streaming_pipeline
    self.load_job_project_id = load_job_project_id
    self._validate = validate
    if self._validate:
      self.verify()
Example #5
    def __init__(self,
                 destination,
                 schema=None,
                 custom_gcs_temp_location=None,
                 create_disposition=None,
                 write_disposition=None,
                 triggering_frequency=None,
                 coder=None,
                 max_file_size=None,
                 max_files_per_bundle=None,
                 additional_bq_parameters=None,
                 table_side_inputs=None,
                 schema_side_inputs=None,
                 test_client=None,
                 validate=True,
                 is_streaming_pipeline=False):
        self.destination = destination
        self.create_disposition = create_disposition
        self.write_disposition = write_disposition
        self.triggering_frequency = triggering_frequency
        self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
        self.max_files_per_bundle = (max_files_per_bundle
                                     or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
        if (isinstance(custom_gcs_temp_location, str)
                or custom_gcs_temp_location is None):
            self._custom_gcs_temp_location = vp.StaticValueProvider(
                str, custom_gcs_temp_location or '')
        elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
            self._custom_gcs_temp_location = custom_gcs_temp_location
        else:
            raise ValueError(
                'custom_gcs_temp_location must be str or ValueProvider')

        self.test_client = test_client
        self.schema = schema
        self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

        # If we have multiple destinations, then we will have multiple load jobs,
        # thus we will need temporary tables for atomicity.
        # If the destination is a single one, we assume that we will have only one
        # job to run - and thus we avoid using temporary tables
        self.temp_tables = True if callable(destination) else False

        self.additional_bq_parameters = additional_bq_parameters or {}
        self.table_side_inputs = table_side_inputs or ()
        self.schema_side_inputs = schema_side_inputs or ()

        self.is_streaming_pipeline = is_streaming_pipeline
        self._validate = validate
        if self._validate:
            self.verify()
Example #6
  def test_to_from_runner_api(self):
    """Tests that serialization of WriteToBigQuery is correct.

    This is not intended to be a change-detector test. As such, this only tests
    the more complicated serialization logic of parameters: ValueProviders,
    callables, and side inputs.
    """
    FULL_OUTPUT_TABLE = 'test_project:output_table'

    p = TestPipeline()

    # Used for testing side input parameters.
    table_record_pcv = beam.pvalue.AsDict(
        p | "MakeTable" >> beam.Create([('table', FULL_OUTPUT_TABLE)]))

    # Used for testing value provider parameters.
    schema = value_provider.StaticValueProvider(str, '"a:str"')

    original = WriteToBigQuery(
        table=lambda _, side_input: side_input['table'],
        table_side_inputs=(table_record_pcv, ),
        schema=schema)

    # pylint: disable=expression-not-assigned
    p | 'MyWriteToBigQuery' >> original

    # Run the pipeline through to generate a pipeline proto from an empty
    # context. This ensures that the serialization code ran.
    pipeline_proto, context = TestPipeline.from_runner_api(
        p.to_runner_api(), p.runner, p.get_pipeline_options()).to_runner_api(
            return_context=True)

    # Find the transform from the context.
    write_to_bq_id = [
        k for k, v in pipeline_proto.components.transforms.items()
        if v.unique_name == 'MyWriteToBigQuery'
    ][0]
    deserialized_node = context.transforms.get_by_id(write_to_bq_id)
    deserialized = deserialized_node.transform
    self.assertIsInstance(deserialized, WriteToBigQuery)

    # Test that the serialization of a value provider is correct.
    self.assertEqual(original.schema, deserialized.schema)

    # Test that the serialization of a callable is correct.
    self.assertEqual(
        deserialized._table(None, {'table': FULL_OUTPUT_TABLE}),
        FULL_OUTPUT_TABLE)

    # Test that the serialization of a side input is correct.
    self.assertEqual(
        len(original.table_side_inputs), len(deserialized.table_side_inputs))
    original_side_input_data = original.table_side_inputs[0]._side_input_data()
    deserialized_side_input_data = deserialized.table_side_inputs[
        0]._side_input_data()
    self.assertEqual(
        original_side_input_data.access_pattern,
        deserialized_side_input_data.access_pattern)
    self.assertEqual(
        original_side_input_data.window_mapping_fn,
        deserialized_side_input_data.window_mapping_fn)
    self.assertEqual(
        original_side_input_data.view_fn, deserialized_side_input_data.view_fn)