Beispiel #1
0
 def testConstruct(self):
     """Checks the 'examples' output type of a component built from a query."""
     config = elwc_config_pb2.ElwcConfig(
         context_feature_fields=['query_id', 'query_content'])
     example_gen = component.BigQueryToElwcExampleGen(
         query='query', elwc_config=config)

     # The component must expose its result under the standard Examples type.
     examples_output = example_gen.outputs['examples']
     self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                      examples_output.type_name)
Beispiel #2
0
 def testConstructWithInputConfig(self):
     """Checks the 'examples' output type when splits come from input_config."""
     # One (split name, query pattern) pair per input split.
     split_patterns = (
         ('train', 'query1'),
         ('eval', 'query2'),
         ('test', 'query3'),
     )
     input_config = example_gen_pb2.Input(splits=[
         example_gen_pb2.Input.Split(name=split_name, pattern=split_query)
         for split_name, split_query in split_patterns
     ])
     config = elwc_config_pb2.ElwcConfig(
         context_feature_fields=['query_id', 'query_content'])

     example_gen = component.BigQueryToElwcExampleGen(
         elwc_config=config, input_config=input_config)

     examples_output = example_gen.outputs['examples']
     self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                      examples_output.type_name)
Beispiel #3
0
 def testConstructWithOutputConfig(self):
     """Checks the 'examples' output type when an output split config is set."""
     # One (split name, hash bucket count) pair per output split.
     bucket_counts = (
         ('train', 2),
         ('eval', 1),
         ('test', 1),
     )
     split_config = example_gen_pb2.SplitConfig(splits=[
         example_gen_pb2.SplitConfig.Split(name=split_name,
                                           hash_buckets=buckets)
         for split_name, buckets in bucket_counts
     ])
     output_config = example_gen_pb2.Output(split_config=split_config)
     config = elwc_config_pb2.ElwcConfig(
         context_feature_fields=['query_id', 'query_content'])

     example_gen = component.BigQueryToElwcExampleGen(
         query='query', elwc_config=config, output_config=output_config)

     examples_output = example_gen.outputs['examples']
     self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
                      examples_output.type_name)
Beispiel #4
0
def _BigQueryToElwc(pipeline: beam.Pipeline, exec_properties: Dict[str, Any],
                    split_pattern: str) -> beam.pvalue.PCollection:
  """Reads rows from BigQuery and groups them into ExampleListWithContext.

  A BigQuery field without a value yields a feature without a value in the
  resulting tf.train.Features, which is consistent with BigQueryExampleGen.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties. Its 'custom_config' entry
      is parsed as a JSON-serialized CustomConfig wrapping an ElwcConfig.
    split_pattern: Split.pattern in Input config, a BigQuery sql string.

  Returns:
    PCollection of ExampleListWithContext.

  Raises:
    RuntimeError: If the queried result does not contain every context
      feature field named in the ElwcConfig.
  """
  # Recover the ElwcConfig packed inside the serialized custom config.
  packed_config = example_gen_pb2.CustomConfig()
  json_format.Parse(exec_properties['custom_config'], packed_config)
  elwc_config = elwc_config_pb2.ElwcConfig()
  packed_config.custom_config.Unpack(elwc_config)

  # A zero-row wrapper around the user query is enough to obtain the type
  # information for each field.
  schema_query = 'SELECT * FROM ({}) LIMIT 0'.format(split_pattern)
  schema = bigquery.Client().query(schema_query).result().schema
  type_map = {field.name: field.field_type for field in schema}

  # Every context field must be present among the queried columns.
  context_feature_fields = set(elwc_config.context_feature_fields)
  if not context_feature_fields.issubset(type_map):
    raise RuntimeError('Context feature fields are missing from the query.')

  rows = (
      pipeline
      | 'ReadFromBigQuery' >> utils.ReadFromBigQuery(query=split_pattern))
  context_and_examples = (
      rows
      | 'RowToContextFeatureAndExample' >> beam.ParDo(
          _RowToContextFeatureAndExample(type_map, context_feature_fields)))
  examples_by_context = (
      context_and_examples
      | 'CombineByContext' >> beam.CombinePerKey(
          beam.combiners.ToListCombineFn()))
  return (
      examples_by_context
      | 'ConvertContextAndExamplesToElwc' >> beam.Map(
          _ConvertContextAndExamplesToElwc))
Beispiel #5
0
    def testBigQueryToElwc(self, mock_client):
        """Runs _BigQueryToElwc against a mocked client and checks the ELWCs."""
        # Make the mocked client's query(...).result() report the test schema.
        mocked_result = (
            mock_client.return_value.query.return_value.result.return_value)
        mocked_result.schema = self._schema

        # Wrap the ElwcConfig in a CustomConfig and serialize it to JSON.
        custom_config = example_gen_pb2.CustomConfig()
        custom_config.custom_config.Pack(
            elwc_config_pb2.ElwcConfig(context_feature_fields=[
                'context_feature_1', 'context_feature_2'
            ]))
        exec_properties = {
            '_beam_pipeline_args': [],
            'custom_config': json_format.MessageToJson(
                custom_config, preserving_proto_field_name=True),
        }
        query = ('SELECT context_feature_1, context_feature_2, '
                 'feature_id_1, feature_id_2, feature_id_3 FROM `fake`')

        with beam.Pipeline() as pipeline:
            to_elwc = executor._BigQueryToElwc(
                exec_properties=exec_properties, split_pattern=query)
            elwc_examples = pipeline | 'ToElwc' >> to_elwc

            util.assert_that(
                elwc_examples,
                util.equal_to([_ELWC_1, _ELWC_2, _ELWC_3, _ELWC_4, _ELWC_5]))