def test_group_by_key_input_visitor_for_non_gbk_transforms(self):
    """The GBK input visitor must not rewrite inputs of non-GBK transforms."""
    pipeline = TestPipeline()
    pcoll = PCollection(pipeline)
    for transform in (beam.Flatten(), beam.Map(lambda x: x)):
        # Reset before each visit; the visitor should leave it untouched.
        pcoll.element_type = typehints.Any
        applied = AppliedPTransform(None, transform, "label", [pcoll])
        DataflowRunner.group_by_key_input_visitor().visit_transform(applied)
        self.assertEqual(pcoll.element_type, typehints.Any)
Example #2
0
 def test_group_by_key_input_visitor_for_non_gbk_transforms(self):
     """Transforms other than GroupByKey keep their input element type."""
     pipeline = TestPipeline()
     input_pcoll = PCollection(pipeline)
     non_gbk_transforms = [beam.Flatten(), beam.Map(lambda x: x)]
     for transform in non_gbk_transforms:
         input_pcoll.element_type = typehints.Any
         applied = AppliedPTransform(None, transform, "label", [input_pcoll])
         DataflowRunner.group_by_key_input_visitor().visit_transform(applied)
         # Element type must be unchanged for non-GBK transforms.
         self.assertEqual(input_pcoll.element_type, typehints.Any)
Example #3
0
 def apply_ReadStringsFromPubSub(self, transform, pcoll):
     """Apply ReadStringsFromPubSub as a native transform.

     Verifies the optional GCP PubSub dependency is importable, then
     returns a fresh output PCollection typed as text.

     Raises:
       ImportError: if google-cloud-pubsub is not installed.
     """
     try:
         # Probe only; the import itself is unused beyond availability check.
         from google.cloud import pubsub as unused_pubsub
     except ImportError:
         raise ImportError('Google Cloud PubSub not available, please install '
                           'apache_beam[gcp]')
     # Execute this as a native transform.
     output = PCollection(pcoll.pipeline)
     # Bug fix: `unicode` does not exist on Python 3 (NameError); `str` is
     # the text type there. The file already uses Python-3-only unittest
     # APIs elsewhere, so Python 3 is the supported runtime.
     output.element_type = str
     return output
 def test_group_by_key_input_visitor_with_invalid_inputs(self):
     """GBK inputs that are not KV-compatible must raise ValueError.

     Neither a plain ``str`` element type nor ``typehints.Set`` is
     compatible with ``KV[Any, Any]``, so the visitor must reject both.
     """
     p = TestPipeline()
     pcoll1 = PCollection(p)
     pcoll2 = PCollection(p)
     for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
         pcoll1.element_type = str
         pcoll2.element_type = typehints.Set
         err_msg = (
             r"Input to 'label' must be compatible with KV\[Any, Any\]. "
             "Found .*")
         for pcoll in [pcoll1, pcoll2]:
             # assertRaisesRegexp is deprecated since Python 3.2; use the
             # assertRaisesRegex spelling (already used elsewhere in file).
             with self.assertRaisesRegex(ValueError, err_msg):
                 DataflowRunner.group_by_key_input_visitor().visit_transform(
                     AppliedPTransform(None, transform, "label", [pcoll]))
  def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
      """Helper: run the flatten input visitor and check input coercion.

      Builds ``num_inputs`` input PCollections of ``input_type``, applies
      Flatten with an output of ``output_type``, and asserts that the
      visitor rewrote EVERY input's element type to ``output_type``.
      """
      p = TestPipeline()
      inputs = []
      for _ in range(num_inputs):
          input_pcoll = PCollection(p)
          input_pcoll.element_type = input_type
          inputs.append(input_pcoll)
      output_pcoll = PCollection(p)
      output_pcoll.element_type = output_type

      flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
      flatten.add_output(output_pcoll, None)
      DataflowRunner.flatten_input_visitor().visit_transform(flatten)
      # Bug fix: the original loop checked inputs[0] num_inputs times,
      # leaving inputs[1:] unverified; check each input individually.
      for input_pcoll in inputs:
          self.assertEqual(input_pcoll.element_type, output_type)
 def test_group_by_key_input_visitor_with_valid_inputs(self):
     """Valid GBK inputs are coerced to KV[Any, Any] by the visitor."""
     pipeline = TestPipeline()
     unset = PCollection(pipeline)
     any_typed = PCollection(pipeline)
     kv_typed = PCollection(pipeline)
     expected = typehints.KV[typehints.Any, typehints.Any]
     for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
         unset.element_type = None
         any_typed.element_type = typehints.Any
         kv_typed.element_type = expected
         for pcoll in [unset, any_typed, kv_typed]:
             applied = AppliedPTransform(None, transform, "label", [pcoll])
             applied.outputs[None] = PCollection(None)
             DataflowRunner.group_by_key_input_visitor().visit_transform(applied)
             # Every valid input must end up typed as KV[Any, Any].
             self.assertEqual(pcoll.element_type, expected)
 def test_group_by_key_input_visitor_with_invalid_inputs(self):
     """GBK inputs whose element type is not Tuple-like or Any must raise."""
     p = TestPipeline()
     pcoll1 = PCollection(p)
     pcoll2 = PCollection(p)
     for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
         # NOTE(review): this assigns the constraint CLASS (not an
         # instance) — presumably deliberate to hit the rejection path;
         # confirm against the visitor implementation.
         pcoll1.element_type = typehints.TupleSequenceConstraint
         pcoll2.element_type = typehints.Set
         err_msg = "Input to GroupByKey must be of Tuple or Any type"
         for pcoll in [pcoll1, pcoll2]:
             # assertRaisesRegexp is deprecated since Python 3.2; use the
             # assertRaisesRegex spelling (already used elsewhere in file).
             with self.assertRaisesRegex(ValueError, err_msg):
                 DataflowRunner.group_by_key_input_visitor().visit_transform(
                     AppliedPTransform(None, transform, "label", [pcoll]))
Example #8
0
    def test_group_by_key_input_visitor_with_invalid_inputs(self):
        """Non-KV-compatible inputs to GroupByKey must be rejected."""
        pipeline = TestPipeline()
        str_typed = PCollection(pipeline)
        set_typed = PCollection(pipeline)

        str_typed.element_type = str
        set_typed.element_type = typehints.Set
        expected_msg = (r"Input to 'label' must be compatible with KV\[Any, Any\]. "
                        "Found .*")
        for bad_input in (str_typed, set_typed):
            applied = AppliedPTransform(None, beam.GroupByKey(), "label",
                                        [bad_input])
            with self.assertRaisesRegex(ValueError, expected_msg):
                DataflowRunner.group_by_key_input_visitor().visit_transform(
                    applied)
 def test_group_by_key_input_visitor_with_valid_inputs(self):
     """The visitor rewrites every acceptable input type to KV[Any, Any]."""
     pipeline = TestPipeline()
     pcolls = [PCollection(pipeline) for _ in range(3)]
     # None, Any, and KV[Any, Any] are all acceptable GBK input types.
     valid_types = [None, typehints.Any,
                    typehints.KV[typehints.Any, typehints.Any]]
     for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
         for pcoll, element_type in zip(pcolls, valid_types):
             pcoll.element_type = element_type
             DataflowRunner.group_by_key_input_visitor().visit_transform(
                 AppliedPTransform(None, transform, "label", [pcoll]))
             self.assertEqual(pcoll.element_type,
                              typehints.KV[typehints.Any, typehints.Any])
Example #10
0
    def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
        """Helper: run the flatten input visitor and check input coercion.

        Builds ``num_inputs`` input PCollections of ``input_type``, applies
        Flatten with an output of ``output_type``, and asserts that the
        visitor rewrote EVERY input's element type to ``output_type``.
        """
        p = TestPipeline()
        inputs = []
        for _ in range(num_inputs):
            input_pcoll = PCollection(p)
            input_pcoll.element_type = input_type
            inputs.append(input_pcoll)
        output_pcoll = PCollection(p)
        output_pcoll.element_type = output_type

        flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
        flatten.add_output(output_pcoll, None)
        DataflowRunner.flatten_input_visitor().visit_transform(flatten)
        # Bug fix: the original loop checked inputs[0] num_inputs times,
        # leaving inputs[1:] unverified; check each input individually.
        for input_pcoll in inputs:
            self.assertEqual(input_pcoll.element_type, output_type)
Example #11
0
 def expand(self, pvalue):
     """Return the output PCollection; execution itself is native."""
     # Boundedness of the output mirrors the underlying source.
     bounded = self._source.is_bounded()
     return PCollection(self.pipeline, is_bounded=bounded)
Example #12
0
 def expand(self, pcoll):
     """Check the incoming PCollection, then forward it as the output."""
     self._check_pcollection(pcoll)
     result = PCollection.from_(pcoll)
     return result
Example #13
0
 def expand(self, pvalue):
     """Produce the output PCollection; the work runs as a native transform."""
     out = PCollection(self.pipeline)
     return out
Example #14
0
 def expand(self, pcoll):
     """Wrap the input in an output PCollection derived from it."""
     derived = PCollection.from_(pcoll)
     return derived