def test_flatmap_builtin(self):
  """Chain FlatMap and Map steps, finishing with the builtin ``set``.

  Every intermediate collection is checked with its own labelled assertion.
  """
  with TestPipeline() as pipeline:
    numbers = pipeline | 'label1' >> Create([1, 2, 3])
    assert_that(numbers, equal_to([1, 2, 3]))

    # FlatMap emits the items of the one-element list returned per input.
    shifted = numbers | 'do' >> FlatMap(lambda value: [value + 10])
    assert_that(shifted, equal_to([11, 12, 13]), label='pcoll2')

    # Map emits each returned list as a single element.
    paired = shifted | 'm1' >> Map(lambda value: [value, 12])
    expected_pairs = [[11, 12], [12, 12], [13, 12]]
    assert_that(paired, equal_to(expected_pairs), label='pcoll3')

    # ``set`` drops the duplicate inside [12, 12] before the flattening.
    flattened = paired | 'do2' >> FlatMap(set)
    assert_that(flattened, equal_to([11, 12, 12, 12, 13]), label='pcoll4')
Exemple #2
0
 def expand(self, p):
     """Run the text clean-up steps over ``p``, one ``Map`` per step.

     Each step maps a ``PreProcessing`` function over the previous result;
     the last step is additionally given ``self.tok``.
     """
     without_html = p | 'clean from HTML' >> Map(PreProcessing.clean_html)
     without_mentions = (
         without_html
         | 'remove mentions and links' >> Map(
             PreProcessing.remove_mentions_and_links))
     lowered = without_mentions | 'lowercase' >> Map(PreProcessing.make_lower)
     without_negations = (
         lowered | 'remove negations' >> Map(PreProcessing.remove_negations))
     letters = without_negations | 'letter only' >> Map(
         PreProcessing.letter_only)
     return letters | 'remove small words' >> Map(
         PreProcessing.remove_small_words, self.tok)
Exemple #3
0
    def test_window_param(self):
        """Check that ``NewDoFn.WindowParam`` exposes each window's bounds."""

        class TestDoFn(NewDoFn):
            """Pairs every element with the (start, end) of its window."""

            def process(self, element, window=NewDoFn.WindowParam):
                bounds = (float(window.start), float(window.end))
                yield (element, bounds)

        pipeline = TestPipeline()
        # Each value doubles as its own timestamp.
        timestamped = (
            pipeline
            | Create([1, 7])
            | Map(lambda value: TimestampedValue(value, value)))
        windowed = timestamped | WindowInto(windowfn=SlidingWindows(10, 5))
        pcoll = windowed | ParDo(TestDoFn())

        expected = [(1, (-5, 5)), (1, (0, 10)), (7, (0, 10)), (7, (5, 15))]
        assert_that(pcoll, equal_to(expected))
        pipeline.run()
Exemple #4
0
 def test_rewindow(self):
   """Check that rewindowing keeps the per-window duplicates of an element.

   Ten ``(k, k)`` pairs are turned into timestamped values, placed into
   sliding windows, rewindowed twice into fixed windows of 5, and grouped
   under one key. Each expected group holds every value three times.
   """
   p = TestPipeline()
   result = (p
             | Create([(k, k) for k in range(10)])
             # Tuple-parameter lambdas (``lambda (x, t): ...``) are a syntax
             # error on Python 3, so the pair is indexed explicitly instead.
             | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
             # Per the model, each element is now duplicated across
             # three windows. Rewindowing must preserve this duplication.
             | 'rewindow' >> WindowInto(FixedWindows(5))
             | 'rewindow2' >> WindowInto(FixedWindows(5))
             | Map(lambda v: ('key', v))
             | GroupByKey())
   assert_that(result, equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
                                 ('key', sorted([5, 6, 7, 8, 9] * 3))]))
   p.run()
Exemple #5
0
    def test_incomparable_default(self):
        """Run a ``Map`` whose lambda default cannot be compared or hashed."""

        class IncomparableType(object):
            """Raises ``RuntimeError`` on equality, inequality and hashing."""

            def __hash__(self):
                raise RuntimeError()

            def __eq__(self, other):
                raise RuntimeError()

            def __ne__(self, other):
                raise RuntimeError()

        # Ensure that we don't use default values in a context where they must
        # be comparable (see BEAM-8301).
        incomparable = IncomparableType()
        with TestPipeline() as pipeline:
            source = pipeline | beam.Create([None])
            pcoll = source | Map(
                lambda e, x=incomparable: (e, type(x).__name__))
            expected = [(None, 'IncomparableType')]
            assert_that(pcoll, equal_to(expected))
  def test_window_param(self):
    """Check ``DoFn.WindowParam`` on a first and on a repeated ParDo pass."""

    class TestDoFn(DoFn):
      """Pairs every element with the (start, end) of its window."""

      def process(self, element, window=DoFn.WindowParam):
        bounds = (float(window.start), float(window.end))
        yield (element, bounds)

    first_pass = [(1, (-5, 5)), (1, (0, 10)), (7, (0, 10)), (7, (5, 15))]
    # The second pass wraps each first-pass output with the same bounds again.
    second_pass = [(item, item[1]) for item in first_pass]

    with TestPipeline() as pipeline:
      timestamped = (
          pipeline
          | Create([1, 7])
          | Map(lambda value: TimestampedValue(value, value)))
      windowed = timestamped | WindowInto(windowfn=SlidingWindows(10, 5))
      pcoll = windowed | ParDo(TestDoFn())
      assert_that(pcoll, equal_to(first_pass))

      pcoll2 = pcoll | 'Again' >> ParDo(TestDoFn())
      assert_that(pcoll2, equal_to(second_pass), label='doubled windows')
Exemple #7
0
 def expand(self, pcoll):
     """Return ``pcoll`` with ``self.suffix`` added to every element."""
     append_suffix = Map(lambda element: element + self.suffix)
     return pcoll | append_suffix
Exemple #8
0
 def expand(self, pcoll):
     """Map elements through ``self._mutation_fn`` and pass them to the writer.

     The writer is a ``ParDo`` over ``_Mutate.DatastoreWriteFn`` built with
     ``self._project``.
     """
     mutations = pcoll | 'Convert to Mutation' >> Map(self._mutation_fn)
     write_fn = _Mutate.DatastoreWriteFn(self._project)
     return mutations | 'Write Mutation to Datastore' >> ParDo(write_fn)
Exemple #9
0
 def timestamped_key_values(self, pipeline, key, *timestamps):
   """Create one ``(key, t)`` windowed value per given timestamp ``t``.

   Each value is wrapped in a ``WindowedValue`` that uses ``t`` as its
   timestamp and a single ``GlobalWindow`` as its window list.
   """
   stamps = pipeline | 'start' >> Create(timestamps)
   return stamps | Map(
       lambda stamp: WindowedValue((key, stamp), stamp, [GlobalWindow()]))
Exemple #10
0
 def test_eager_pipeline(self):
     """With 'EagerRunner', the applied transforms compare equal to a list."""
     p = Pipeline('EagerRunner')
     squares = p | Create([1, 2, 3]) | Map(lambda n: n * n)
     self.assertEqual([1, 4, 9], squares)
    def test_multi_triggered_gbk_side_input(self):
        """Test a GBK sideinput, with multiple triggering."""
        # TODO(BEAM-9322): Remove use of this experiment.
        # This flag is only necessary when using the multi-output TestStream
        # b/c it relies on using the PCollection output tags as the
        # PCollection output ids.
        options = StandardOptions(streaming=True)
        debug_options = options.view_as(DebugOptions)
        debug_options.add_experiment('passthrough_pcollection_output_ids')

        p = TestPipeline(options=options)

        # One builder call per line, in the order the events are registered.
        stream_events = (
            TestStream()
            .advance_watermark_to(3, tag='main')
            .add_elements(['a1'], tag='main')
            .advance_watermark_to(8, tag='main')
            .add_elements(['a2'], tag='main')
            .add_elements(
                [window.TimestampedValue(('k', 100), 2)], tag='side')
            .add_elements(
                [window.TimestampedValue(('k', 400), 7)], tag='side')
            .advance_watermark_to_infinity(tag='main')
            .advance_watermark_to_infinity(tag='side'))
        test_stream = p | 'Mixed TestStream' >> stream_events

        discarding = trigger.AccumulationMode.DISCARDING

        main_windowing = beam.WindowInto(
            window.FixedWindows(5), accumulation_mode=discarding)
        main_data = test_stream['main'] | 'Main windowInto' >> main_windowing

        # The side branch may fire early after one element, then is summed
        # per key and reduced to the summed values only.
        side_windowing = beam.WindowInto(
            window.FixedWindows(5),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=discarding)
        side_data = (
            test_stream['side']
            | 'Side windowInto' >> side_windowing
            | beam.CombinePerKey(sum)
            | 'Values' >> Map(lambda keyed_sum: keyed_sum[1]))

        class RecordFn(beam.DoFn):
            """Emits (element, timestamp, side input) for each main element."""

            def process(self,
                        elm=beam.DoFn.ElementParam,
                        ts=beam.DoFn.TimestampParam,
                        side=beam.DoFn.SideInputParam):
                yield (elm, ts, side)

        side_list = beam.pvalue.AsList(side_data)
        records = main_data | beam.ParDo(RecordFn(), side_list)

        first_window = window.IntervalWindow(0, 5)
        second_window = window.IntervalWindow(5, 10)
        expected_window_to_elements = {
            first_window: [('a1', Timestamp(3), [100, 0])],
            second_window: [('a2', Timestamp(8), [400, 0])],
        }

        assert_that(records,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='assert per window')

        p.run()
Exemple #12
0
 def expand(self, pcoll):
     """Serialize each element, mark the result as bytes, apply the sink.

     Elements go through ``WriteToPubSubLite._message_to_proto_str`` and the
     result is piped into ``self._source``.
     """
     serialized = pcoll | Map(WriteToPubSubLite._message_to_proto_str)
     serialized.element_type = bytes
     return serialized | self._source
Exemple #13
0
 def expand(self, pvalue):
   """Read from ``self._source`` and decode every element as UTF-8."""
   raw = pvalue.pipeline | Read(self._source)
   raw.element_type = bytes
   decoded = raw | 'DecodeString' >> Map(
       lambda payload: payload.decode('utf-8'))
   decoded.element_type = unicode
   return decoded
Exemple #14
0
from apache_beam.transforms.window import IntervalWindow
from apache_beam.transforms.window import Sessions
from apache_beam.transforms.window import SlidingWindows
from apache_beam.transforms.window import TimestampCombiner
from apache_beam.transforms.window import TimestampedValue
from apache_beam.transforms.window import WindowedValue
from apache_beam.transforms.window import WindowFn
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import MIN_TIMESTAMP


def context(element, timestamp):
  """Build a ``WindowFn.AssignContext`` for ``element`` at ``timestamp``.

  The arguments are passed on in swapped order: timestamp first, then element.
  """
  assign_context = WindowFn.AssignContext(timestamp, element)
  return assign_context


# Reusable transform: keeps item 0 of each pair and sorts item 1.
sort_values = Map(lambda pair: (pair[0], sorted(pair[1])))


class ReifyWindowsFn(core.DoFn):
  """Rewrites ``(key, values)`` so that the key also names the window."""

  def process(self, element, window=core.DoFn.WindowParam):
    """Yield ``("<key> @ <window>", values)`` for a two-item ``element``."""
    name, grouped = element
    windowed_name = "%s @ %s" % (name, window)
    yield windowed_name, grouped


# Module-level ParDo built once around a single ReifyWindowsFn instance.
reify_windows = core.ParDo(ReifyWindowsFn())


class WindowTest(unittest.TestCase):
  """Test cases for types from ``apache_beam.transforms.window``."""

  def test_timestamped_value_cmp(self):
    """Two ``TimestampedValue``s with equal value and timestamp are equal."""
    first = TimestampedValue('a', 2)
    second = TimestampedValue('a', 2)
    self.assertEqual(first, second)
Exemple #15
0
 def expand(self, pvalue):
     """Read from Pub/Sub and decode every message as UTF-8.

     The read uses ``self.topic``, ``self.subscription`` and
     ``self.id_label``; the decoded output is typed as ``basestring``.
     """
     source = ReadFromPubSub(self.topic, self.subscription, self.id_label)
     messages = pvalue.pipeline | source
     decoded = messages | 'DecodeString' >> Map(
         lambda payload: payload.decode('utf-8'))
     decoded.element_type = basestring
     return decoded
Exemple #16
0
 def expand(self, pcoll):
     """Encode every element as UTF-8 bytes and write to ``self._sink``."""
     encoded = pcoll | 'EncodeString' >> Map(
         lambda text: text.encode('utf-8'))
     encoded.element_type = bytes
     return encoded | Write(self._sink)
Exemple #17
0
 def expand(self, pvalue):
     """Apply ``self._source`` and deserialize its bytes output.

     Elements are mapped through ``pubsublite.SequencedMessage.deserialize``
     and the result is typed as ``pubsublite.SequencedMessage``.
     """
     raw = pvalue.pipeline | self._source
     raw.element_type = bytes
     messages = raw | Map(pubsublite.SequencedMessage.deserialize)
     messages.element_type = pubsublite.SequencedMessage
     return messages