Code example #1
  def testCoGroupByKey(self):
    with self.pipeline as p:
      pc1 = (p
             | 'Read ' + INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.inputOptions)))
             | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
            )

      pc2 = (p
             | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.coInputOptions)))
             | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
                 lambda x: (x, x))
            )
      # pylint: disable=expression-not-assigned
      ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
       | 'CoGroupByKey: ' >> beam.CoGroupByKey()
       | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
       | 'Measure time' >> beam.ParDo(MeasureTime())
      )

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()

      for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
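Most of the examples in this listing funnel elements through a MeasureTime DoFn imported from apache_beam.testing.load_tests.load_test_metrics_utils. As a minimal sketch of the idea only (the real helper may differ in detail), such a DoFn records bundle start and end timestamps into a distribution metric:

import time

import apache_beam as beam
from apache_beam.metrics import Metrics


class MeasureTimeSketch(beam.DoFn):
  """Illustrative stand-in for MeasureTime; not the actual implementation."""

  def __init__(self, namespace='load-test-sketch'):
    self.namespace = namespace
    self.runtime = Metrics.distribution(self.namespace, 'runtime')

  def start_bundle(self):
    # Timestamp taken when the bundle starts.
    self.runtime.update(time.time())

  def finish_bundle(self):
    # Timestamp taken when the bundle ends; the min/max of the distribution
    # then bracket the wall-clock time spent in this step.
    self.runtime.update(time.time())

  def process(self, element):
    yield element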
Code example #2
    def test(self):
        pc1 = (self.pipeline
               | 'Read ' + self.INPUT_TAG >> beam.io.Read(
                   synthetic_pipeline.SyntheticSource(
                       self.parse_synthetic_source_options()))
               | 'Measure time: Start pc1' >> beam.ParDo(
                   MeasureTime(self.metrics_namespace)))

        pc2 = (self.pipeline
               | 'Read ' + self.CO_INPUT_TAG >> beam.io.Read(
                   synthetic_pipeline.SyntheticSource(
                       self.parse_synthetic_source_options(
                           self.co_input_options)))
               | 'Measure time: Start pc2' >> beam.ParDo(
                   MeasureTime(self.metrics_namespace)))
        # pylint: disable=expression-not-assigned
        ({
            self.INPUT_TAG: pc1,
            self.CO_INPUT_TAG: pc2
        }
         | 'CoGroupByKey ' >> beam.CoGroupByKey()
         | 'Consume Joined Collections' >> beam.ParDo(
             self._UngroupAndReiterate(self.INPUT_TAG, self.CO_INPUT_TAG),
             self.iterations)
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))
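The _UngroupAndReiterate helper consumed above is defined elsewhere in the load-test module and is not shown in this excerpt. Purely as an assumption-labeled sketch of what "ungroup and re-iterate" could mean, a DoFn of that shape might walk both grouped iterables of the CoGroupByKey result once per iteration:

import apache_beam as beam


class UngroupAndReiterateSketch(beam.DoFn):
  """Hypothetical stand-in; the real _UngroupAndReiterate may differ."""

  def __init__(self, input_tag, co_input_tag):
    self.input_tag = input_tag
    self.co_input_tag = co_input_tag

  def process(self, element, iterations):
    key, grouped = element
    for _ in range(iterations):
      # Force both joined iterables to be fully re-read on every pass.
      for _ in grouped[self.input_tag]:
        pass
      for _ in grouped[self.co_input_tag]:
        pass
    yield key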
Code example #3
    def testCoGroupByKey(self):
        pc1 = (self.pipeline
               | 'Read ' + INPUT_TAG >> beam.io.Read(
                   synthetic_pipeline.SyntheticSource(
                       self.parseTestPipelineOptions(self.input_options)))
               |
               'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
               | 'Measure time: Start pc1' >> beam.ParDo(
                   MeasureTime(self.metrics_namespace)))

        pc2 = (
            self.pipeline
            | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(
                    self.parseTestPipelineOptions(self.co_input_options)))
            |
            'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
            | 'Measure time: Start pc2' >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))
        # pylint: disable=expression-not-assigned
        ({
            INPUT_TAG: pc1,
            CO_INPUT_TAG: pc2
        }
         | 'CoGroupByKey: ' >> beam.CoGroupByKey()
         | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))

        result = self.pipeline.run()
        result.wait_until_finish()
        if self.metrics_monitor is not None:
            self.metrics_monitor.send_metrics(result)
Code example #4
    def testSideInput(self):
        def join_fn(element, side_input, iterations):
            # Walk the side input `iterations` times; only the final pass
            # contributes to the joined output.
            joined = []
            for i in range(iterations):
                for key, value in side_input:
                    if i == iterations - 1:
                        joined.append({key: element[1] + value})
            yield joined

        main_input = (self.pipeline
                      | "Read pcoll 1" >> beam.io.Read(
                          synthetic_pipeline.SyntheticSource(
                              self.parseTestPipelineOptions()))
                      | 'Measure time: Start pcoll 1' >> beam.ParDo(
                          MeasureTime(self.metrics_namespace)))

        side_input = (
            self.pipeline
            | "Read pcoll 2" >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(self._getSideInput()))
            | 'Measure time: Start pcoll 2' >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))
        # pylint: disable=expression-not-assigned
        (main_input
         | "Merge" >> beam.ParDo(join_fn, AsIter(side_input), self.iterations)
         | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))
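In the example above, the second PCollection reaches join_fn through AsIter (apache_beam.pvalue.AsIter), i.e. as an iterable side input that is re-read for every main-input element. A minimal, self-contained illustration of the same pattern (the names here are invented for the sketch):

import apache_beam as beam
from apache_beam.pvalue import AsIter


def cross_join(element, side):
  # `side` is the materialized side-input iterable, re-readable per element.
  for other in side:
    yield (element, other)


with beam.Pipeline() as p:
  main = p | 'Main' >> beam.Create([1, 2, 3])
  side = p | 'Side' >> beam.Create(['a', 'b'])
  # AsIter(side) is passed to cross_join as its second argument.
  _ = main | 'Join' >> beam.FlatMap(cross_join, AsIter(side))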
Code example #5
  def testCoGroupByKey(self):
    pc1 = (self.pipeline
           | 'Read ' + INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.input_options)))
           | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
           | 'Measure time: Start pc1' >> beam.ParDo(
               MeasureTime(self.metrics_namespace))
          )

    pc2 = (self.pipeline
           | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
               synthetic_pipeline.SyntheticSource(
                   self.parseTestPipelineOptions(self.co_input_options)))
           | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
               lambda x: (x, x))
           | 'Measure time: Start pc2' >> beam.ParDo(
               MeasureTime(self.metrics_namespace))
          )
    # pylint: disable=expression-not-assigned
    ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
     | 'CoGroupByKey ' >> beam.CoGroupByKey()
     | 'Consume Joined Collections' >> beam.ParDo(self._UngroupAndReiterate(),
                                                  self.iterations)
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )
Code example #6
    def testSideInput(self):
        def join_fn(element, side_input, iterations):
            # Walk the side input `iterations` times; only the final pass
            # contributes to the joined output.
            joined = []
            for i in range(iterations):
                for key, value in side_input:
                    if i == iterations - 1:
                        joined.append({key: element[1] + value})
            yield joined

        with self.pipeline as p:
            main_input = (p
                          | "Read pcoll 1" >> beam.io.Read(
                              synthetic_pipeline.SyntheticSource(
                                  self._parseTestPipelineOptions()))
                          | 'Measure time: Start pcoll 1' >> beam.ParDo(
                              MeasureTime(self.metrics_namespace)))

            side_input = (
                p
                | "Read pcoll 2" >> beam.io.Read(
                    synthetic_pipeline.SyntheticSource(self._getSideInput()))
                | 'Measure time: Start pcoll 2' >> beam.ParDo(
                    MeasureTime(self.metrics_namespace)))
            # pylint: disable=expression-not-assigned
            (main_input
             | "Merge" >> beam.ParDo(join_fn, AsIter(side_input),
                                     self.iterations)
             |
             'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))

            result = p.run()
            result.wait_until_finish()

            if self.metrics_monitor is not None:
                self.metrics_monitor.send_metrics(result)
Code example #7
  def testParDo(self):
    class CounterOperation(beam.DoFn):
      def __init__(self, number_of_counters, number_of_operations):
        self.number_of_operations = number_of_operations
        self.counters = []
        for i in range(number_of_counters):
          self.counters.append(Metrics.counter('do-not-publish',
                                               'name-{}'.format(i)))

      def process(self, element):
        for _ in range(self.number_of_operations):
          for counter in self.counters:
            counter.inc()
        yield element

    pc = (self.pipeline
          | 'Read synthetic' >> beam.io.Read(
              synthetic_pipeline.SyntheticSource(
                  self.parseTestPipelineOptions()
              ))
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace))
         )

    for i in range(self.iterations):
      pc = (pc
            | 'Step: %d' % i >> beam.ParDo(
                CounterOperation(self.number_of_counters,
                                 self.number_of_operations))
           )

    # pylint: disable=expression-not-assigned
    (pc
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )
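The CounterOperation DoFn above publishes its counters under the 'do-not-publish' namespace. A standalone sketch, assuming the standard MetricsFilter API, of how such counters can be read back from a pipeline result (the tiny pipeline exists only to make the snippet runnable):

import logging

import apache_beam as beam
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter


class CountOnce(beam.DoFn):
  def __init__(self):
    self.counter = Metrics.counter('do-not-publish', 'name-0')

  def process(self, element):
    self.counter.inc()
    yield element


p = beam.Pipeline()
_ = p | beam.Create([1, 2, 3]) | beam.ParDo(CountOnce())
result = p.run()
result.wait_until_finish()

# Query only the counters published under the namespace used above.
query = result.metrics().query(
    MetricsFilter().with_namespace('do-not-publish'))
for counter in query['counters']:
  logging.info('Counter: %s', counter)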
Code example #8
    def testParDo(self):
        if self.iterations is None:
            num_runs = 1
        else:
            num_runs = int(self.iterations)

        with self.pipeline as p:
            pc = (p
                  | 'Read synthetic' >> beam.io.Read(
                      synthetic_pipeline.SyntheticSource(
                          self.parseTestPipelineOptions()))
                  | 'Measure time' >> beam.ParDo(MeasureTime()))

            for i in range(num_runs):
                label = 'Step: %d' % i
                pc = (pc | label >> beam.ParDo(self._GetElement()))

            if self.output is not None:
                # pylint: disable=expression-not-assigned
                (pc | "Write" >> beam.io.WriteToText(self.output))

            result = p.run()
            result.wait_until_finish()
            metrics = result.metrics().query()
            for counter in metrics['counters']:
                logging.info("Counter: %s", counter)

            for dist in metrics['distributions']:
                logging.info("Distribution: %s", dist)
Code example #9
    def testParDo(self):
        class _GetElement(beam.DoFn):
            from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

            @count_bytes
            def process(self, element, namespace, is_returning):
                if is_returning:
                    yield element

        if not self.iterations:
            num_runs = 1
        else:
            num_runs = int(self.iterations)

        pc = (self.pipeline
              | 'Read synthetic' >> beam.io.Read(
                  synthetic_pipeline.SyntheticSource(
                      self.parseTestPipelineOptions()))
              | 'Measure time: Start' >> beam.ParDo(
                  MeasureTime(self.metrics_namespace)))

        for i in range(num_runs):
            is_returning = (i == (num_runs - 1))
            pc = (pc
                  | 'Step: %d' % i >> beam.ParDo(
                      _GetElement(), self.metrics_namespace, is_returning))

        if self.output:
            pc = (pc | "Write" >> beam.io.WriteToText(self.output))

        # pylint: disable=expression-not-assigned
        (pc
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))
Code example #10
  def test_synthetic_source_split_uneven(self):
    source = synthetic_pipeline.SyntheticSource(
        input_spec(1000, 1, 1, 'zipf', 3, 10))
    splits = source.split(100)
    sources_info = [(split.source, split.start_position,
                     split.stop_position) for split in splits]
    self.assertEqual(10, len(sources_info))
    source_test_utils.assert_sources_equal_reference_source(
        (source, None, None), sources_info)
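The input_spec() helper used in this test (and in the two source tests further down) is not shown in the excerpt. A hypothetical reconstruction, assuming it simply builds the dict specification that SyntheticSource expects (the exact keys may vary between Beam versions):

def input_spec(num_records, key_size, value_size,
               bundle_size_distribution_type='const',
               bundle_size_distribution_param=0,
               force_initial_num_bundles=0):
  # Assumed spec keys; consult synthetic_pipeline.SyntheticSource for the
  # authoritative set in your Beam version.
  return {
      'numRecords': num_records,
      'keySizeBytes': key_size,
      'valueSizeBytes': value_size,
      'bundleSizeDistribution': {
          'type': bundle_size_distribution_type,
          'param': bundle_size_distribution_param,
      },
      'forceNumInitialBundles': force_initial_num_bundles,
  }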
Code example #11
  def test_synthetic_source(self):
    def assert_size(element, expected_size):
      assert len(element) == expected_size
    with beam.Pipeline() as p:
      pcoll = (
          p | beam.io.Read(
              synthetic_pipeline.SyntheticSource(input_spec(300, 5, 15))))
      (pcoll
       | beam.Map(lambda elm: elm[0]) | 'key' >> beam.Map(assert_size, 5))
      (pcoll
       | beam.Map(lambda elm: elm[1]) | 'value' >> beam.Map(assert_size, 15))
      assert_that(pcoll | beam.combiners.Count.Globally(),
                  equal_to([300]))
Code example #12
    def testGroupByKey(self):
        pcoll = (self.pipeline
                 | beam.io.Read(
                     synthetic_pipeline.SyntheticSource(
                         self.parseTestPipelineOptions()))
                 | 'Measure time: Start' >> beam.ParDo(
                     MeasureTime(self.metrics_namespace)))

        for branch in range(self.fanout):
            # pylint: disable=expression-not-assigned
            (pcoll
             | 'GroupByKey %i' % branch >> beam.GroupByKey()
             | 'Ungroup %i' % branch >> beam.ParDo(self._UngroupAndReiterate(),
                                                   self.iterations)
             | 'Measure time: End %i' % branch >> beam.ParDo(
                 MeasureTime(self.metrics_namespace)))
Code example #13
    def testCombineGlobally(self):
        pcoll = (self.pipeline
                 | beam.io.Read(
                     synthetic_pipeline.SyntheticSource(
                         self.parseTestPipelineOptions()))
                 | 'Measure time: Start' >> beam.ParDo(
                     MeasureTime(self.metrics_namespace)))

        for branch in range(self.fanout):
            # pylint: disable=expression-not-assigned
            (pcoll
             | 'Combine with Top %i' % branch >> beam.CombineGlobally(
                 beam.combiners.TopCombineFn(1000))
             | 'Consume %i' % branch >> beam.ParDo(self._GetElement())
             | 'Measure time: End %i' % branch >> beam.ParDo(
                 MeasureTime(self.metrics_namespace)))
Code example #14
File: pardo_test.py Project: yuhonghong7035/beam
  def testParDo(self):

    class _GetElement(beam.DoFn):
      from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

      @count_bytes(COUNTER_LABEL)
      def process(self, element, namespace, is_returning):
        if is_returning:
          yield element

    if self.iterations is None:
      num_runs = 1
    else:
      num_runs = int(self.iterations)

    with self.pipeline as p:
      pc = (p
            | 'Read synthetic' >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(
                    self.parseTestPipelineOptions()
                ))
            | 'Measure time: Start' >> beam.ParDo(
                MeasureTime(self.metrics_namespace))
           )

      for i in range(num_runs):
        is_returning = (i == (num_runs-1))
        pc = (pc
              | 'Step: %d' % i >> beam.ParDo(
                  _GetElement(), self.metrics_namespace, is_returning)
             )

      if self.output is not None:
        pc = (pc
              | "Write" >> beam.io.WriteToText(self.output)
             )

      # pylint: disable=expression-not-assigned
      (pc
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()

      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
Code example #15
File: combine_test.py Project: yoreyuan/beam
  def testCombineGlobally(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time' >> beam.ParDo(MeasureTime())
       | 'Combine with Top' >> beam.CombineGlobally(
           beam.combiners.TopCombineFn(1000))
       | 'Consume' >> beam.ParDo(self._GetElement())
      )

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()
      for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
Code example #16
    def testGroupByKey(self):
        # pylint: disable=expression-not-assigned
        (self.pipeline
         | beam.io.Read(
             synthetic_pipeline.SyntheticSource(
                 self.parseTestPipelineOptions()))
         | 'Measure time: Start' >> beam.ParDo(
             MeasureTime(self.metrics_namespace))
         | 'GroupByKey' >> beam.GroupByKey()
         | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]])
         | 'Measure time: End' >> beam.ParDo(
             MeasureTime(self.metrics_namespace)))

        result = self.pipeline.run()
        result.wait_until_finish()
        if self.metrics_monitor is not None:
            self.metrics_monitor.send_metrics(result)
Code example #17
  def testCombineGlobally(self):
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'Combine with Top' >> beam.CombineGlobally(
         beam.combiners.TopCombineFn(1000))
     | 'Consume' >> beam.ParDo(self._GetElement())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
Code example #18
File: group_by_key_test.py Project: yoreyuan/beam
    def testGroupByKey(self):
        with self.pipeline as p:
            # pylint: disable=expression-not-assigned
            (p
             | beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions()))
             | 'Measure time' >> beam.ParDo(MeasureTime())
             | 'GroupByKey' >> beam.GroupByKey()
             | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v)
                                                      for v in elm[1]]))

            result = p.run()
            result.wait_until_finish()
            metrics = result.metrics().query()
            for dist in metrics['distributions']:
                logging.info("Distribution: %s", dist)
Code example #19
  def test_split_at_fraction(self):
    source = synthetic_pipeline.SyntheticSource(input_spec(10, 1, 1))
    source_test_utils.assert_split_at_fraction_exhaustive(source)
    source_test_utils.assert_split_at_fraction_fails(source, 5, 0.3)
    source_test_utils.assert_split_at_fraction_succeeds_and_consistent(
        source, 1, 0.3)