Example #1
0
def model_join_using_side_inputs(
    name_list, email_list, phone_list, output_path):
  """Join names to their emails and phone numbers through side inputs.

  Each name is processed as a main-input element while the complete email
  and phone collections are handed to the mapping function as iterable
  side inputs. One text line per name is written to ``output_path``.

  Args:
    name_list: names used as the main input.
    email_list: ``(name, email)`` pairs; the emails are joined as strings.
    phone_list: ``(name, phone_number)`` pairs; the numbers are joined as
      strings.
    output_path: destination passed to ``beam.io.WriteToText``.
  """

  import apache_beam as beam
  from apache_beam.pvalue import AsIter

  with TestPipeline() as p:  # Use TestPipeline for testing.
    # [START model_join_using_side_inputs]
    # This code performs a join by receiving the set of names as an input and
    # passing PCollections that contain emails and phone numbers as side inputs
    # instead of using CoGroupByKey.
    names = p | 'names' >> beam.Create(name_list)
    emails = p | 'email' >> beam.Create(email_list)
    phones = p | 'phone' >> beam.Create(phone_list)

    def join_info(name, emails, phone_numbers):
      # Keep only the side-input entries that belong to this name.
      matching_emails = [
          email for owner, email in emails if owner == name]
      matching_phones = [
          number for owner, number in phone_numbers if owner == name]

      fields = [
          '%s' % name,
          ','.join(matching_emails),
          ','.join(matching_phones),
      ]
      return '; '.join(fields)

    contact_lines = names | 'CreateContacts' >> beam.core.Map(
        join_info, AsIter(emails), AsIter(phones))
    # [END model_join_using_side_inputs]
    contact_lines | beam.io.WriteToText(output_path)
Example #2
0
  def test(self):
    """Assemble the side-input load-test pipeline.

    Two synthetic sources are read: one feeds the main input and the other
    is supplied to the merge step as an iterable side input. Each stage is
    followed by a ``MeasureTime`` step. This method only constructs the
    graph; no ``run()`` call is made here.
    """
    def join_fn(element, side_input, iterations):
      # The side input is walked ``iterations`` times, but only the final
      # pass contributes to the emitted list.
      merged = []
      final_pass = iterations - 1
      for pass_index in range(iterations):
        for key, value in side_input:
          if pass_index != final_pass:
            continue
          merged.append({key: element[1] + value})
      yield merged

    pipeline = self.pipeline
    namespace = self.metrics_namespace

    main_source = SyntheticSource(self.parse_synthetic_source_options())
    main_input = (
        pipeline
        | "Read pcoll 1" >> beam.io.Read(main_source)
        | 'Measure time: Start pcoll 1' >> beam.ParDo(MeasureTime(namespace)))

    side_source = SyntheticSource(self.parse_synthetic_source_options())
    side_input = (
        pipeline
        | "Read pcoll 2" >> beam.io.Read(side_source)
        | 'Measure time: Start pcoll 2' >> beam.ParDo(MeasureTime(namespace)))

    merge = beam.ParDo(join_fn, AsIter(side_input), self.iterations)
    # pylint: disable=expression-not-assigned
    (
        main_input
        | "Merge" >> merge
        | 'Measure time' >> beam.ParDo(MeasureTime(namespace)))
Example #3
0
    def testSideInput(self):
        """Run the side-input load test once and report its metrics.

        Reads two synthetic sources, merges the main input with the second
        one supplied as an iterable side input, runs the pipeline, waits for
        it to finish, and forwards the result to ``self.metrics_monitor``
        when one is configured.
        """
        def join_fn(element, side_input, iterations):
            # Accumulator is named ``merged`` so the builtin ``list`` is not
            # shadowed. The side input is walked ``iterations`` times and
            # only the final pass contributes to the output.
            merged = []
            for i in range(iterations):
                for key, value in side_input:
                    if i == iterations - 1:
                        merged.append({key: element[1] + value})
            yield merged

        # Deliberately not ``with self.pipeline as p``: leaving a Beam
        # Pipeline context runs the pipeline, so combining the context
        # manager with the explicit run() below would execute the whole
        # load test twice.
        p = self.pipeline

        main_input = (p
                      | "Read pcoll 1" >> beam.io.Read(
                          synthetic_pipeline.SyntheticSource(
                              self._parseTestPipelineOptions()))
                      | 'Measure time: Start pcoll 1' >> beam.ParDo(
                          MeasureTime(self.metrics_namespace)))

        side_input = (
            p
            | "Read pcoll 2" >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(self._getSideInput()))
            | 'Measure time: Start pcoll 2' >> beam.ParDo(
                MeasureTime(self.metrics_namespace)))
        # pylint: disable=expression-not-assigned
        (main_input
         | "Merge" >> beam.ParDo(join_fn, AsIter(side_input),
                                 self.iterations)
         |
         'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))

        # Keep the result of the single explicit run so its metrics can be
        # published.
        result = p.run()
        result.wait_until_finish()

        if self.metrics_monitor is not None:
            self.metrics_monitor.send_metrics(result)
Example #4
0
 def expand(self, pcoll):
     """Wire up the initialize / write / finalize stages for ``self.sink``.

     A one-element PCollection drives the single-shot steps: it is mapped
     through ``sink.initialize_write()`` and is also the input of the final
     'FinalizeWrite' step. The elements of ``pcoll`` are written by a
     'WriteBundles' ParDo that receives the initialization result as a
     singleton side input.

     Args:
       pcoll: the PCollection whose elements are written to the sink.

     Returns:
       The output of the 'FinalizeWrite' FlatMap, which is given the sink,
       the initialization result, the write results and ``min_shards``.
     """
     once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
     init_result_coll = once | 'InitializeWrite' >> core.Map(
         lambda _, sink: sink.initialize_write(), self.sink)

     if not getattr(self.sink, 'num_shards', 0):
         # No (or zero) num_shards on the sink: write the bundles first,
         # then gather every write result under a single key.
         min_shards = 1
         written = pcoll | 'WriteBundles' >> core.ParDo(
             _WriteBundleDoFn(self.sink), AsSingleton(init_result_coll))
         paired = written | 'Pair' >> core.Map(lambda x: (None, x))
         regrouped = (
             paired
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey())
         write_result_coll = regrouped | 'Extract' >> core.FlatMap(
             lambda x: x[1])
     else:
         # The sink asks for a shard count: key the elements first, group
         # them, and write each keyed group.
         min_shards = self.sink.num_shards
         if min_shards == 1:
             keying = core.Map(lambda x: (None, x))
         else:
             keying = core.ParDo(_RoundRobinKeyFn(min_shards))
         grouped = (
             pcoll
             | keying
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey())
         write_result_coll = grouped | 'WriteBundles' >> core.ParDo(
             _WriteKeyedBundleDoFn(self.sink), AsSingleton(init_result_coll))

     return once | 'FinalizeWrite' >> core.FlatMap(
         _finalize_write,
         self.sink,
         AsSingleton(init_result_coll),
         AsIter(write_result_coll),
         min_shards)
Example #5
0
    def test_pcollectionview_not_recreated(self):
        """Check equality of side-input views built from PCollections.

        Views created from the same PCollection with the same arguments
        must compare equal; views that differ in their arguments or in the
        PCollection they wrap must compare unequal.
        """
        pipeline = Pipeline('DirectRunner')
        ints = pipeline | 'create1' >> Create([1, 2, 3])
        pairs_a = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
        pairs_b = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])

        def labelled_singleton():
            return AsSingleton('new', ints, default_value=1)

        # Same source and same arguments: equal.
        self.assertEqual(AsSingleton(ints), AsSingleton(ints))
        self.assertEqual(labelled_singleton(), labelled_singleton())
        # Same source but different arguments: not equal.
        self.assertNotEqual(AsSingleton(ints), labelled_singleton())
        same_source_cases = (
            (AsIter, ints),
            (AsList, ints),
            (AsDict, pairs_a),
        )
        for view_cls, source in same_source_cases:
            self.assertEqual(view_cls(source), view_cls(source))

        # Same view type over different PCollections: not equal.
        different_source_cases = (
            (AsSingleton, ints, pairs_a),
            (AsIter, ints, pairs_a),
            (AsList, ints, pairs_a),
            (AsDict, pairs_a, pairs_b),
        )
        for view_cls, left, right in different_source_cases:
            self.assertNotEqual(view_cls(left), view_cls(right))