def finish_bundle(self):
    """Package pending Pub/Sub messages into a transform result.

    Messages are read together with their timestamps. Each one is emitted as
    a globally-windowed value: the whole message when the source was
    configured ``with_attributes``, otherwise only its ``data`` payload. A
    fresh bundle for the transform's input (or a ``PBegin`` when the
    transform has no inputs) is always returned as the unprocessed bundle.
    """
    messages = self._read_from_pubsub(self.source.timestamp_attribute)
    bundles = []
    if messages:
      target_pcollection = list(self._outputs)[0]
      out_bundle = self._evaluation_context.create_bundle(target_pcollection)
      # TODO(ccy): Respect the PubSub source's id_label field.
      for event_time, msg in messages:
        payload = msg if self.source.with_attributes else msg.data
        out_bundle.output(
            GlobalWindows.windowed_value(payload, timestamp=event_time))
      bundles.append(out_bundle)

    transform_inputs = self._applied_ptransform.inputs
    if transform_inputs:
      input_pvalue = transform_inputs[0]
    else:
      input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)

    # TODO(udim): Correct value for watermark hold.
    watermark_hold = {None: Timestamp.of(time.time())}
    return TransformResult(
        self, bundles, [unprocessed_bundle], None, watermark_hold)
 def test_update_int(self):
   """Counters go from 0 to 1 after one int is updated and collected."""
   counters = OperationCounters(
       CounterFactory(), 'some-name', coders.PickleCoder(), 0)
   self.verify_counters(counters, 0)

   windowed_one = GlobalWindows.windowed_value(1)
   counters.update_from(windowed_one)
   counters.update_collect()

   self.verify_counters(counters, 1)
 def __iter__(self):
   """Encode every (key, values) group into one stream and iterate over it.

   Each entry of ``self._table`` is decoded back to its key, wrapped in the
   global window, and written to a single output stream. The returned
   iterator yields exactly one item: the content of that stream.
   """
   output_stream = create_OutputStream()
   # Resolve the coder implementation once rather than once per key.
   encode_to_stream = self._post_grouped_coder.get_impl().encode_to_stream
   for encoded_key, values in self._table.items():
     key = self._key_coder.decode(encoded_key)
     encode_to_stream(
         GlobalWindows.windowed_value((key, values)), output_stream, True)
   return iter([output_stream.get()])
 def test_update_multiple(self):
   """Mean size tracks the running average of the estimated sizes."""
   pickle_coder = coders.PickleCoder()
   counters = OperationCounters(CounterFactory(), 'some-name',
                                pickle_coder, 0)
   self.verify_counters(counters, 0, float('nan'))

   def feed(payload):
     # Push one windowed value through the counters; return its size.
     windowed = GlobalWindows.windowed_value(payload)
     counters.update_from(windowed)
     return pickle_coder.estimate_size(windowed)

   size_sum = 0
   size_sum += feed('abcde')
   size_sum += feed('defghij')
   self.verify_counters(counters, 2, (float(size_sum) / 2))

   size_sum += feed('klmnop')
   self.verify_counters(counters, 3, (float(size_sum) / 3))
 def test_update_str(self):
   """A single string update reports count 1 and that value's size."""
   pickle_coder = coders.PickleCoder()
   counters = OperationCounters(
       CounterFactory(), 'some-name', pickle_coder, 0)
   self.verify_counters(counters, 0, float('nan'))

   windowed = GlobalWindows.windowed_value('abcde')
   counters.update_from(windowed)

   self.verify_counters(counters, 1, pickle_coder.estimate_size(windowed))
 def test_update_old_object(self):
   """Counters handle an instance of OldClassThatDoesNotImplementLen."""
   pickle_coder = coders.PickleCoder()
   counters = OperationCounters(
       CounterFactory(), 'some-name', pickle_coder, 0)
   self.verify_counters(counters, 0, float('nan'))

   windowed = GlobalWindows.windowed_value(OldClassThatDoesNotImplementLen())
   counters.update_from(windowed)

   self.verify_counters(counters, 1, pickle_coder.estimate_size(windowed))
Beispiel #7
0
  def finish_bundle(self):
    """Emit every value of the applied transform into ``self.bundle``.

    All values are wrapped in the global window first and only then written
    to the bundle, which is returned as the sole output bundle.
    """
    transform = self._applied_ptransform.transform
    assert transform.value is not None

    # Build the complete list up front, then emit, as two separate passes.
    windowed_values = [
        GlobalWindows.windowed_value(item) for item in transform.value]
    for windowed in windowed_values:
      self.bundle.output(windowed)

    return TransformResult(
        self._applied_ptransform, [self.bundle], None, None, None, None)
Beispiel #8
0
 def get_root_bundles(self):
   """Return the initial bundle list for the test stream.

   With no events there is nothing to schedule and an empty list is
   returned. Otherwise a single committed bundle is returned, holding the
   index 0 stamped with MIN_TIMESTAMP.
   """
   test_stream = self._applied_ptransform.transform
   if len(test_stream.events) == 0:
     return []

   root = self._evaluation_context.create_bundle(
       pvalue.PBegin(self._applied_ptransform.transform.pipeline))
   # Explicitly set timestamp to MIN_TIMESTAMP to ensure that we hold the
   # watermark.
   seed = GlobalWindows.windowed_value(0, timestamp=MIN_TIMESTAMP)
   root.add(seed)
   root.commit(None)
   return [root]
 def finish_bundle(self):
   """Return the produced bundles plus a continuation for the next event.

   While events remain after ``self.current_index``, an unprocessed bundle
   carrying the next index (stamped with the current watermark) is added
   and the watermark is held. Otherwise there is no continuation and the
   hold is None.
   """
   pending = []
   hold = None
   last_index = len(self.test_stream.events) - 1
   if self.current_index < last_index:
     continuation = self._evaluation_context.create_bundle(
         pvalue.PBegin(self._applied_ptransform.transform.pipeline))
     next_index = self.current_index + 1
     continuation.add(
         GlobalWindows.windowed_value(next_index, timestamp=self.watermark))
     pending.append(continuation)
     hold = self.watermark
   return TransformResult(
       self._applied_ptransform, self.bundles, pending, None, hold)
  def finish_bundle(self):
    """Wrap each grouped key in a KeyedWorkItem and emit them in one bundle.

    The output bundle is created lazily on the first item, so an empty
    ``self.gbk_items`` yields a result with no bundles.
    """
    out_bundles = []
    shared_bundle = None
    for encoded_key, values in iteritems(self.gbk_items):
      if not shared_bundle:
        shared_bundle = self._evaluation_context.create_bundle(
            self.output_pcollection)
        out_bundles.append(shared_bundle)
      work_item = KeyedWorkItem(encoded_key, elements=values)
      shared_bundle.add(GlobalWindows.windowed_value(work_item))

    return TransformResult(self, out_bundles, [], None, None)
Beispiel #11
0
 def __iter__(self):
   """Encode all grouped values into one stream and iterate over it.

   With default windowing each (key, values) pair is re-wrapped in the
   global window. Otherwise a batch trigger driver expands each key into
   its windowed outputs. The iterator yields a single item: the stream
   content.
   """
   output_stream = create_OutputStream()
   if self._windowing.is_default():
     wrap_globally = GlobalWindows.windowed_value(None).with_value

     def windowed_key_values(key, values):
       return [wrap_globally((key, values))]
   else:
     driver = trigger.create_trigger_driver(self._windowing, True)
     windowed_key_values = driver.process_entire_key

   value_impl = self._post_grouped_coder.get_impl()
   key_impl = self._key_coder.get_impl()
   for encoded_key, windowed_values in self._table.items():
     decoded_key = key_impl.decode(encoded_key)
     for windowed_kv in windowed_key_values(decoded_key, windowed_values):
       value_impl.encode_to_stream(windowed_kv, output_stream, True)
   return iter([output_stream.get()])
Beispiel #12
0
  def _flush_batch(self, destination):
    """Flush the buffered rows for `destination` to BigQuery.

    Rows reported in the insert errors are re-sent for as long as the retry
    strategy asks for a retry on at least one of them, sleeping for the next
    backoff interval between attempts. Afterwards the destination's buffer
    is dropped and the buffered-row total adjusted.

    Returns:
      A list of ``TaggedOutput`` values tagged ``FAILED_ROWS``, one
      ``(destination, row)`` pair per row that failed in the last attempt.
    """
    pending_rows = self._rows_buffer[destination]
    table_ref = bigquery_tools.parse_table_reference(destination)

    if table_ref.projectId is None:
      table_ref.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    logging.debug('Flushing data to %s. Total %s rows.',
                  destination, len(pending_rows))

    while True:
      # TODO: Figure out an insertId to make calls idempotent.
      passed, errors = self.bigquery_wrapper.insert_rows(
          project_id=table_ref.projectId,
          dataset_id=table_ref.datasetId,
          table_id=table_ref.tableId,
          rows=pending_rows,
          skip_invalid_rows=True)
      logging.debug("Passed: %s. Errors are %s", passed, errors)

      failed_rows = [pending_rows[err.index] for err in errors]
      should_retry = any(
          bigquery_tools.RetryStrategy.should_retry(
              self._retry_strategy, err.errors[0].reason)
          for err in errors)
      # Only the rows that failed are sent again on the next attempt.
      pending_rows = failed_rows

      if not should_retry:
        break

      delay = next(self._backoff_calculator)
      logging.info('Sleeping %s seconds before retrying insertion.', delay)
      time.sleep(delay)

    self._total_buffered_rows -= len(self._rows_buffer[destination])
    del self._rows_buffer[destination]

    return [
        pvalue.TaggedOutput(
            BigQueryWriteFn.FAILED_ROWS,
            GlobalWindows.windowed_value((destination, row)))
        for row in failed_rows
    ]
Beispiel #13
0
 def partition(self, n):
     # type: (int) -> List[List[bytes]]
     """Split the buffered groups into `n` encoded parts.

     The result is computed once and cached: later calls return the same
     partitions whatever `n` they pass, so re-partitioning with a different
     `n` is not supported. Keys are assigned to parts round-robin in table
     iteration order, and the table is cleared afterwards.
     """
     if self._grouped_output:
         return self._grouped_output

     if self._windowing.is_default():
         wrap_globally = GlobalWindows.windowed_value(
             None,
             timestamp=GlobalWindow().max_timestamp(),
             pane_info=windowed_value.PaneInfo(
                 is_first=True,
                 is_last=True,
                 timing=windowed_value.PaneInfoTiming.ON_TIME,
                 index=0,
                 nonspeculative_index=0)).with_value

         def windowed_key_values(key, values):
             return [wrap_globally((key, values))]
     else:
         # TODO(pabloem, BEAM-7514): Trigger driver needs access to the clock
         #   note that this only comes through if windowing is default - but what
         #   about having multiple firings on the global window.
         #   May need to revise.
         driver = trigger.create_trigger_driver(self._windowing, True)
         windowed_key_values = driver.process_entire_key

     value_impl = self._post_grouped_coder.get_impl()
     key_impl = self._key_coder.get_impl()
     self._grouped_output = [[] for _ in range(n)]
     streams = [create_OutputStream() for _ in range(n)]
     for position, (encoded_key, windowed_values) in enumerate(
             self._table.items()):
         decoded_key = key_impl.decode(encoded_key)
         target_stream = streams[position % n]
         for windowed_kv in windowed_key_values(decoded_key, windowed_values):
             value_impl.encode_to_stream(windowed_kv, target_stream, True)
     for slot, stream in enumerate(streams):
         self._grouped_output[slot] = [stream.get()]
     self._table.clear()
     return self._grouped_output
Beispiel #14
0
 def __iter__(self):
   """Iterate over the encoded grouped output, building it on first use.

   The first call encodes the whole table into one stream, caches the
   result in ``self._grouped_output`` and sets ``self._table`` to None.
   Later calls iterate over the cached output.
   """
   if self._grouped_output:
     return iter(self._grouped_output)

   output_stream = create_OutputStream()
   if self._windowing.is_default():
     wrap_globally = GlobalWindows.windowed_value(None).with_value

     def windowed_key_values(key, values):
       return [wrap_globally((key, values))]
   else:
     driver = trigger.create_trigger_driver(self._windowing, True)
     windowed_key_values = driver.process_entire_key

   value_impl = self._post_grouped_coder.get_impl()
   key_impl = self._key_coder.get_impl()
   for encoded_key, windowed_values in self._table.items():
     decoded_key = key_impl.decode(encoded_key)
     for windowed_kv in windowed_key_values(decoded_key, windowed_values):
       value_impl.encode_to_stream(windowed_kv, output_stream, True)
   self._grouped_output = [output_stream.get()]
   self._table = None
   return iter(self._grouped_output)
Beispiel #15
0
 def test_reshuffle_streaming_global_window(self):
     """Grouped data is identical before and after a Reshuffle."""
     options = PipelineOptions()
     options.view_as(StandardOptions).streaming = True
     pipeline = TestPipeline(options=options)

     pairs = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
     expected_groups = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]

     grouped = (
         pipeline
         | beam.Create(pairs)
         | beam.WindowInto(GlobalWindows())
         | beam.GroupByKey()
         # Sort each group so the comparison does not depend on value order.
         | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
     assert_that(
         grouped, equal_to(expected_groups), label='before_reshuffle')

     reshuffled = grouped | beam.Reshuffle()
     assert_that(
         reshuffled, equal_to(expected_groups), label='after reshuffle')

     pipeline.run()
Beispiel #16
0
  def get_root_bundles(self):
    """Install the event stream and return the single seed bundle.

    The events come from the TestStreamService when the transform defines
    an endpoint, and from the transform's scripted events otherwise. They
    are stored on ``_TestStreamEvaluator.event_stream``. The returned bundle
    holds one empty-bytes element stamped with MIN_TIMESTAMP.
    """
    test_stream = self._applied_ptransform.transform

    if test_stream.endpoint:
      # An endpoint was defined: fetch the events over RPC.
      events = _TestStream.events_from_rpc(
          test_stream.endpoint,
          test_stream.output_tags,
          test_stream.coder,
          self._evaluation_context)
    else:
      events = _TestStream.events_from_script(test_stream._events)
    _TestStreamEvaluator.event_stream = events

    root = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
    seed_bundle = self._evaluation_context.create_bundle(root)
    seed_bundle.add(GlobalWindows.windowed_value(b'', timestamp=MIN_TIMESTAMP))
    seed_bundle.commit(None)
    return [seed_bundle]
Beispiel #17
0
def create_trigger_driver(
    windowing, is_batch=False, phased_combine_fn=None, clock=None):
  """Create the TriggerDriver for the given windowing and options.

  In batch mode, default windowing, or global windows with an
  ``AfterCount(1)`` or ``Always()`` trigger, gets a
  ``BatchGlobalTriggerDriver``. Everything else gets a
  ``GeneralTriggerDriver``. The driver is wrapped in a
  ``CombiningTriggerDriver`` when `phased_combine_fn` is given.
  """

  # TODO(robertwb): We can do more if we know elements are in timestamp
  # sorted order.
  pass_through = windowing.is_default() and is_batch
  if not pass_through:
    # Here we also just pass through all the values exactly once.
    pass_through = (
        windowing.windowfn == GlobalWindows() and
        (windowing.triggerfn in [AfterCount(1), Always()]) and is_batch)

  if pass_through:
    driver = BatchGlobalTriggerDriver()
  else:
    driver = GeneralTriggerDriver(windowing, clock)

  if not phased_combine_fn:
    return driver
  # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
  # the known phased_combine_fn here.
  return CombiningTriggerDriver(phased_combine_fn, driver)
    def finish_bundle(self):
        """Emit the grouped results once the final bundle has been seen.

        Before the final bundle nothing is emitted: a watermark timer is set
        and the watermark is held at negative infinity. On the final bundle
        the buffered values of every key are emitted, unless the completion
        tag shows that already happened. The completion tag is then
        recorded and the hold is set to positive infinity.
        """
        if not self._is_final_bundle():
            self.global_state.set_timer(None, '', TimeDomain.WATERMARK,
                                        WatermarkManager.WATERMARK_POS_INF)
            return TransformResult(
                self._applied_ptransform, [], [], None,
                {None: WatermarkManager.WATERMARK_NEG_INF})

        already_emitted = self.global_state.get_state(
            None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG)
        if already_emitted:
            # Ignore empty bundles after emitting output. (This may happen
            # because empty bundles do not affect input watermarks.)
            bundles = []
        else:
            grouped = []
            # TODO(ccy): perhaps we can clean this up to not use this
            # internal attribute of the DirectStepContext.
            for encoded_key in self.step_context.keyed_existing_state:
                # The None key holds global state, not grouped elements.
                if encoded_key is not None:
                    key = self.key_coder.decode(encoded_key)
                    keyed_state = self.step_context.get_keyed_state(
                        encoded_key)
                    values = keyed_state.get_state(
                        None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
                    grouped.append(
                        GlobalWindows.windowed_value((key, values)))

            def value_count(element):
                _, group_values = element.value
                return len(group_values)

            bundles = self._split_list_into_bundles(
                self.output_pcollection, grouped,
                _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE,
                value_count)

        self.global_state.add_state(
            None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
        return TransformResult(
            self._applied_ptransform, bundles, [], None,
            {None: WatermarkManager.WATERMARK_POS_INF})
Beispiel #19
0
  def test_buffering_timer_in_global_window_streaming(self):
    """Batches flush on the buffering timer and at the end of the window."""
    max_buffering_duration_secs = 42

    start_time = timestamp.Timestamp(0)
    test_stream = TestStream().advance_watermark_to(start_time)
    test_data = GroupIntoBatchesTest._create_test_data()
    for offset, value in enumerate(test_data):
      (test_stream
       .add_elements([TimestampedValue(value, start_time + offset)])
       .advance_processing_time(5))
    (test_stream
     .advance_watermark_to(
         start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1)
     .advance_watermark_to_infinity())

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # Set a batch size larger than the total number of elements.
      # Since we're in a global window, we would have been waiting
      # for all the elements to arrive without the buffering time limit.
      batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

      # To trigger the processing time timer, use a fake clock with start time
      # being Timestamp(0). Since the fake clock never really advances during
      # the pipeline execution, meaning that the timer is always set to the same
      # value, the timer will be fired on every element after the first firing.
      fake_clock = FakeClock(now=start_time)

      global_windowing = WindowInto(
          GlobalWindows(),
          trigger=Repeatedly(AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      batching = util.GroupIntoBatches(
          batch_size, max_buffering_duration_secs, fake_clock)

      num_elements_per_batch = (
          pipeline
          | test_stream
          | global_windowing
          | batching
          | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # We will flush twice when the max buffering duration is reached and when
      # the global window ends.
      assert_that(num_elements_per_batch, equal_to([9, 1]))
Beispiel #20
0
  def finish_bundle(self):
    """Emit the grouped results once the final bundle has been seen.

    Before the final bundle nothing is emitted: a watermark timer is set and
    the watermark is held at negative infinity. On the final bundle the
    buffered values of every key are emitted, unless the completion tag
    shows that already happened. The completion tag is then recorded and
    the hold is set to positive infinity.
    """
    if not self._is_final_bundle():
      self.global_state.set_timer(
          None, '', TimeDomain.WATERMARK, WatermarkManager.WATERMARK_POS_INF)
      return TransformResult(
          self._applied_ptransform, [], [], None,
          {None: WatermarkManager.WATERMARK_NEG_INF})

    already_emitted = self.global_state.get_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG)
    if already_emitted:
      # Ignore empty bundles after emitting output. (This may happen because
      # empty bundles do not affect input watermarks.)
      bundles = []
    else:
      grouped = []
      # TODO(ccy): perhaps we can clean this up to not use this
      # internal attribute of the DirectStepContext.
      for encoded_key in self.step_context.keyed_existing_state:
        # The None key holds global state, not grouped elements.
        if encoded_key is not None:
          key = self.key_coder.decode(encoded_key)
          keyed_state = self.step_context.get_keyed_state(encoded_key)
          values = keyed_state.get_state(
              None, _GroupByKeyOnlyEvaluator.ELEMENTS_TAG)
          grouped.append(GlobalWindows.windowed_value((key, values)))

      def value_count(element):
        _, group_values = element.value
        return len(group_values)

      bundles = self._split_list_into_bundles(
          self.output_pcollection, grouped,
          _GroupByKeyOnlyEvaluator.MAX_ELEMENT_PER_BUNDLE, value_count)

    self.global_state.add_state(
        None, _GroupByKeyOnlyEvaluator.COMPLETION_TAG, True)
    return TransformResult(
        self._applied_ptransform, bundles, [], None,
        {None: WatermarkManager.WATERMARK_POS_INF})
Beispiel #21
0
 def finish_bundle(self):
   """Package pending Pub/Sub messages into a transform result.

   Every message read is emitted in the global window, stamped with the
   wall-clock time at which this bundle was built. A fresh bundle for the
   transform's input (or a ``PBegin`` when there are no inputs) is always
   returned as the unprocessed bundle.
   """
   messages = self._read_from_pubsub()
   bundles = []
   if messages:
     target_pcollection = list(self._outputs)[0]
     out_bundle = self._evaluation_context.create_bundle(target_pcollection)
     # TODO(ccy): we currently do not use the PubSub message timestamp or
     # respect the PubSub source's id_label field.
     read_time = Timestamp.of(time.time())
     for payload in messages:
       out_bundle.output(
           GlobalWindows.windowed_value(payload, timestamp=read_time))
     bundles.append(out_bundle)

   transform_inputs = self._applied_ptransform.inputs
   if transform_inputs:
     input_pvalue = transform_inputs[0]
   else:
     input_pvalue = pvalue.PBegin(self._applied_ptransform.transform.pipeline)
   unprocessed_bundle = self._evaluation_context.create_bundle(input_pvalue)

   watermark_hold = {None: Timestamp.of(time.time())}
   return TransformResult(
       self._applied_ptransform, bundles, [unprocessed_bundle], None,
       watermark_hold)
Beispiel #22
0
def create_trigger_driver(windowing,
                          is_batch=False,
                          phased_combine_fn=None,
                          clock=None):
    """Create the TriggerDriver for the given windowing and options.

    A ``DiscardingGlobalTriggerDriver`` is used for default windowing in
    batch mode, and for global windows with an ``AfterCount(1)`` trigger in
    DISCARDING accumulation mode. Everything else gets a
    ``GeneralTriggerDriver``. The driver is wrapped in a
    ``CombiningTriggerDriver`` when `phased_combine_fn` is given.
    """

    # TODO(robertwb): We can do more if we know elements are in timestamp
    # sorted order.
    pass_through = windowing.is_default() and is_batch
    if not pass_through:
        # Here we also just pass through all the values every time.
        pass_through = (
            windowing.windowfn == GlobalWindows()
            and windowing.triggerfn == AfterCount(1)
            and windowing.accumulation_mode == AccumulationMode.DISCARDING)

    if pass_through:
        driver = DiscardingGlobalTriggerDriver()
    else:
        driver = GeneralTriggerDriver(windowing, clock)

    if not phased_combine_fn:
        return driver
    # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
    # the known phased_combine_fn here.
    return CombiningTriggerDriver(phased_combine_fn, driver)
Beispiel #23
0
def main():
    """Build and run a pipeline that groups large elements under one key.

    A hundred elements are expanded by ``make_large_elements``, keyed with
    the empty string and grouped in the global window. The trigger fires
    repeatedly on whichever comes first: the element-count threshold or the
    processing-time delay. The pipeline blocks until it finishes.
    """
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    batch_size = 1000000
    buffering_secs = 600

    flush_trigger = Repeatedly(
        AfterAny(AfterCount(batch_size),
                 AfterProcessingTime(buffering_secs)))

    p = Pipeline(options=options)
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
     | WindowInto(GlobalWindows(),
                  trigger=flush_trigger,
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info(
         'key: %s, value count: %s', kv[0], len(kv[1]))))

    p.run().wait_until_finish()
Beispiel #24
0
 def process_element(self, element):
   """Process the test-stream event selected by `element`.

   The element's value is the index of the event to handle and its
   timestamp becomes the current watermark. Element events are emitted
   into a new output bundle, watermark events advance the watermark, and
   processing-time events are currently ignored.

   Raises:
     AssertionError: if the index is not an int or is out of range, if
       there is not exactly one output, or if a watermark event would
       move the watermark backwards.
     ValueError: if the event is of an unknown type.
   """
   index = element.value
   self.watermark = element.timestamp
   assert isinstance(index, int)
   # The upper bound is exclusive: index == len(events) has no event to
   # look up and would otherwise fail below with an IndexError.
   assert 0 <= index < len(self.test_stream.events)
   self.current_index = index
   event = self.test_stream.events[self.current_index]
   if isinstance(event, ElementEvent):
     assert len(self._outputs) == 1
     output_pcollection = list(self._outputs)[0]
     bundle = self._evaluation_context.create_bundle(output_pcollection)
     for tv in event.timestamped_values:
       bundle.output(
           GlobalWindows.windowed_value(tv.value, timestamp=tv.timestamp))
     self.bundles.append(bundle)
   elif isinstance(event, WatermarkEvent):
     assert event.new_watermark >= self.watermark
     self.watermark = event.new_watermark
   elif isinstance(event, ProcessingTimeEvent):
     # TODO(ccy): advance processing time in the context's mock clock.
     pass
   else:
     raise ValueError('Invalid TestStream event: %s.' % event)
Beispiel #25
0
    def test_combining_with_accumulation_mode_and_fanout(self):
        """ACCUMULATING mode with fanout emits running sums on each firing."""
        # The PCollection will contain the elements 1 to 5, one per call.
        stream = TestStream().advance_watermark_to(0)
        for value in range(1, 6):
            stream.add_elements([value])
        stream.advance_watermark_to_infinity()

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            accumulating_window = beam.WindowInto(
                GlobalWindows(),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
            fanout_sum = (
                beam.CombineGlobally(sum).without_defaults().with_fanout(2))

            result = p | stream | accumulating_window | fanout_sum

            # The firings for DISCARDING mode are [1, 2, 3, 4, 5, 0, 0].
            expected_firings = [1, 3, 6, 10, 15, 15, 15]
            assert_that(result, equal_to(expected_firings))
 def get_windowing(self, inputs):
     """Return a Windowing over GlobalWindows; `inputs` is not consulted."""
     global_window_fn = GlobalWindows()
     return Windowing(global_window_fn)
Beispiel #27
0
 def finish_bundle(self):
     """Emit all collected columns as one list in the global window.

     Yields:
       A single windowed value with timestamp 0 whose value is a list copy
       of ``self.all_columns``.
     """
     # GlobalWindows is the WindowFn, not a window. Its windowed_value()
     # helper attaches the actual GlobalWindow instance, which is what a
     # WindowedValue's ``windows`` must contain.
     yield GlobalWindows.windowed_value(list(self.all_columns), timestamp=0)
 def test_update_int(self):
     """Counters go from 0 to 1 after a single int update."""
     counters = OperationCounters(
         CounterFactory(), 'some-name', coders.PickleCoder(), 0)
     self.verify_counters(counters, 0)

     windowed_one = GlobalWindows.windowed_value(1)
     counters.update_from(windowed_one)

     self.verify_counters(counters, 1)
Beispiel #29
0
 def _read_values_to_bundles(reader):
   """Wrap every element of `reader` globally and split them into bundles.

   Each element counts as size 1 towards the per-bundle element limit.
   Relies on ``self`` and ``output_pcollection`` from the enclosing scope.
   """
   windowed_elements = [
       GlobalWindows.windowed_value(item) for item in reader]
   return self._split_list_into_bundles(
       output_pcollection,
       windowed_elements,
       _BoundedReadEvaluator.MAX_ELEMENT_PER_BUNDLE,
       lambda _: 1)
Beispiel #30
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """
        Main execution logic for the Sequencer component.

        For every split of the input examples, the records are read and
        decoded, tagged with a category and a timestamp, windowed by the
        sequencer step, combined per key, and written back as gzipped
        TFRecords under the matching split of the output artifact.

        :param input_dict: input channels
        :param output_dict: output channels
        :param exec_properties: the execution properties defined in the spec
        """
        step_source = exec_properties[StepKeys.SOURCE]
        step_args = exec_properties[StepKeys.ARGS]
        step_class = source_utils.load_source_path_class(step_source)

        # Load the schema from the only file in the schema artifact's dir.
        schema_dir = artifact_utils.get_single_uri(
            input_dict[constants.SCHEMA])
        schema_path = io_utils.get_only_uri_in_dir(schema_dir)
        schema = io_utils.SchemaReader().read(schema_path)

        # TODO: Getting the statistics might help the future implementations
        sequence_step: BaseSequencerStep = step_class(schema=schema,
                                                      statistics=None,
                                                      **step_args)

        # The output artifact mirrors the split names of the input.
        input_artifact = artifact_utils.get_single_instance(
            input_dict[constants.INPUT_EXAMPLES])
        split_names = artifact_utils.decode_split_names(
            input_artifact.split_names)

        output_artifact = artifact_utils.get_single_instance(
            output_dict[constants.OUTPUT_EXAMPLES])
        output_artifact.split_names = artifact_utils.encode_split_names(
            split_names)

        with self._make_beam_pipeline() as p:
            for split in split_names:
                split_input_uri = artifact_utils.get_split_uri(
                    input_dict[constants.INPUT_EXAMPLES], split)
                input_uri = io_utils.all_files_pattern(split_input_uri)

                output_uri = artifact_utils.get_split_uri(
                    output_dict[constants.OUTPUT_EXAMPLES], split)
                output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

                # Read and decode the data
                decoded = (
                    p
                    | 'Read_' + split >> beam.io.ReadFromTFRecord(
                        file_pattern=input_uri)
                    | 'Decode_' + split >> tf_example_decoder.DecodeTFExample()
                    | 'ToDataFrame_' + split >> beam.ParDo(
                        utils.ConvertToDataframe()))

                # Window into sessions
                windowed = (
                    decoded
                    | 'AddCategory_' + split >> beam.ParDo(
                        sequence_step.get_category_do_fn())
                    | 'AddTimestamp_' + split >> beam.ParDo(
                        sequence_step.get_timestamp_do_fn())
                    | 'Sessions_' + split >> beam.WindowInto(
                        sequence_step.get_window()))

                # Combine and transform
                combined = (
                    windowed
                    | 'Combine_' + split >> beam.CombinePerKey(
                        sequence_step.get_combine_fn()))

                # Write the results
                _ = (
                    combined
                    | 'Global_' + split >> beam.WindowInto(GlobalWindows())
                    | 'RemoveKey_' + split >> beam.ParDo(RemoveKey())
                    | 'ToExample_' + split >> beam.Map(utils.df_to_example)
                    | 'Serialize_' + split >> beam.Map(utils.serialize)
                    | 'Write_' + split >> beam.io.WriteToTFRecord(
                        output_path,
                        file_name_suffix='.gz'))
Beispiel #31
0
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils import proto_utils
from apache_beam.utils import windowed_value

if TYPE_CHECKING:
    from apache_beam.coders.coder_impl import CoderImpl
    from apache_beam.runners.portability.fn_api_runner import worker_handlers
    from apache_beam.runners.portability.fn_api_runner.translations import DataSideInput
    from apache_beam.transforms.window import BoundedWindow

# Nested encoding of an empty-bytes value in the global window.
ENCODED_IMPULSE_VALUE = (
    WindowedValueCoder(BytesCoder(), GlobalWindowCoder())
    .get_impl()
    .encode_nested(GlobalWindows.windowed_value(b'')))

# Every registered WindowFn URN except the pickled one.
SAFE_WINDOW_FNS = (
    set(window.WindowFn._known_urns) - {python_urns.PICKLED_WINDOWFN})


class Buffer(Protocol):
    """Structural type for an appendable container of ``bytes`` items.

    Both methods are declarations only: their bodies do nothing and
    return ``None``.
    """
    def __iter__(self):
        # type: () -> Iterator[bytes]
        """Declared to iterate over the stored ``bytes`` items."""

    def append(self, item):
        # type: (bytes) -> None
        """Declared to accept one ``bytes`` item."""

    def _map_task_registration(self, map_task, state_handler,
                               data_operation_spec):
        """Build the Fn API registration request for a map task.

        Each ``(stage_name, operation)`` pair in ``map_task`` is translated
        into one ``PrimitiveTransform``.  A read that cannot be injected
        directly as windowed values produces two transforms: a data-input
        transform carrying the encoded source object, followed by a DoFn
        that reads the elements out of that source.

        Args:
          map_task: sequence of ``(stage_name, operation)`` pairs.  Operations
            refer to their inputs as ``(operation index, output index)``
            pairs into this sequence.
          state_handler: object whose ``Clear`` and ``Append`` methods are
            used to store the encoded contents of every side input.
          data_operation_spec: when truthy, packed into the ``data`` field of
            each data input/output function spec.

        Returns:
          A 3-tuple ``(request, runner_sinks, input_data)``.  ``request`` is
          an ``InstructionRequest`` registering a single process bundle
          descriptor.  ``runner_sinks`` maps ``(transform_id, 'out')`` to the
          ``WorkerInMemoryWrite`` operation that receives that output.
          ``input_data`` maps ``(transform_id, target_name)`` to the encoded
          bytes to feed into that data-input transform.

        Raises:
          TypeError: if an operation is of an unsupported type.
        """
        input_data = {}
        runner_sinks = {}
        transforms = []
        transform_index_to_id = {}

        # Maps coders to new coder objects and references.
        coders = {}

        def coder_id(coder):
            # Register each distinct coder once and hand back its spec id.
            if coder not in coders:
                coders[coder] = beam_fn_api_pb2.Coder(
                    function_spec=sdk_worker.pack_function_spec_data(
                        json.dumps(coder.as_cloud_object()),
                        sdk_worker.PYTHON_CODER_URN,
                        id=self._next_uid()))

            return coders[coder].function_spec.id

        def output_tags(op):
            # Operations without explicit tags have the single output 'out'.
            return getattr(op, 'output_tags', ['out'])

        def as_target(op_input):
            # Resolve an (operation index, output index) pair to the id of
            # the already-translated producer and its output name.
            input_op_index, input_output_index = op_input
            input_op = map_task[input_op_index][1]
            return {
                'ignored_input_tag':
                beam_fn_api_pb2.Target.List(target=[
                    beam_fn_api_pb2.Target(
                        primitive_transform_reference=transform_index_to_id[
                            input_op_index],
                        name=output_tags(input_op)[input_output_index])
                ])
            }

        def outputs(op):
            return {
                tag:
                beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder))
                for tag, coder in zip(output_tags(op), op.output_coders)
            }

        for op_ix, (stage_name, operation) in enumerate(map_task):
            transform_id = transform_index_to_id[op_ix] = self._next_uid()
            if isinstance(operation, operation_specs.WorkerInMemoryWrite):
                # Write this data back to the runner.
                fn = beam_fn_api_pb2.FunctionSpec(
                    urn=sdk_worker.DATA_OUTPUT_URN, id=self._next_uid())
                if data_operation_spec:
                    fn.data.Pack(data_operation_spec)
                inputs = as_target(operation.input)
                side_inputs = {}
                runner_sinks[(transform_id, 'out')] = operation

            elif isinstance(operation, operation_specs.WorkerRead):
                # A Read is either translated to a direct injection of windowed values
                # into the sdk worker, or an injection of the source object into the
                # sdk worker as data followed by an SDF that reads that source.
                if (isinstance(operation.source.source,
                               maptask_executor_runner.InMemorySource)
                        and isinstance(
                            operation.source.source.default_output_coder(),
                            WindowedValueCoder)):
                    output_stream = create_OutputStream()
                    element_coder = (operation.source.source.
                                     default_output_coder().get_impl())
                    # Re-encode the elements in the nested context and
                    # concatenate them together
                    for element in operation.source.source.read(None):
                        element_coder.encode_to_stream(element, output_stream,
                                                       True)
                    target_name = self._next_uid()
                    input_data[(transform_id,
                                target_name)] = output_stream.get()
                    fn = beam_fn_api_pb2.FunctionSpec(
                        urn=sdk_worker.DATA_INPUT_URN, id=self._next_uid())
                    if data_operation_spec:
                        fn.data.Pack(data_operation_spec)
                    inputs = {target_name: beam_fn_api_pb2.Target.List()}
                    side_inputs = {}
                else:
                    # Read the source object from the runner.
                    source_coder = beam.coders.DillCoder()
                    input_transform_id = self._next_uid()
                    output_stream = create_OutputStream()
                    source_coder.get_impl().encode_to_stream(
                        GlobalWindows.windowed_value(operation.source),
                        output_stream, True)
                    target_name = self._next_uid()
                    input_data[(input_transform_id,
                                target_name)] = output_stream.get()
                    input_ptransform = beam_fn_api_pb2.PrimitiveTransform(
                        id=input_transform_id,
                        function_spec=beam_fn_api_pb2.FunctionSpec(
                            urn=sdk_worker.DATA_INPUT_URN,
                            id=self._next_uid()),
                        # TODO(robertwb): Possible name collision.
                        step_name=stage_name + '/inject_source',
                        inputs={target_name: beam_fn_api_pb2.Target.List()},
                        outputs={
                            'out':
                            beam_fn_api_pb2.PCollection(
                                coder_reference=coder_id(source_coder))
                        })
                    if data_operation_spec:
                        input_ptransform.function_spec.data.Pack(
                            data_operation_spec)
                    transforms.append(input_ptransform)

                    # Read the elements out of the source.
                    fn = sdk_worker.pack_function_spec_data(
                        OLDE_SOURCE_SPLITTABLE_DOFN_DATA,
                        sdk_worker.PYTHON_DOFN_URN,
                        id=self._next_uid())
                    inputs = {
                        'ignored_input_tag':
                        beam_fn_api_pb2.Target.List(target=[
                            beam_fn_api_pb2.Target(
                                primitive_transform_reference=
                                input_transform_id,
                                name='out')
                        ])
                    }
                    side_inputs = {}

            elif isinstance(operation, operation_specs.WorkerDoFn):
                fn = sdk_worker.pack_function_spec_data(
                    operation.serialized_fn,
                    sdk_worker.PYTHON_DOFN_URN,
                    id=self._next_uid())
                inputs = as_target(operation.input)
                # Start from an empty mapping so that side inputs of a
                # previously translated operation do not leak into this
                # transform (and so a leading DoFn does not hit a NameError).
                side_inputs = {}
                # Store the contents of each side input for state access.
                for si in operation.side_inputs:
                    assert isinstance(si.source, iobase.BoundedSource)
                    element_coder = si.source.default_output_coder()
                    view_id = self._next_uid()
                    # TODO(robertwb): Actually flesh out the ViewFn API.
                    side_inputs[si.tag] = beam_fn_api_pb2.SideInput(
                        view_fn=sdk_worker.serialize_and_pack_py_fn(
                            element_coder,
                            urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN,
                            id=view_id))
                    # Re-encode the elements in the nested context and
                    # concatenate them together
                    output_stream = create_OutputStream()
                    for element in si.source.read(
                            si.source.get_range_tracker(None, None)):
                        element_coder.get_impl().encode_to_stream(
                            element, output_stream, True)
                    elements_data = output_stream.get()
                    state_key = beam_fn_api_pb2.StateKey.MultimapSideInput(
                        key=view_id)
                    # Replace whatever is stored under this key with the
                    # freshly encoded side input contents.
                    state_handler.Clear(state_key)
                    state_handler.Append(state_key, elements_data)

            elif isinstance(operation, operation_specs.WorkerFlatten):
                fn = sdk_worker.pack_function_spec_data(
                    operation.serialized_fn,
                    sdk_worker.IDENTITY_DOFN_URN,
                    id=self._next_uid())
                inputs = {
                    'ignored_input_tag':
                    beam_fn_api_pb2.Target.List(target=[
                        beam_fn_api_pb2.Target(
                            primitive_transform_reference=
                            transform_index_to_id[input_op_index],
                            name=output_tags(map_task[input_op_index]
                                             [1])[input_output_index]) for
                        input_op_index, input_output_index in operation.inputs
                    ])
                }
                side_inputs = {}

            else:
                raise TypeError(operation)

            ptransform = beam_fn_api_pb2.PrimitiveTransform(
                id=transform_id,
                function_spec=fn,
                step_name=stage_name,
                inputs=inputs,
                side_inputs=side_inputs,
                outputs=outputs(operation))
            transforms.append(ptransform)

        process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
            id=self._next_uid(),
            coders=coders.values(),
            primitive_transform=transforms)
        return beam_fn_api_pb2.InstructionRequest(
            instruction_id=self._next_uid(),
            register=beam_fn_api_pb2.RegisterRequest(
                process_bundle_descriptor=[process_bundle_descriptor
                                           ])), runner_sinks, input_data
Beispiel #33
0
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils.windowed_value import WindowedValue

# Allow some "pure mode" declarations.
# When the real cython module cannot be imported, install a stand-in object
# under the global name ``cython`` so that calls to ``cython.cast`` still work.
try:
    import cython
except ImportError:

    class FakeCython(object):
        """Stand-in for the ``cython`` module exposing a no-op ``cast``."""
        @staticmethod
        def cast(type, value):
            # The requested type is ignored; the value is returned unchanged.
            return value

    globals()['cython'] = FakeCython()

# A windowed value wrapping None, built by GlobalWindows.windowed_value.
_globally_windowed_value = GlobalWindows.windowed_value(None)
# The type of the first (index 0) window of that windowed value.
_global_window_type = type(_globally_windowed_value.windows[0])


class ConsumerSet(Receiver):
    """A ConsumerSet represents a graph edge between two Operation nodes.

  The ConsumerSet object collects information from the output of the
  Operation at one end of its edge and the input of the Operation at
  the other edge.
  ConsumerSet are attached to the outputting Operation.
  """
    def __init__(self, counter_factory, step_name, output_index, consumers,
                 coder):
        self.consumers = consumers
        self.opcounter = opcounters.OperationCounters(counter_factory,
Beispiel #34
0
 def set(self, ts):
   """Send a timestamp record for this object's key to the receiver.

   The record is a ``(key, {'timestamp': Timestamp})`` pair wrapped in a
   windowed value produced by ``GlobalWindows.windowed_value``.
   """
   from apache_beam.transforms.window import GlobalWindows
   payload = dict(timestamp=timestamp.Timestamp.of(ts))
   keyed_payload = (self._key, payload)
   self._receiver.receive(GlobalWindows.windowed_value(keyed_payload))
 def set(self, ts):
   """Send a timestamp record for this object's key to the receiver.

   The record is a ``(key, {'timestamp': Timestamp})`` pair wrapped in a
   windowed value produced by ``GlobalWindows.windowed_value``.
   """
   from apache_beam.transforms.window import GlobalWindows
   payload = dict(timestamp=timestamp.Timestamp.of(ts))
   keyed_payload = (self._key, payload)
   self._receiver.receive(GlobalWindows.windowed_value(keyed_payload))
Beispiel #36
0
 def process_timer(self, timer_firing):
   """Default process_timer() impl. generating KeyedWorkItem element.

   The firing is wrapped in a KeyedWorkItem keyed by the firing's encoded
   key, turned into a windowed value via GlobalWindows.windowed_value, and
   handed to process_element().
   """
   work_item = KeyedWorkItem(
       timer_firing.encoded_key, timer_firings=[timer_firing])
   self.process_element(GlobalWindows.windowed_value(work_item))
Beispiel #37
0
from apache_beam.transforms.combiners import curry_combine_fn
from apache_beam.transforms.window import GlobalWindows
from apache_beam.utils.windowed_value import WindowedValue

# Allow some "pure mode" declarations.
# When the real cython module cannot be imported, install a stand-in object
# under the global name ``cython`` so that calls to ``cython.cast`` still work.
try:
  import cython
except ImportError:
  class FakeCython(object):
    """Stand-in for the ``cython`` module exposing a no-op ``cast``."""
    @staticmethod
    def cast(type, value):
      # The requested type is ignored; the value is returned unchanged.
      return value
  globals()['cython'] = FakeCython()


# A windowed value wrapping None, built by GlobalWindows.windowed_value.
_globally_windowed_value = GlobalWindows.windowed_value(None)
# The type of the first (index 0) window of that windowed value.
_global_window_type = type(_globally_windowed_value.windows[0])


class ConsumerSet(Receiver):
  """A ConsumerSet represents a graph edge between two Operation nodes.

  The ConsumerSet object collects information from the output of the
  Operation at one end of its edge and the input of the Operation at
  the other edge.
  ConsumerSets are attached to the outputting Operation.
  """

  def __init__(
      self, counter_factory, step_name, output_index, consumers, coder):
    # NOTE(review): only the ``consumers`` assignment is visible in this
    # excerpt; how the remaining arguments are used is not shown here.
    self.consumers = consumers
Beispiel #38
0
 def finish_bundle(self):
     """Emit one empty-bytes element into the evaluator's only output.

     Returns a TransformResult carrying the single committed bundle, no
     unprocessed bundles, and None for the remaining two arguments.
     """
     assert len(self._outputs) == 1
     target_pcollection = list(self._outputs)[0]
     impulse_bundle = self._evaluation_context.create_bundle(
         target_pcollection)
     impulse_element = GlobalWindows.windowed_value(b'')
     impulse_bundle.output(impulse_element)
     return TransformResult(self, [impulse_bundle], [], None, None)
Beispiel #39
0
  def process(self, source):
    """Yield every value read from ``source``.

    Anything that is not an ``iobase.SourceBundle`` is treated as a Dataflow
    native source and read through its ``reader()`` context manager.  A
    ``SourceBundle`` is read through a range tracker spanning the bundle's
    start and stop positions.
    """
    if not isinstance(source, iobase.SourceBundle):
      # Dataflow native source
      with source.reader() as native_reader:
        for record in native_reader:
          yield record
      return

    bounded_source = source.source
    tracker = bounded_source.get_range_tracker(
        source.start_position, source.stop_position)
    for record in bounded_source.read(tracker):
      yield record


# Pickled 5-tuple: an OldeSourceSplittableDoFn instance, an empty tuple, an
# empty dict, an empty list, and a Windowing built from GlobalWindows().
# NOTE(review): presumably (fn, args, kwargs, side-input tags, windowing) in
# the layout described by the reference below -- confirm against it.
# See DataflowRunner._pardo_fn_data
OLDE_SOURCE_SPLITTABLE_DOFN_DATA = pickler.dumps(
    (OldeSourceSplittableDoFn(), (), {}, [],
     beam.transforms.core.Windowing(GlobalWindows())))


class _GroupingBuffer(object):
  """Used to accumulate groupded (shuffled) results."""
  def __init__(self, pre_grouped_coder, post_grouped_coder, windowing):
    self._key_coder = pre_grouped_coder.key_coder()
    self._pre_grouped_coder = pre_grouped_coder
    self._post_grouped_coder = post_grouped_coder
    self._table = collections.defaultdict(list)
    self._windowing = windowing

  def append(self, elements_data):
    input_stream = create_InputStream(elements_data)
    while input_stream.size() > 0:
      windowed_key_value = self._pre_grouped_coder.get_impl(
Beispiel #40
0
 def _test(self, trigger, lateness, expected):
     """Assert that ``trigger.may_lose_data`` returns ``expected``.

     The windowing under test comes from a WindowInto over GlobalWindows
     using the given trigger, ACCUMULATING accumulation mode and
     ``lateness`` as the allowed lateness.
     """
     window_transform = WindowInto(
         GlobalWindows(),
         trigger=trigger,
         accumulation_mode=AccumulationMode.ACCUMULATING,
         allowed_lateness=lateness)
     actual = trigger.may_lose_data(window_transform.windowing)
     self.assertEqual(actual, expected)
  def _map_task_registration(self, map_task, state_handler,
                             data_operation_spec):
    """Build the Fn API registration request for a map task.

    Each ``(stage_name, operation)`` pair in ``map_task`` is translated into
    one ``PrimitiveTransform``.  A read that cannot be injected directly as
    windowed values produces two transforms: a data-input transform carrying
    the encoded source object, followed by a DoFn that reads the elements out
    of that source.

    Args:
      map_task: sequence of ``(stage_name, operation)`` pairs.  Operations
        refer to their inputs as ``(operation index, output index)`` pairs
        into this sequence.
      state_handler: object whose ``Clear`` and ``Append`` methods are used
        to store the encoded contents of every side input.
      data_operation_spec: when truthy, packed into the ``data`` field of
        each data input/output function spec.

    Returns:
      A 3-tuple ``(request, runner_sinks, input_data)``.  ``request`` is an
      ``InstructionRequest`` registering a single process bundle descriptor.
      ``runner_sinks`` maps ``(transform_id, 'out')`` to the
      ``WorkerInMemoryWrite`` operation that receives that output.
      ``input_data`` maps ``(transform_id, target_name)`` to the encoded
      bytes to feed into that data-input transform.

    Raises:
      TypeError: if an operation is of an unsupported type.
    """
    input_data = {}
    runner_sinks = {}
    transforms = []
    transform_index_to_id = {}

    # Maps coders to new coder objects and references.
    coders = {}

    def coder_id(coder):
      # Register each distinct coder once and hand back its spec id.
      if coder not in coders:
        coders[coder] = beam_fn_api_pb2.Coder(
            function_spec=sdk_worker.pack_function_spec_data(
                json.dumps(coder.as_cloud_object()),
                sdk_worker.PYTHON_CODER_URN, id=self._next_uid()))

      return coders[coder].function_spec.id

    def output_tags(op):
      # Operations without explicit tags have the single output 'out'.
      return getattr(op, 'output_tags', ['out'])

    def as_target(op_input):
      # Resolve an (operation index, output index) pair to the id of the
      # already-translated producer and its output name.
      input_op_index, input_output_index = op_input
      input_op = map_task[input_op_index][1]
      return {
          'ignored_input_tag':
              beam_fn_api_pb2.Target.List(target=[
                  beam_fn_api_pb2.Target(
                      primitive_transform_reference=transform_index_to_id[
                          input_op_index],
                      name=output_tags(input_op)[input_output_index])
              ])
      }

    def outputs(op):
      return {
          tag: beam_fn_api_pb2.PCollection(coder_reference=coder_id(coder))
          for tag, coder in zip(output_tags(op), op.output_coders)
      }

    for op_ix, (stage_name, operation) in enumerate(map_task):
      transform_id = transform_index_to_id[op_ix] = self._next_uid()
      if isinstance(operation, operation_specs.WorkerInMemoryWrite):
        # Write this data back to the runner.
        fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_OUTPUT_URN,
                                          id=self._next_uid())
        if data_operation_spec:
          fn.data.Pack(data_operation_spec)
        inputs = as_target(operation.input)
        side_inputs = {}
        runner_sinks[(transform_id, 'out')] = operation

      elif isinstance(operation, operation_specs.WorkerRead):
        # A Read is either translated to a direct injection of windowed values
        # into the sdk worker, or an injection of the source object into the
        # sdk worker as data followed by an SDF that reads that source.
        if (isinstance(operation.source.source,
                       maptask_executor_runner.InMemorySource)
            and isinstance(operation.source.source.default_output_coder(),
                           WindowedValueCoder)):
          output_stream = create_OutputStream()
          element_coder = (
              operation.source.source.default_output_coder().get_impl())
          # Re-encode the elements in the nested context and
          # concatenate them together
          for element in operation.source.source.read(None):
            element_coder.encode_to_stream(element, output_stream, True)
          target_name = self._next_uid()
          input_data[(transform_id, target_name)] = output_stream.get()
          fn = beam_fn_api_pb2.FunctionSpec(urn=sdk_worker.DATA_INPUT_URN,
                                            id=self._next_uid())
          if data_operation_spec:
            fn.data.Pack(data_operation_spec)
          inputs = {target_name: beam_fn_api_pb2.Target.List()}
          side_inputs = {}
        else:
          # Read the source object from the runner.
          source_coder = beam.coders.DillCoder()
          input_transform_id = self._next_uid()
          output_stream = create_OutputStream()
          source_coder.get_impl().encode_to_stream(
              GlobalWindows.windowed_value(operation.source),
              output_stream,
              True)
          target_name = self._next_uid()
          input_data[(input_transform_id, target_name)] = output_stream.get()
          input_ptransform = beam_fn_api_pb2.PrimitiveTransform(
              id=input_transform_id,
              function_spec=beam_fn_api_pb2.FunctionSpec(
                  urn=sdk_worker.DATA_INPUT_URN,
                  id=self._next_uid()),
              # TODO(robertwb): Possible name collision.
              step_name=stage_name + '/inject_source',
              inputs={target_name: beam_fn_api_pb2.Target.List()},
              outputs={
                  'out':
                      beam_fn_api_pb2.PCollection(
                          coder_reference=coder_id(source_coder))
              })
          if data_operation_spec:
            input_ptransform.function_spec.data.Pack(data_operation_spec)
          transforms.append(input_ptransform)

          # Read the elements out of the source.
          fn = sdk_worker.pack_function_spec_data(
              OLDE_SOURCE_SPLITTABLE_DOFN_DATA,
              sdk_worker.PYTHON_DOFN_URN,
              id=self._next_uid())
          inputs = {
              'ignored_input_tag':
                  beam_fn_api_pb2.Target.List(target=[
                      beam_fn_api_pb2.Target(
                          primitive_transform_reference=input_transform_id,
                          name='out')
                  ])
          }
          side_inputs = {}

      elif isinstance(operation, operation_specs.WorkerDoFn):
        fn = sdk_worker.pack_function_spec_data(
            operation.serialized_fn,
            sdk_worker.PYTHON_DOFN_URN,
            id=self._next_uid())
        inputs = as_target(operation.input)
        # Start from an empty mapping so that side inputs of a previously
        # translated operation do not leak into this transform (and so a
        # leading DoFn does not hit a NameError).
        side_inputs = {}
        # Store the contents of each side input for state access.
        for si in operation.side_inputs:
          assert isinstance(si.source, iobase.BoundedSource)
          element_coder = si.source.default_output_coder()
          view_id = self._next_uid()
          # TODO(robertwb): Actually flesh out the ViewFn API.
          side_inputs[si.tag] = beam_fn_api_pb2.SideInput(
              view_fn=sdk_worker.serialize_and_pack_py_fn(
                  element_coder, urn=sdk_worker.PYTHON_ITERABLE_VIEWFN_URN,
                  id=view_id))
          # Re-encode the elements in the nested context and
          # concatenate them together
          output_stream = create_OutputStream()
          for element in si.source.read(
              si.source.get_range_tracker(None, None)):
            element_coder.get_impl().encode_to_stream(
                element, output_stream, True)
          elements_data = output_stream.get()
          state_key = beam_fn_api_pb2.StateKey.MultimapSideInput(key=view_id)
          # Replace whatever is stored under this key with the freshly
          # encoded side input contents.
          state_handler.Clear(state_key)
          state_handler.Append(state_key, elements_data)

      elif isinstance(operation, operation_specs.WorkerFlatten):
        fn = sdk_worker.pack_function_spec_data(
            operation.serialized_fn,
            sdk_worker.IDENTITY_DOFN_URN,
            id=self._next_uid())
        inputs = {
            'ignored_input_tag':
                beam_fn_api_pb2.Target.List(target=[
                    beam_fn_api_pb2.Target(
                        primitive_transform_reference=transform_index_to_id[
                            input_op_index],
                        name=output_tags(map_task[input_op_index][1])[
                            input_output_index])
                    for input_op_index, input_output_index in operation.inputs
                ])
        }
        side_inputs = {}

      else:
        raise TypeError(operation)

      ptransform = beam_fn_api_pb2.PrimitiveTransform(
          id=transform_id,
          function_spec=fn,
          step_name=stage_name,
          inputs=inputs,
          side_inputs=side_inputs,
          outputs=outputs(operation))
      transforms.append(ptransform)

    process_bundle_descriptor = beam_fn_api_pb2.ProcessBundleDescriptor(
        id=self._next_uid(), coders=coders.values(),
        primitive_transform=transforms)
    return beam_fn_api_pb2.InstructionRequest(
        instruction_id=self._next_uid(),
        register=beam_fn_api_pb2.RegisterRequest(
            process_bundle_descriptor=[process_bundle_descriptor
                                      ])), runner_sinks, input_data