Example #1
    def expand(self, pbegin) -> beam.PCollection[filesystem.FileMetadata]:
        # invoke periodic impulse
        impulse = pbegin | PeriodicImpulse(start_timestamp=self.start_ts,
                                           stop_timestamp=self.stop_ts,
                                           fire_interval=self.interval)

        # match file pattern periodically
        match_files = (
            impulse
            | 'GetFilePattern' >> beam.Map(lambda x: self.file_pattern)
            | MatchAll())

        # apply deduplication strategy if required
        if self.has_deduplication:
            # Making a Key Value so each file has its own state.
            match_files = match_files | 'ToKV' >> beam.Map(
                lambda x: (x.path, x))
            if self.match_upd:
                match_files = match_files | 'RemoveOldAlreadyRead' >> beam.ParDo(
                    _RemoveOldDuplicates())
            else:
                match_files = match_files | 'RemoveAlreadyRead' >> beam.ParDo(
                    _RemoveDuplicates())

        # apply windowing if required. Apply it last because deduplication
        # relies on the global window.
        if self.apply_windowing:
            match_files = match_files | beam.WindowInto(
                FixedWindows(self.interval))

        return match_files
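
The snippet above does not name its enclosing transform, but it closely resembles Beam's fileio.MatchContinuously, which re-evaluates a file pattern on a timer driven by PeriodicImpulse. The sketch below is a minimal, assumed usage of that transform; the bucket path and the 60-second interval are placeholder values, and the exact constructor arguments may vary between Beam versions.

import apache_beam as beam
from apache_beam.io import fileio

with beam.Pipeline() as p:
    file_contents = (
        p
        # Re-evaluate the (placeholder) pattern every 60 seconds.
        | fileio.MatchContinuously('gs://my-bucket/incoming/*.json', interval=60)
        # Open each matched file and read its contents as UTF-8 text.
        | fileio.ReadMatches()
        | beam.Map(lambda readable_file: readable_file.read_utf8()))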
Example #2
def side_input_slow_update(
    src_file_pattern,
    first_timestamp,
    last_timestamp,
    interval,
    sample_main_input_elements,
    main_input_windowing_interval):
  # [START SideInputSlowUpdateSnip1]
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.transforms import window
  from apache_beam.transforms.periodicsequence import PeriodicImpulse
  from apache_beam.transforms.window import TimestampedValue

  # from apache_beam.utils.timestamp import MAX_TIMESTAMP
  # last_timestamp = MAX_TIMESTAMP to go on indefinitely

  # Any user-defined function.
  # cross join is used as an example.
  def cross_join(left, rights):
    for x in rights:
      yield (left, x)

  # Create pipeline.
  pipeline_options = PipelineOptions()
  p = beam.Pipeline(options=pipeline_options)
  side_input = (
      p
      | 'PeriodicImpulse' >> PeriodicImpulse(
          first_timestamp, last_timestamp, interval, True)
      | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
      | 'ReadFromFile' >> beam.io.ReadAllFromText())

  main_input = (
      p
      | 'MpImpulse' >> beam.Create(sample_main_input_elements)
      | 'MapMpToTimestamped' >> beam.Map(
          lambda src: TimestampedValue(src, src))
      | 'WindowMpInto' >> beam.WindowInto(
          window.FixedWindows(main_input_windowing_interval)))

  result = (
      main_input
      | 'ApplyCrossJoin' >> beam.FlatMap(
          cross_join, rights=beam.pvalue.AsIter(side_input)))
  # [END SideInputSlowUpdateSnip1]

  return p, result
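
As a hedged usage sketch, the call below shows how side_input_slow_update might be invoked and the returned pipeline run; every argument value is a placeholder chosen only for illustration.

from apache_beam.utils.timestamp import MAX_TIMESTAMP

# Placeholder arguments; the file pattern and intervals are illustrative only.
p, result = side_input_slow_update(
    src_file_pattern='gs://my-bucket/side_input/file_',
    first_timestamp=0,
    last_timestamp=MAX_TIMESTAMP,   # keep refreshing the side input indefinitely
    interval=300,                   # emit a new impulse every 5 minutes
    sample_main_input_elements=range(0, 600, 60),
    main_input_windowing_interval=60)
p.run().wait_until_finish()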
Example #3
  def test_periodicimpulse_default_start(self):
    default_parameters = inspect.signature(PeriodicImpulse).parameters
    it = default_parameters["start_timestamp"].default
    duration = 1
    et = it + duration
    interval = 0.5

    # Check that the default `stop_timestamp` has the same type as `start_timestamp`
    is_same_type = isinstance(
        it, type(default_parameters["stop_timestamp"].default))
    error = "'start_timestamp' and 'stop_timestamp' have different types"
    assert is_same_type, error

    with TestPipeline() as p:
      result = p | 'PeriodicImpulse' >> PeriodicImpulse(it, et, interval)

      k = [it + x * interval for x in range(0, int(duration / interval))]
      assert_that(result, equal_to(k))
Example #4
    def expand(self, pcoll):
        impulse = pcoll | PeriodicImpulse(start_timestamp=self.start_ts,
                                          stop_timestamp=self.stop_ts,
                                          fire_interval=self.interval)

        match_files = (
            impulse
            | 'GetFilePattern' >> beam.Map(lambda x: self.file_pattern)
            | MatchAll())

        if self.has_deduplication:
            match_files = (
                match_files
                # Making a Key Value so each file has its own state.
                | 'ToKV' >> beam.Map(lambda x: (x.path, x))
                | 'RemoveAlreadyRead' >> beam.ParDo(_RemoveDuplicates()))

        return match_files
Example #5
    def test_periodicimpulse_windowing_on_si(self):
        start_offset = -15
        it = time.time() + start_offset
        duration = 15
        et = it + duration
        interval = 5

        with TestPipeline() as p:
            si = (
                p
                | 'PeriodicImpulse' >> PeriodicImpulse(it, et, interval, True)
                | 'AddKey' >> beam.Map(lambda v: ('key', v))
                | 'GBK' >> beam.GroupByKey()
                | 'SortGBK' >> beam.MapTuple(lambda k, vs: (k, sorted(vs))))

            actual = si
            k = [('key', [it + x * interval])
                 for x in range(0, int(duration / interval), 1)]
            assert_that(actual, equal_to(k))