Beispiel #1
0
def run(argv=None):
    """Parse the command-line flags, then assemble and execute the pipeline.

    The pipeline reads messages from the Pub/Sub subscription given by
    ``--input_subscription``, expands each message with ``find_msg``, assigns
    the results to ``FixedWindows(60)``, combines each window's elements with
    ``ToListCombineFn``, passes the combined value through ``FormDoFn`` and
    writes the output to the BigQuery table given by ``--output_table``.

    Args:
        argv: Command-line arguments to parse. Both ``--input_subscription``
            and ``--output_table`` are required; any argument not declared
            here is forwarded to ``beam.Pipeline`` untouched.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input_subscription',
        required=True,
        help=('Input PubSub subscription of the form '
              '"projects/<project>/subscriptions/<subscription_name>".'))
    arg_parser.add_argument(
        '--output_table',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    # Unrecognised flags are not an error: they become the pipeline's options.
    options, beam_args = arg_parser.parse_known_args(argv)

    with beam.Pipeline(argv=beam_args) as pipeline:
        # Source: the messages arriving on the configured subscription.
        messages = pipeline | beam.io.ReadFromPubSub(
            subscription=options.input_subscription)

        # Each stage is named separately so the data flow reads top to bottom.
        split = messages | 'Split' >> beam.FlatMap(find_msg)
        windowed = split | 'window' >> beam.WindowInto(
            window.FixedWindows(60))
        # NOTE(review): without_defaults() is presumably needed because the
        # combine runs on non-global windows -- confirm against the Beam docs.
        combined = windowed | 'append' >> beam.CombineGlobally(
            ToListCombineFn()).without_defaults()
        formatted = combined | 'Format' >> beam.ParDo(FormDoFn())

        # Sink: the formatted rows go to the requested BigQuery table.
        formatted | 'Write' >> beam.io.WriteToBigQuery(options.output_table)
Beispiel #2
0
    class SimpleTestStatefulDoFn(DoFn):
      """Stateful DoFn that buffers values and emits them when a timer fires.

      ``process`` adds the value of every ``(key, value)`` element to a
      combining state cell and sets the expiry timer; the timer callback
      yields the buffered values, sorted and concatenated into one string.
      """

      # Combining state named 'buffer': values are accumulated through
      # ToListCombineFn and encoded with IterableCoder(VarIntCoder()).
      BUFFER_STATE = CombiningValueStateSpec(
          'buffer', IterableCoder(VarIntCoder()), ToListCombineFn())
      # Timer named 'expiry1' in the watermark time domain.
      EXPIRY_TIMER = TimerSpec('expiry1', TimeDomain.WATERMARK)

      def process(self, element, buffer=DoFn.StateParam(BUFFER_STATE),
                  timer1=DoFn.TimerParam(EXPIRY_TIMER)):
        """Buffer the element's value and set the expiry timer.

        Args:
          element: A two-item ``(key, value)`` pair; only the value is used.
          buffer: State handle bound to ``BUFFER_STATE``.
          timer1: Timer handle bound to ``EXPIRY_TIMER``.
        """
        _, item = element
        buffer.add(item)
        # NOTE(review): 20 is presumably an event-time timestamp -- confirm.
        timer1.set(20)

      @on_timer(EXPIRY_TIMER)
      def expiry_callback(self, buffer=DoFn.StateParam(BUFFER_STATE),
                          timer=DoFn.TimerParam(EXPIRY_TIMER)):
        """Yield the buffered values, sorted, joined into a single string.

        Args:
          buffer: State handle bound to ``BUFFER_STATE``.
          timer: Timer handle bound to ``EXPIRY_TIMER``; not used here.
        """
        ordered = sorted(buffer.read())
        yield ''.join(map(str, ordered))