def run(argv=None):
    """Parse the command-line flags, then build and execute the pipeline.

    Two flags are required: ``--input_subscription`` (the PubSub
    subscription to read from) and ``--output_table`` (the BigQuery table
    to write to). Any argument the parser does not recognise is handed to
    ``beam.Pipeline`` unchanged.

    Args:
        argv: Argument list to parse. ``None`` lets ``argparse`` fall back
            to ``sys.argv[1:]``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input_subscription',
        required=True,
        help='Input PubSub subscription of the form '
        '"projects/<project>/subscriptions/<subscription_name>".')
    arg_parser.add_argument(
        '--output_table',
        required=True,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE '
              'or DATASET.TABLE.'))
    known_args, pipeline_args = arg_parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        # Source: raw messages pulled from the configured subscription.
        messages = pipeline | beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription)

        # Each message may expand to zero or more records via find_msg.
        records = messages | 'Split' >> beam.FlatMap(find_msg)

        # Fixed windows of size 60 (seconds, per the Beam FixedWindows API).
        windowed = records | 'window' >> beam.WindowInto(
            window.FixedWindows(60))

        # Gather every record of a window into one combined value;
        # without_defaults() is required for a non-global-window combine.
        batched = windowed | 'append' >> beam.CombineGlobally(
            ToListCombineFn()).without_defaults()

        formatted = batched | 'Format' >> beam.ParDo(FormDoFn())

        # Sink: the formatted results go to the requested BigQuery table.
        _ = formatted | 'Write' >> beam.io.WriteToBigQuery(
            known_args.output_table)
class SimpleTestStatefulDoFn(DoFn):
    """Stateful DoFn that buffers values and emits them when a timer fires.

    ``process`` adds the value of every ``(key, value)`` element to a
    combining state cell and (re)sets a watermark-domain timer. The timer
    callback yields a single string: the buffered values, sorted, each
    converted with ``str`` and concatenated without a separator.
    """

    # Combining state named 'buffer'; accumulates values with
    # ToListCombineFn and encodes them as an iterable of varints.
    BUFFER_STATE = CombiningValueStateSpec(
        'buffer', IterableCoder(VarIntCoder()), ToListCombineFn())
    # Timer named 'expiry1' in the watermark time domain.
    EXPIRY_TIMER = TimerSpec('expiry1', TimeDomain.WATERMARK)

    def process(
            self,
            element,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer1=DoFn.TimerParam(EXPIRY_TIMER)):
        """Buffer the element's value and arm the expiry timer.

        Args:
            element: A two-item ``(key, value)`` pair; the key is ignored.
            buffer: State handle bound to ``BUFFER_STATE``.
            timer1: Timer handle bound to ``EXPIRY_TIMER``.
        """
        _key, payload = element
        buffer.add(payload)
        # NOTE(review): 20 is presumably an absolute event-time timestamp;
        # confirm against the Beam timer API.
        timer1.set(20)

    @on_timer(EXPIRY_TIMER)
    def expiry_callback(
            self,
            buffer=DoFn.StateParam(BUFFER_STATE),
            timer=DoFn.TimerParam(EXPIRY_TIMER)):
        """Yield the sorted buffered values joined into one string.

        Args:
            buffer: State handle bound to ``BUFFER_STATE``.
            timer: Timer handle bound to ``EXPIRY_TIMER`` (unused here).
        """
        ordered = sorted(buffer.read())
        yield ''.join(map(str, ordered))