def test_expand_kafka_write(self):
  # We just test the expansion but do not execute.
  # pylint: disable=expression-not-assigned
  (
      self.create_pipeline()
      | Impulse()
      | Map(lambda input: (1, input))
      | WriteToKafka(
          producer_config={
              'bootstrap.servers': 'localhost:9092, notvalid2:3531'
          },
          topic='topic1',
          key_serializer='org.apache.kafka.common.serialization.'
          'LongSerializer',
          value_serializer='org.apache.kafka.common.serialization.'
          'ByteArraySerializer',
          expansion_service=self.get_expansion_service()))
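# A matching expansion-only test for the read side would follow the same
# pattern. This is a sketch under the same test-class assumptions
# (create_pipeline and get_expansion_service as above); the topic names and
# the deserializer pairing here are illustrative, not the suite's actual test.
def test_expand_kafka_read(self):
  # We just test the expansion but do not execute.
  # pylint: disable=expression-not-assigned
  (
      self.create_pipeline()
      | ReadFromKafka(
          consumer_config={
              'bootstrap.servers': 'localhost:9092, notvalid2:3531'
          },
          topics=['topic1', 'topic2'],
          key_deserializer='org.apache.kafka.common.serialization.'
          'ByteArrayDeserializer',
          value_deserializer='org.apache.kafka.common.serialization.'
          'LongDeserializer',
          expansion_service=self.get_expansion_service()))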
import logging
import typing

import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka, WriteToKafka
from apache_beam.options.pipeline_options import PipelineOptions


def run(bootstrap_servers, topic, pipeline_args):
  # Example arguments:
  # bootstrap_servers = '123.45.67.89:9092'
  # topic = 'kafka_taxirides_realtime'
  # pipeline_args = ['--project', 'my-project',
  #                  '--runner', 'DataflowRunner',
  #                  '--temp_location', 'my-temp-location',
  #                  '--region', 'my-region',
  #                  '--num_workers', 'my-num-workers',
  #                  '--experiments', 'use_runner_v2']
  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)
  window_size = 15  # size of the window in seconds.

  def log_ride(ride_bytes):
    # Convert the bytes record from Kafka to a dictionary.
    import ast
    ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
    logging.info(
        'Found ride at latitude %r and longitude %r with %r passengers',
        ride['latitude'], ride['longitude'], ride['passenger_count'])

  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime'
        ).with_output_types(bytes)
        # The Kafka write transform expects KVs.
        | beam.Map(lambda x: (b'', x)).with_output_types(
            typing.Tuple[bytes, bytes])
        | beam.WindowInto(beam.window.FixedWindows(window_size))
        | WriteToKafka(
            producer_config={'bootstrap.servers': bootstrap_servers},
            topic=topic))

    _ = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrap_servers},
            topics=[topic])
        | beam.FlatMap(lambda kv: log_ride(kv[1])))
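# A minimal entry point for run() might look like the sketch below; the flag
# names --bootstrap_servers and --topic are assumptions for illustration.
import argparse

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--bootstrap_servers',
      required=True,
      help='Kafka bootstrap servers, e.g. "localhost:9092"')
  parser.add_argument('--topic', default='kafka_taxirides_realtime')
  known_args, pipeline_args = parser.parse_known_args()
  run(known_args.bootstrap_servers, known_args.topic, pipeline_args)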
import sys

import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka, WriteToKafka
from apache_beam.options.pipeline_options import PipelineOptions

# usage(), get_expansion_service(), toKv() and SportTrackerCalc are defined
# elsewhere in this example (see the sketch after this snippet for plausible
# helpers).


def toPositions(x, stamp=beam.DoFn.TimestampParam):
  # Decode a Kafka KV into (key, (latitude, longitude, timestamp_millis)).
  return (
      x[0].decode("utf-8"),
      tuple(map(float, x[1].decode("utf-8").split(" "))) +
      (stamp.micros / 1000., ))


if __name__ == "__main__":
  if len(sys.argv) < 4:
    usage()
  bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]
  with beam.Pipeline(
      options=PipelineOptions(["--streaming"] + sys.argv[4:])) as p:
    (p
     | ReadFromKafka(
         consumer_config={'bootstrap.servers': bootstrapServer},
         topics=[inputTopic],
         timestamp_policy=ReadFromKafka.create_time_policy,
         expansion_service=get_expansion_service())
     | "ToPositions" >> beam.Map(toPositions)
     | "SportTracker" >> SportTrackerCalc()
     | "ToKv" >> beam.Map(toKv)
     | "StoreOutput" >> WriteToKafka(
         producer_config={'bootstrap.servers': bootstrapServer},
         topic=outputTopic,
         key_serializer="org.apache.kafka.common.serialization."
         "StringSerializer",
         value_serializer="org.apache.kafka.common.serialization."
         "StringSerializer",
         expansion_service=get_expansion_service()))
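# A minimal sketch of the helpers the snippet above assumes. The bodies here
# are assumptions, not the example's actual code; only
# default_io_expansion_service() is a real apache_beam.io.kafka entry point.
import sys

from apache_beam.io.kafka import default_io_expansion_service


def usage():
  # Hypothetical helper: print the expected arguments and exit.
  sys.stderr.write(
      "Usage: <bootstrapServer> <inputTopic> <outputTopic> [pipeline args]\n")
  sys.exit(1)


def get_expansion_service():
  # Hypothetical helper: use the SDK's bundled Kafka expansion service.
  return default_io_expansion_service()


def toKv(s: str):
  # Encode a string result as a KV of bytes for WriteToKafka
  # (mirrors the toKv in the longest-word example below).
  return ("".encode("utf-8"), s.encode("utf-8"))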
import sys

import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka, WriteToKafka
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import trigger, window


class ComputeLongestWord(beam.PTransform):
  # The original snippet begins mid-pipeline; the class wrapper and the
  # word-splitting step here are a reconstruction. The windowing, trigger,
  # and Top steps are from the original.
  def expand(self, lines):
    return (
        lines
        | "SplitWords" >> beam.FlatMap(lambda line: line.split())
        | beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
            allowed_lateness=window.Duration.of(0))
        | "MaxLength" >> beam.combiners.Top.Of(1, key=len).without_defaults()
        | "Flatten" >> beam.FlatMap(lambda x: x))


def toKv(s: str) -> beam.typehints.KV[bytes, bytes]:
  return ("".encode("utf-8"), s.encode("utf-8"))


if __name__ == "__main__":
  # usage() and get_expansion_service() are shared helpers
  # (see the sketch in the previous example).
  if len(sys.argv) < 4:
    usage()
  bootstrapServer, inputTopic, outputTopic = sys.argv[1:4]
  with beam.Pipeline(
      options=PipelineOptions(["--streaming"] + sys.argv[4:])) as p:
    (p
     | ReadFromKafka(
         consumer_config={'bootstrap.servers': bootstrapServer},
         topics=[inputTopic],
         expansion_service=get_expansion_service())
     | "ToLines" >> beam.Map(
         lambda x: "%s %s" % (x[0].decode("utf-8"), x[1].decode("utf-8")))
     | "ComputeLongestWord" >> ComputeLongestWord()
     | beam.Map(toKv)
     | "StoreOutput" >> WriteToKafka(
         producer_config={'bootstrap.servers': bootstrapServer},
         topic=outputTopic,
         expansion_service=get_expansion_service()))
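# For intuition about the accumulating early trigger, here is a small sketch
# (my addition, assuming the reconstructed ComputeLongestWord above and the
# DirectRunner's TestStream support): each early firing re-emits the longest
# word seen so far.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.test_stream import TestStream

stream = (
    TestStream()
    .add_elements(['one two three'])
    .add_elements(['vociferous'])
    .advance_watermark_to_infinity())

with beam.Pipeline(options=PipelineOptions(["--streaming"])) as p:
  (p
   | stream
   | ComputeLongestWord()
   # Accumulating early firings may print 'three' first, then 'vociferous'
   # once the longer word arrives (exact firing boundaries are not
   # guaranteed).
   | beam.Map(print))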
import logging
import typing

import apache_beam as beam
from apache_beam.io.kafka import ReadFromKafka, WriteToKafka


def run(
    bootstrap_servers,
    topic,
    with_metadata,
    bq_dataset,
    bq_table_name,
    project,
    pipeline_options):
  # Example arguments:
  # bootstrap_servers = '123.45.67.89:9092'
  # topic = 'kafka_taxirides_realtime'
  # pipeline_options built from args such as:
  #   ['--project', 'my-project',
  #    '--runner', 'DataflowRunner',
  #    '--temp_location', 'my-temp-location',
  #    '--region', 'my-region',
  #    '--num_workers', 'my-num-workers',
  #    '--experiments', 'use_runner_v2']
  window_size = 15  # size of the window in seconds.

  def log_ride(ride):
    if 'timestamp' in ride:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r passengers '
          'at timestamp %r', ride['latitude'], ride['longitude'],
          ride['passenger_count'], ride['timestamp'])
    else:
      logging.info(
          'Found ride at latitude %r and longitude %r with %r passengers',
          ride['latitude'], ride['longitude'], ride['passenger_count'])

  def convert_kafka_record_to_dictionary(record):
    # The records have a 'value' attribute when --with_metadata is given.
    if hasattr(record, 'value'):
      ride_bytes = record.value
    elif isinstance(record, tuple):
      ride_bytes = record[1]
    else:
      raise RuntimeError('unknown record type: %s' % type(record))
    # Convert the bytes record from Kafka to a dictionary.
    import ast
    ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
    output = {
        key: ride[key]
        for key in ['latitude', 'longitude', 'passenger_count']
    }
    if hasattr(record, 'timestamp'):
      # The timestamp is read from the Kafka metadata.
      output['timestamp'] = record.timestamp
    return output

  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime'
        ).with_output_types(bytes)
        # The Kafka write transform expects KVs.
        | beam.Map(lambda x: (b'', x)).with_output_types(
            typing.Tuple[bytes, bytes])
        | beam.WindowInto(beam.window.FixedWindows(window_size))
        | WriteToKafka(
            producer_config={'bootstrap.servers': bootstrap_servers},
            topic=topic))

    ride_col = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': bootstrap_servers},
            topics=[topic],
            with_metadata=with_metadata)
        | beam.Map(convert_kafka_record_to_dictionary))

    if bq_dataset:
      schema = 'latitude:STRING,longitude:STRING,passenger_count:INTEGER'
      if with_metadata:
        schema += ',timestamp:STRING'
      _ = (
          ride_col
          | beam.io.WriteToBigQuery(
              bq_table_name, bq_dataset, project, schema))
    else:
      _ = ride_col | beam.FlatMap(lambda ride: log_ride(ride))
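# To see why convert_kafka_record_to_dictionary branches on the record shape,
# here is a small sketch (my addition, assuming the helper were module-level;
# the namedtuple only simulates the attribute-style record that
# with_metadata=True yields, it is not Beam's actual metadata type).
from collections import namedtuple

ride_bytes = b"{'latitude': 40.7, 'longitude': -74.0, 'passenger_count': 2}"

# Without metadata, ReadFromKafka yields plain (key, value) tuples.
print(convert_kafka_record_to_dictionary((b'', ride_bytes)))
# {'latitude': 40.7, 'longitude': -74.0, 'passenger_count': 2}

# With metadata, records carry attributes such as value and timestamp. Note
# that a namedtuple is also a tuple, so the hasattr check must come first.
FakeRecord = namedtuple('FakeRecord', ['value', 'timestamp'])
print(convert_kafka_record_to_dictionary(FakeRecord(ride_bytes, 1700000000000)))
# {'latitude': 40.7, 'longitude': -74.0, 'passenger_count': 2,
#  'timestamp': 1700000000000}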