def test_json_row_serialization_deserialization_schema(self):
    jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"
    ]
    expected_jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
        "\"ids\":[1,2,3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"
    ]

    row_schema = Types.ROW_NAMED(["svt", "ops", "ids"],
                                 [Types.STRING(),
                                  Types.ROW_NAMED(['id'], [Types.STRING()]),
                                  Types.PRIMITIVE_ARRAY(Types.INT())])

    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(row_schema).build()
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(row_schema).build()

    for i in range(len(jsons)):
        j_row = json_row_deserialization_schema._j_deserialization_schema \
            .deserialize(bytes(jsons[i], encoding='utf-8'))
        result = str(json_row_serialization_schema._j_serialization_schema
                     .serialize(j_row), encoding='utf-8')
        self.assertEqual(expected_jsons[i], result)

def test_row_type(self):
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()]).get_field_names(),
                     ['f0', 'f1'])
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                 ['a', 'b']).get_field_names(), ['a', 'b'])

    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                     RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']), True)
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                     RowTypeInfo([Types.STRING(), Types.INT()], ['a', 'b']), False)
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                 ['a', 'b']).__str__(),
                     "RowTypeInfo(a: String, b: String)")

    self.assertEqual(Types.ROW([Types.STRING(), Types.STRING()]),
                     RowTypeInfo([Types.STRING(), Types.STRING()]), True)
    self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                     .get_field_names(), ['a', 'b'], True)
    self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                     .get_field_types(), [Types.STRING(), Types.STRING()], True)

def test_from_data_stream_with_schema(self):
    from pyflink.table import Schema

    ds = self.env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW_NAMED(
            ["a", "b", "c"],
            [Types.INT(), Types.STRING(), Types.STRING()]))

    table = self.t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column("a", DataTypes.INT())
              .column("b", DataTypes.STRING())
              .column("c", DataTypes.STRING())
              .build())
    result = table.execute()
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [item for item in
                           map(str, [Row(1, 'Hi', 'Hello'), Row(2, 'Hello', 'Hi')])]
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)

def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and normal records,
    # is processed by the same worker and the collected results stay in order, which makes
    # the assertions easier.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(type_info).build()
    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema, kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(watermark_strategy)
    ds.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")

def popular_taxi_vendor():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)
    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))

    taxi_ride = t_env.from_path('TaxiRide')
    popular_rides = taxi_ride.select(taxi_ride.vendorId, taxi_ride.pickupTime) \
        .window(Slide.over('15.minutes').every('5.minutes').on(taxi_ride.pickupTime).alias('w')) \
        .group_by(taxi_ride.vendorId, col('w')) \
        .select(taxi_ride.vendorId,
                col('w').start.alias('start'),
                col('w').end.alias('end'),
                taxi_ride.vendorId.count.alias('cnt'))

    t_env.to_append_stream(
        popular_rides,
        Types.ROW_NAMED(['vendorId', 'start', 'end', 'cnt'],
                        [Types.INT(), Types.SQL_TIMESTAMP(), Types.SQL_TIMESTAMP(),
                         Types.LONG()])).print()

    env.execute('Popular-Taxi-Vendor')

def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('bytes', DataTypes.BYTES()),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint', 'double', 'date',
         'timestamp'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(),
         Types.BIG_DEC(), Types.INT(), Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(),
         Types.SQL_TIMESTAMP()]
    )
    data = [Row(
        char='char',
        varchar='varchar',
        bytes=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=date(1970, 1, 1),
        timestamp=datetime(1970, 1, 2, 3, 4, 5, 600000),
    )]
    return row_type, row_type_info, data

def max_travellers_per_destination():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)
    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR dropOffTime AS dropOffTime - INTERVAL '30' SECONDS"))

    taxi_ride = t_env.from_path('TaxiRide')
    no_of_travelers_per_dest = taxi_ride \
        .select(taxi_ride.passengerCount, taxi_ride.dropOffTime, taxi_ride.destLocationZone) \
        .window(Tumble.over('1.hour').on(taxi_ride.dropOffTime).alias('w')) \
        .group_by(taxi_ride.destLocationZone, col('w')) \
        .select(taxi_ride.destLocationZone,
                col('w').start.alias('start'),
                col('w').end.alias('end'),
                taxi_ride.passengerCount.count.alias('cnt'))

    t_env.to_append_stream(
        no_of_travelers_per_dest,
        Types.ROW_NAMED(['destLocationZone', 'start', 'end', 'cnt'],
                        [Types.STRING(), Types.SQL_TIMESTAMP(), Types.SQL_TIMESTAMP(),
                         Types.LONG()])).print()

    env.execute('Max-Travellers-Per-Destination')

def _create_parquet_map_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('map', DataTypes.MAP(DataTypes.INT(), DataTypes.STRING())),
    ])
    row_type_info = Types.ROW_NAMED(['map'], [Types.MAP(Types.INT(), Types.STRING())])
    data = [Row(
        map={0: 'a', 1: 'b', 2: 'c'}
    )]
    return row_type, row_type_info, data

def test_from_and_to_data_stream_event_time(self):
    from pyflink.table import Schema

    ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                  Types.ROW_NAMED(
                                      ["a", "b", "c"],
                                      [Types.LONG(), Types.INT(), Types.STRING()]))
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
                         .with_timestamp_assigner(MyTimestampAssigner()))

    table = self.t_env.from_data_stream(ds,
                                        Schema.new_builder()
                                              .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                              .watermark("rowtime", "SOURCE_WATERMARK()")
                                              .build())
    self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                     table._j_table.getResolvedSchema().toString())

    self.t_env.create_temporary_view("t", ds,
                                     Schema.new_builder()
                                           .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                           .watermark("rowtime", "SOURCE_WATERMARK()")
                                           .build())
    result = self.t_env.execute_sql("SELECT "
                                    "c, SUM(b) "
                                    "FROM t "
                                    "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [item for item in
                           map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)

    ds = self.t_env.to_data_stream(table)
    ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)

def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('string_array', DataTypes.ARRAY(DataTypes.STRING())),
        DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT())),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    conversion_row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.OBJECT_ARRAY(Types.STRING()),
        Types.OBJECT_ARRAY(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, conversion_row_type_info, data

def _create_parquet_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('binary', DataTypes.BINARY(10)),
        DataTypes.FIELD('varbinary', DataTypes.VARBINARY(10)),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('time', DataTypes.TIME().bridged_to('java.sql.Time')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
        DataTypes.FIELD('timestamp_ltz', DataTypes.TIMESTAMP_LTZ(3)),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'binary', 'varbinary', 'boolean', 'decimal', 'int', 'bigint',
         'double', 'date', 'time', 'timestamp', 'timestamp_ltz'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()),
         Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(), Types.BIG_DEC(), Types.INT(),
         Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(), Types.SQL_TIME(),
         Types.SQL_TIMESTAMP(), Types.INSTANT()]
    )
    datetime_ltz = datetime.datetime(1970, 2, 3, 4, 5, 6, 700000, tzinfo=pytz.timezone('UTC'))
    timestamp_ltz = Instant.of_epoch_milli(
        (
            calendar.timegm(datetime_ltz.utctimetuple()) +
            calendar.timegm(time.localtime(0))
        ) * 1000 + datetime_ltz.microsecond // 1000
    )
    data = [Row(
        char='char',
        varchar='varchar',
        binary=b'binary',
        varbinary=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=datetime.date(1970, 1, 1),
        time=datetime.time(1, 1, 1),
        timestamp=datetime.datetime(1970, 1, 2, 3, 4, 5, 600000),
        timestamp_ltz=timestamp_ltz
    )]
    return row_type, row_type_info, data

def popular_destination_query():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)
    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))

    query = f"""SELECT
        destLocationId, wstart, wend, cnt
    FROM
        (SELECT
            destLocationId,
            HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart,
            HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend,
            COUNT(destLocationId) AS cnt
        FROM
            (SELECT
                pickupTime,
                destLocationId
            FROM TaxiRide)
        GROUP BY
            destLocationId,
            HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE))
    WHERE cnt > {args.threshold}
    """
    results = t_env.sql_query(query)

    t_env.to_append_stream(
        results,
        Types.ROW_NAMED(['destLocationId', 'wstart', 'wend', 'cnt'],
                        [Types.INT(), Types.SQL_TIMESTAMP(), Types.SQL_TIMESTAMP(),
                         Types.LONG()])).print()

    env.execute('Popular-Destination')

def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD(
            'string_array',
            DataTypes.ARRAY(DataTypes.STRING()).bridged_to('java.util.ArrayList')
        ),
        DataTypes.FIELD(
            'int_array',
            DataTypes.ARRAY(DataTypes.INT()).bridged_to('java.util.ArrayList')
        ),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, data

import json

from pyflink.common.serialization import SimpleStringSchema, SimpleStringEncoder, \
    JsonRowDeserializationSchema
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, StreamingFileSink
from pyflink.common.typeinfo import Types
from pyflink.datastream.functions import MapFunction

s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_parallelism(1)

ti = Types.ROW_NAMED(
    ["app", "busi", "date", "ip"],
    [Types.STRING(), Types.STRING(), Types.BIG_INT(), Types.STRING()])
builder = JsonRowDeserializationSchema.builder()
builder.type_info(ti)
jrds = builder.ignore_parse_errors().build()

fkc = FlinkKafkaConsumer(topics="ULS-BUSI-LOG-dev",
                         deserialization_schema=jrds,
                         properties={
                             "bootstrap.servers": "10.100.1.16:9192",
                             "group.id": "123",
                             "auto.offset.reset": "earliest"
                         })
fkc.set_start_from_earliest()

# Convert each deserialized Row to its string representation before writing it out,
# since the row-format file sink below encodes plain strings.
src = s_env.add_source(fkc).map(lambda x: str(x), output_type=Types.STRING())
src.add_sink(
    StreamingFileSink.for_row_format('C:\\tmp\\pyoutput', SimpleStringEncoder()).build())

# The job only starts once the environment is executed.
s_env.execute("kafka-to-file")
class ProductJob(FlinkJob):

    job_name = "wish:product"
    kafka_addr = "10.0.9.5:9092"
    wish_data_topic = "wish-product-data"
    wish_result_topic = "wish-product-result-data"
    wish_shop_result_topic = "wish-shop-result-data"

    tables = [
        f"""
        CREATE TABLE wish_product_data (
            `timestamp` INT,
            `pid` STRING,
            `merchant_id` STRING,
            `merchant_name` STRING,
            `shop_name` STRING,
            `review_number` INT,
            `review_score` FLOAT,
            `shop_review_number` INT,
            `title` STRING,
            `is_pb` BOOLEAN,
            `is_hwc` BOOLEAN,
            `is_verified` BOOLEAN,
            `total_bought` INT,
            `total_wishlist` INT,
            `tags` ARRAY<STRING>,
            `category_ids` ARRAY<STRING>,
            `category_paths` ARRAY<STRING>,
            `category_l1_ids` ARRAY<STRING>,
            `category_l2_ids` ARRAY<STRING>,
            `category_l3_ids` ARRAY<STRING>,
            `leaf_category_ids` ARRAY<STRING>,
            `price` FLOAT,
            `shipping_price` FLOAT,
            `sold` FLOAT,
            `update_time` TIMESTAMP(0),
            `shop_open_time` TIMESTAMP(0),
            `gen_time` TIMESTAMP(0),
            `data_update_time` TIMESTAMP(0)
        ) WITH (
            'connector' = 'kafka',
            'topic' = '{wish_data_topic}',
            'properties.bootstrap.servers' = '{kafka_addr}',
            'properties.group.id' = 'flink_bigdata',
            'scan.startup.mode' = 'group-offsets',
            'format' = 'json'
        )
        """
    ]

    result_type = Types.ROW_NAMED(
        ["infos"],
        [Types.STRING()]
    )

    shop_result_type = Types.ROW_NAMED(
        ["shop_name", "shop_agg"],
        [Types.STRING(), Types.STRING()]
    )

    def process(self) -> None:
        calculate = udaf(CalculateAgg())
        shop_agg = udaf(ShopAgg())
        table = self.table_env.from_path("wish_product_data")

        result_table = table.group_by(table.pid) \
            .select(calculate(table.timestamp, table.pid, table.merchant_id,
                              table.merchant_name, table.shop_name, table.review_number,
                              table.review_score, table.shop_review_number, table.title,
                              table.is_pb, table.is_hwc, table.is_verified,
                              table.total_bought, table.total_wishlist, table.tags,
                              table.category_ids, table.category_paths,
                              table.category_l1_ids, table.category_l2_ids,
                              table.category_l3_ids, table.leaf_category_ids,
                              table.price, table.shipping_price, table.sold,
                              table.update_time, table.shop_open_time,
                              table.gen_time, table.data_update_time))
        # calculate(table.timestamp, table.update_time, table.sold, table.price,
        #           table.review_number)

        shop_table = table.select(table.update_time, table.shop_name)
        shop_table = shop_table.add_columns("1 as count")
        shop_result_table = shop_table.group_by(shop_table.shop_name) \
            .select(shop_table.shop_name, shop_agg(shop_table.update_time, shop_table.count))

        self.sink_to_kafka(self.wish_result_topic, result_table, self.result_type)
        self.sink_to_kafka(self.wish_shop_result_topic, shop_result_table, self.shop_result_type)