def test_json_row_serialization_deserialization_schema(self):
        jsons = [
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
            "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"
        ]
        expected_jsons = [
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
            "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
            "\"ids\":[1,2,3]}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"
        ]

        row_schema = Types.ROW_NAMED(["svt", "ops", "ids"], [
            Types.STRING(),
            Types.ROW_NAMED(['id'], [Types.STRING()]),
            Types.PRIMITIVE_ARRAY(Types.INT())
        ])

        json_row_serialization_schema = JsonRowSerializationSchema.builder() \
            .with_type_info(row_schema).build()
        json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(row_schema).build()

        for i in range(len(jsons)):
            j_row = json_row_deserialization_schema._j_deserialization_schema\
                .deserialize(bytes(jsons[i], encoding='utf-8'))
            result = str(json_row_serialization_schema._j_serialization_schema.
                         serialize(j_row),
                         encoding='utf-8')
            self.assertEqual(expected_jsons[i], result)
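The test above drives the schemas through their internal `_j_*` wrappers; in application code the same pair is normally handed to a connector. A minimal sketch of that wiring, assuming a local Kafka broker and placeholder topic names:

from pyflink.common.serialization import (JsonRowDeserializationSchema,
                                          JsonRowSerializationSchema)
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer

row_schema = Types.ROW_NAMED(
    ["svt", "ops", "ids"],
    [Types.STRING(),
     Types.ROW_NAMED(['id'], [Types.STRING()]),
     Types.PRIMITIVE_ARRAY(Types.INT())])

deserialization_schema = JsonRowDeserializationSchema.builder() \
    .type_info(row_schema).build()
serialization_schema = JsonRowSerializationSchema.builder() \
    .with_type_info(row_schema).build()

env = StreamExecutionEnvironment.get_execution_environment()
source = FlinkKafkaConsumer(
    'json-input-topic', deserialization_schema,
    {'bootstrap.servers': 'localhost:9092', 'group.id': 'json-demo'})
sink = FlinkKafkaProducer(
    'json-output-topic', serialization_schema,
    {'bootstrap.servers': 'localhost:9092'})

# Missing fields (ops, ids) come out as null, as the expected_jsons in the test show.
env.add_source(source).add_sink(sink)
env.execute('json-row-roundtrip')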
Example #2
    def test_row_type(self):
        self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()])
                         .get_field_names(), ['f0', 'f1'])
        self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                     ['a', 'b']).get_field_names(), ['a', 'b'])

        self.assertTrue(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                        RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']))
        self.assertFalse(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                         RowTypeInfo([Types.STRING(), Types.INT()], ['a', 'b']))
        self.assertEqual(str(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b'])),
                         "RowTypeInfo(a: String, b: String)")

        self.assertEqual(Types.ROW([Types.STRING(), Types.STRING()]),
                         RowTypeInfo([Types.STRING(), Types.STRING()]))

        self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                         .get_field_names(), ['a', 'b'])

        self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                         .get_field_types(), [Types.STRING(), Types.STRING()])
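As a small standalone illustration of the behaviour the assertions pin down: Types.ROW falls back to generated field names f0, f1, ..., while Types.ROW_NAMED keeps the names you supply (assumed usage outside the test class).

from pyflink.common.typeinfo import Types

unnamed = Types.ROW([Types.STRING(), Types.INT()])
named = Types.ROW_NAMED(['name', 'age'], [Types.STRING(), Types.INT()])

print(unnamed.get_field_names())  # default names: ['f0', 'f1']
print(named.get_field_names())    # ['name', 'age']
print(named == Types.ROW_NAMED(['name', 'age'], [Types.STRING(), Types.INT()]))  # True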
Example #3
    def test_from_data_stream_with_schema(self):
        from pyflink.table import Schema

        ds = self.env.from_collection(
            [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
            type_info=Types.ROW_NAMED(
                ["a", "b", "c"],
                [Types.INT(), Types.STRING(),
                 Types.STRING()]))

        table = self.t_env.from_data_stream(
            ds,
            Schema.new_builder().column("a", DataTypes.INT()).column(
                "b", DataTypes.STRING()).column("c",
                                                DataTypes.STRING()).build())
        result = table.execute()
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [
                item for item in map(
                    str, [Row(1, 'Hi', 'Hello'),
                          Row(2, 'Hello', 'Hi')])
            ]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)
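A companion sketch (assumed standalone usage, not part of the test): when no Schema is passed, from_data_stream derives the column names and types directly from the stream's ROW_NAMED type information; the Schema builder above is only needed when you want to override or extend that derivation.

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(stream_execution_environment=env)

ds = env.from_collection(
    [(1, 'Hi', 'Hello')],
    type_info=Types.ROW_NAMED(["a", "b", "c"],
                              [Types.INT(), Types.STRING(), Types.STRING()]))

# Columns a, b, c and their types are taken from the ROW_NAMED type info.
table = t_env.from_data_stream(ds)
table.print_schema()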
Example #4
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all records, including fired timers, are
    # processed by the same worker; the collected results then arrive in a
    # deterministic order, which keeps the assertions simple.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
                                [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(),
                                 Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(type_info).build()
    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}

    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema, kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5))\
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(watermark_strategy)
    ds.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
Example #5
def popular_taxi_vendor():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))
    taxi_ride = t_env.from_path('TaxiRide')
    popular_rides = taxi_ride.select(taxi_ride.vendorId, taxi_ride.pickupTime) \
        .window(Slide.over('15.minutes').every('5.minutes').on(taxi_ride.pickupTime).alias('w')) \
        .group_by(taxi_ride.vendorId, col('w')) \
        .select(taxi_ride.vendorId,
                col('w').start.alias('start'),
                col('w').end.alias('end'),
                taxi_ride.vendorId.count.alias('cnt'))

    t_env.to_append_stream(
        popular_rides,
        Types.ROW_NAMED(['vendorId', 'start', 'end', 'cnt'], [
            Types.INT(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Popular-Taxi-Vendor')
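popular_taxi_vendor (and the other taxi queries below) call a create_table_ddl helper that is not shown. A hypothetical version might look like this, with the column list and connector options chosen to match the fields the queries reference.

def create_table_ddl(watermark_ddl: str) -> str:
    return f"""
        CREATE TABLE TaxiRide (
            rideId BIGINT,
            vendorId INT,
            pickupTime TIMESTAMP(3),
            dropOffTime TIMESTAMP(3),
            passengerCount INT,
            destLocationZone STRING,
            destLocationId INT,
            {watermark_ddl}
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'taxi-rides',
            'properties.bootstrap.servers' = 'localhost:9092',
            'properties.group.id' = 'taxi-query',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'json'
        )"""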
Example #6
def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('bytes', DataTypes.BYTES()),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint', 'double',
         'date', 'timestamp'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(),
         Types.BIG_DEC(), Types.INT(), Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(),
         Types.SQL_TIMESTAMP()]
    )
    data = [Row(
        char='char',
        varchar='varchar',
        bytes=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=date(1970, 1, 1),
        timestamp=datetime(1970, 1, 2, 3, 4, 5, 600000),
    )]
    return row_type, row_type_info, data
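A sketch of how these values could be written out with PyFlink's ORC bulk writer, assuming a recent PyFlink release that ships FileSink and OrcBulkWriters; the output directory is a placeholder.

from pyflink.common import Configuration
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.orc import OrcBulkWriters

def write_orc_example(output_dir: str = '/tmp/orc-out'):
    row_type, row_type_info, data = _create_orc_basic_row_and_data()
    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection(data, type_info=row_type_info)
    sink = FileSink.for_bulk_format(
        output_dir,
        OrcBulkWriters.for_row_type(
            row_type=row_type,
            writer_properties=Configuration(),
            hadoop_config=Configuration())
    ).build()
    ds.sink_to(sink)
    env.execute('write-orc-example')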
Example #7
def max_travellers_per_destination():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR dropOffTime AS dropOffTime - INTERVAL '30' SECONDS")
    )
    taxi_ride = t_env.from_path('TaxiRide')
    no_of_travelers_per_dest = taxi_ride \
        .select(taxi_ride.passengerCount, taxi_ride.dropOffTime, taxi_ride.destLocationZone) \
        .window(Tumble.over('1.hour').on(taxi_ride.dropOffTime).alias('w')) \
        .group_by(taxi_ride.destLocationZone, col('w')) \
        .select(taxi_ride.destLocationZone,
                col('w').start.alias('start'),
                col('w').end.alias('end'),
                taxi_ride.passengerCount.count.alias('cnt'))

    t_env.to_append_stream(
        no_of_travelers_per_dest,
        Types.ROW_NAMED(['destLocationZone', 'start', 'end', 'cnt'], [
            Types.STRING(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Max-Travellers-Per-Destination')
Example #8
def _create_parquet_map_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('map', DataTypes.MAP(DataTypes.INT(), DataTypes.STRING())),
    ])
    row_type_info = Types.ROW_NAMED(['map'], [Types.MAP(Types.INT(), Types.STRING())])
    data = [Row(
        map={0: 'a', 1: 'b', 2: 'c'}
    )]
    return row_type, row_type_info, data
Example #9
    def test_from_and_to_data_stream_event_time(self):
        from pyflink.table import Schema

        ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                      Types.ROW_NAMED(
                                          ["a", "b", "c"],
                                          [Types.LONG(), Types.INT(), Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps()
            .with_timestamp_assigner(MyTimestampAssigner()))

        table = self.t_env.from_data_stream(ds,
                                            Schema.new_builder()
                                                  .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                                  .watermark("rowtime", "SOURCE_WATERMARK()")
                                                  .build())
        self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                         table._j_table.getResolvedSchema().toString())
        self.t_env.create_temporary_view("t",
                                         ds,
                                         Schema.new_builder()
                                         .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                         .watermark("rowtime", "SOURCE_WATERMARK()")
                                         .build())

        result = self.t_env.execute_sql("SELECT "
                                        "c, SUM(b) "
                                        "FROM t "
                                        "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [item for item in
                               map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)

        ds = self.t_env.to_data_stream(table)
        ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
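MyTimestampAssigner, MyTumblingEventTimeWindow and SumWindowFunction are defined elsewhere in the test module. As a hint, a minimal assigner matching the data above (event time taken from field a, in milliseconds) could look like this; the window helpers are omitted.

from pyflink.common.watermark_strategy import TimestampAssigner

class MyTimestampAssigner(TimestampAssigner):
    def extract_timestamp(self, value, record_timestamp) -> int:
        # Field "a" doubles as the event timestamp in milliseconds.
        return int(value[0])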
Example #10
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('string_array', DataTypes.ARRAY(DataTypes.STRING())),
        DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT())),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    conversion_row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.OBJECT_ARRAY(Types.STRING()),
        Types.OBJECT_ARRAY(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, conversion_row_type_info, data
Example #11
def _create_parquet_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('binary', DataTypes.BINARY(10)),
        DataTypes.FIELD('varbinary', DataTypes.VARBINARY(10)),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('time', DataTypes.TIME().bridged_to('java.sql.Time')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
        DataTypes.FIELD('timestamp_ltz', DataTypes.TIMESTAMP_LTZ(3)),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'binary', 'varbinary', 'boolean', 'decimal', 'int', 'bigint', 'double',
         'date', 'time', 'timestamp', 'timestamp_ltz'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()),
         Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(), Types.BIG_DEC(), Types.INT(),
         Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(), Types.SQL_TIME(), Types.SQL_TIMESTAMP(),
         Types.INSTANT()]
    )
    datetime_ltz = datetime.datetime(1970, 2, 3, 4, 5, 6, 700000, tzinfo=pytz.timezone('UTC'))
    timestamp_ltz = Instant.of_epoch_milli(
        (
            calendar.timegm(datetime_ltz.utctimetuple()) +
            calendar.timegm(time.localtime(0))
        ) * 1000 + datetime_ltz.microsecond // 1000
    )
    data = [Row(
        char='char',
        varchar='varchar',
        binary=b'binary',
        varbinary=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=datetime.date(1970, 1, 1),
        time=datetime.time(1, 1, 1),
        timestamp=datetime.datetime(1970, 1, 2, 3, 4, 5, 600000),
        timestamp_ltz=timestamp_ltz
    )]
    return row_type, row_type_info, data
Example #12
def popular_destination_query():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))

    query = f"""SELECT 
    destLocationId, wstart, wend, cnt 
FROM 
    (SELECT 
        destLocationId, 
        HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart, 
        HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend, 
        COUNT(destLocationId) AS cnt 
    FROM
        (SELECT 
            pickupTime, 
            destLocationId 
        FROM TaxiRide) 
    GROUP BY
        destLocationId, HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE)
    )
WHERE cnt > {args.threshold}
"""

    results = t_env.sql_query(query)

    t_env.to_append_stream(
        results,
        Types.ROW_NAMED(['destLocationId', 'wstart', 'wend', 'cnt'], [
            Types.INT(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Popular-Destination')
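The query string interpolates args.threshold, so the script is assumed to parse a --threshold argument at module level; a minimal sketch (argument name and default are illustrative):

import argparse

parser = argparse.ArgumentParser(description='Filter popular destinations')
parser.add_argument('--threshold', type=int, default=10,
                    help='minimum number of rides per window to report')
args = parser.parse_args()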
Example #13
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD(
            'string_array',
            DataTypes.ARRAY(DataTypes.STRING()).bridged_to('java.util.ArrayList')
        ),
        DataTypes.FIELD(
            'int_array',
            DataTypes.ARRAY(DataTypes.INT()).bridged_to('java.util.ArrayList')
        ),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, data
Example #14
import json

from pyflink.common.serialization import SimpleStringSchema, SimpleStringEncoder, JsonRowDeserializationSchema
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, StreamingFileSink
from pyflink.common.typeinfo import Types
from pyflink.datastream.functions import MapFunction

s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_parallelism(1)
ti = Types.ROW_NAMED(
    ["app", 'busi', 'date', 'ip'],
    [Types.STRING(),
     Types.STRING(),
     Types.BIG_INT(),
     Types.STRING()])
builder = JsonRowDeserializationSchema.builder()
builder.type_info(ti)
jrds = builder.ignore_parse_errors().build()
fkc = FlinkKafkaConsumer(topics="ULS-BUSI-LOG-dev",
                         deserialization_schema=jrds,
                         properties={
                             "bootstrap.servers": "10.100.1.16:9192",
                             "group.id": "123",
                             "auto.offset.reset": "earliest"
                         })
fkc.set_start_from_earliest()
# The original mapped with `x.get("values")`, which is not a method of pyflink's
# Row type; emit each record as a JSON line instead so the string sink below works.
src = s_env.add_source(fkc).map(
    lambda x: json.dumps(x.as_dict()), output_type=Types.STRING())
src.add_sink(
    StreamingFileSink.for_row_format('C:\\tmp\\pyoutput',
                                     SimpleStringEncoder()).build())
# Submit the job; without this call the pipeline is only defined, never executed.
s_env.execute('kafka-to-file')
Example #15
class ProductJob(FlinkJob):

    job_name = "wish:product"
    kafka_addr = "10.0.9.5:9092"
    wish_data_topic = "wish-product-data"
    wish_result_topic = "wish-product-result-data"
    wish_shop_result_topic = "wish-shop-result-data"
    tables = [
      f"""
        CREATE TABLE wish_product_data (
          `timestamp` INT,
          `pid` STRING,
          `merchant_id` STRING,
          `merchant_name` STRING,
          `shop_name` STRING,
          `review_number` INT,
          `review_score` FLOAT,
          `shop_review_number` INT,
          `title` STRING,
          `is_pb` BOOLEAN,
          `is_hwc` BOOLEAN,
          `is_verified` BOOLEAN,
          `total_bought` INT,
          `total_wishlist` INT,
          `tags` ARRAY<STRING>,
          `category_ids` ARRAY<STRING>,
          `category_paths` ARRAY<STRING>,
          `category_l1_ids` ARRAY<STRING>,
          `category_l2_ids` ARRAY<STRING>,
          `category_l3_ids` ARRAY<STRING>,
          `leaf_category_ids` ARRAY<STRING>,
          `price` FLOAT,
          `shipping_price` FLOAT,
          `sold` FLOAT,
          `update_time` TIMESTAMP(0),
          `shop_open_time` TIMESTAMP(0),
          `gen_time` TIMESTAMP(0),
          `data_update_time` TIMESTAMP(0)
        ) WITH (
          'connector' = 'kafka',
          'topic' = '{wish_data_topic}',
          'properties.bootstrap.servers' = '{kafka_addr}',
          'properties.group.id' = 'flink_bigdata',
          'scan.startup.mode' = 'group-offsets',
          'format' = 'json'
        )
      """
    ]
    result_type = Types.ROW_NAMED(
      ["infos"],
      [Types.STRING()]
    )
    shop_result_type = Types.ROW_NAMED(
      ["shop_name", "shop_agg"],
      [Types.STRING(), Types.STRING()]
    )

    def process(self) -> None:
      calculate = udaf(CalculateAgg())
      shop_agg = udaf(ShopAgg())
      table = self.table_env.from_path("wish_product_data")
      result_table = table.group_by(table.pid)\
                    .select(calculate(table.timestamp, table.pid, table.merchant_id,
                                      table.merchant_name, table.shop_name, table.review_number,
                                      table.review_score, table.shop_review_number,
                                      table.title, table.is_pb, table.is_hwc, table.is_verified,
                                      table.total_bought, table.total_wishlist, table.tags,
                                      table.category_ids, table.category_paths,
                                      table.category_l1_ids, table.category_l2_ids,
                                      table.category_l3_ids, table.leaf_category_ids,
                                      table.price, table.shipping_price, table.sold, table.update_time,
                                      table.shop_open_time, table.gen_time, table.data_update_time))
                            #calculate(table.timestamp, table.update_time, table.sold, table.price, table.review_number))
      shop_table = table.select(table.update_time, table.shop_name)
      shop_table = shop_table.add_columns("1 as count")
      shop_result_table = shop_table.group_by(shop_table.shop_name)\
          .select(shop_table.shop_name, shop_agg(table.update_time, shop_table.count))
      self.sink_to_kafka(self.wish_result_topic, result_table, self.result_type)
      self.sink_to_kafka(self.wish_shop_result_topic, shop_result_table, self.shop_result_type)
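The job wraps CalculateAgg and ShopAgg with udaf(), but neither class appears in the snippet. A hypothetical ShopAgg along these lines would satisfy the call site shop_agg(table.update_time, shop_table.count); the real aggregation logic is unknown, so this sketch only counts rows per shop.

from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction

class ShopAgg(AggregateFunction):

    def create_accumulator(self):
        # [number of product rows seen for this shop]
        return [0]

    def accumulate(self, accumulator, update_time, count):
        # update_time is accepted only to match the call site; this sketch ignores it.
        accumulator[0] += count

    def get_value(self, accumulator):
        return str(accumulator[0])

    def get_result_type(self):
        return DataTypes.STRING()

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())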