Example 1
from pyflink.common import Row
from pyflink.table import (EnvironmentSettings, TableEnvironment, TableDescriptor,
                           Schema, DataTypes, FormatDescriptor)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udtf


def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .build())
                           .option('path', input_path)
                           .format('csv')
                           .build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        # word_count_data is a list of sample input lines defined alongside
        # the original example
        tab = t_env.from_elements(
            map(lambda i: (i,), word_count_data),
            DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .option('path', output_path)
                           .format(FormatDescriptor.for_format('canal-json')
                                   .build())
                           .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()
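
The function is meant to be driven from the command line; a minimal sketch of a __main__ block along those lines (the --input/--output flags match the hints printed above):

if __name__ == '__main__':
    import argparse
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])

    word_count(known_args.input, known_args.output)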
Example 2
from pyflink.table import (EnvironmentSettings, TableEnvironment, TableDescriptor,
                           Schema, DataTypes)
from pyflink.table.expressions import col


def process_json_data():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .build())

    table = table.select(
        col('id'),
        col('data').json_value('$.addr.country', DataTypes.STRING()))

    # execute
    table.execute_insert('sink') \
         .wait()
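
json_value evaluates a JSON path expression against a STRING column. When extracting more than one field, aliasing the result columns keeps the schema readable; a sketch against the same source table (the alias names are illustrative):

extracted = table.select(
    col('id'),
    col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
    col('data').json_value('$.addr.city', DataTypes.STRING()).alias('city'))

# print the extracted columns directly instead of going through a sink table
extracted.execute().print()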
Example 3
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (StreamTableEnvironment, TableDescriptor, Schema, DataTypes)
from pyflink.table.expressions import col
from pyflink.table.udf import udf


def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing the
    # Table API with the DataStream API
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .option("number-of-rows", "10")
                       .build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('a', DataTypes.BIGINT())
                               .build())
                       .build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column("f0", DataTypes.BIGINT()).build())

    # execute
    table.execute_insert('sink') \
         .wait()
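
Any DataStream transformation can sit between the two conversions. A sketch of a filter that could be slotted in after the map and before from_data_stream (not part of the original example):

    # keep only even sums before converting back to a table
    ds = ds.filter(lambda v: v % 2 == 0)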
Example 4
from pyflink.common.time import Instant
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (StreamTableEnvironment, TableDescriptor, Schema, DataTypes)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udaf
from pyflink.table.window import Tumble


def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
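
With func_type="pandas", the UDAF receives each window's values as a pandas.Series, so any vectorized pandas expression works inside it. A sketch of a two-argument variant in the same style (hypothetical, not part of the original example):

@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def weighted_mean(v, w):
    # v and w arrive as pandas.Series holding the window's rows
    return float((v * w).sum() / w.sum())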
Example 5
from pyflink.common.time import Instant
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (StreamTableEnvironment, TableDescriptor, Schema, DataTypes)
from pyflink.table.expressions import col, row_interval, CURRENT_ROW
from pyflink.table.window import Over


def over_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('max_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()
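
row_interval(2) with CURRENT_ROW bounds each window to the current row plus the two rows before it, per name, ordered by ts. The bounds can also be expressed in event time; a sketch of a range-based variant (lit and CURRENT_RANGE also come from pyflink.table.expressions):

    # max price over the preceding 2 seconds of events, per name
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(lit(2).seconds)
            .following(CURRENT_RANGE)
            .alias("w")) \
        .select(table.name, table.price.max.over(col("w")))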
Example 6
def setUp(self) -> None:
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(1)
    # find_jar_path() is a helper defined in the surrounding test module
    self.env.add_jars("file://{}".format(find_jar_path()))
    self.t_env = StreamTableEnvironment.create(self.env)
    self.source_table = self.t_env.from_descriptor(
        TableDescriptor.for_connector("datagen")
                       .schema(Schema.new_builder()
                               .column("x", DataTypes.INT())
                               .column("a", DataTypes.INT())
                               .build())
                       .option("fields.x.kind", "sequence")
                       .option("fields.x.start", "1")
                       .option("fields.x.end", "100")
                       .option("fields.a.kind", "sequence")
                       .option("fields.a.start", "101")
                       .option("fields.a.end", "200")
                       .build())
Example 7
import json

from pyflink.table import (EnvironmentSettings, TableEnvironment, TableDescriptor,
                           Schema, DataTypes)
from pyflink.table.udf import udf


def process_json_data_with_udf():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .build())

    # update json columns
    @udf(result_type=DataTypes.STRING())
    def update_tel(data):
        json_data = json.loads(data)
        json_data['tel'] += 1
        return json.dumps(json_data)

    table = table.select(table.id, update_tel(table.data))

    # execute
    table.execute_insert('sink') \
         .wait()
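
Since @udf wraps a plain Python callable, the transformation itself is easy to sanity-check outside Flink; a sketch duplicating the body of update_tel as an ordinary function:

def update_tel_plain(data):
    json_data = json.loads(data)
    json_data['tel'] += 1
    return json.dumps(json_data)

assert json.loads(update_tel_plain('{"tel": 123}'))['tel'] == 124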