Example #1
import logging
import os
import shutil
import sys
import tempfile

from pyflink.table import EnvironmentSettings, TableEnvironment
from pyflink.table.expressions import call, col


def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().set(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    t_env.execute_sql(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
    t_env.register_java_function("add_one_java",
                                 "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t = t_env.from_elements(elements, ["word", "count"])
    t.select(t.word,
             call("add_one", t.count).alias("count"),
             call("add_one_java", t.count).alias("count_java")) \
        .group_by(t.word) \
        .select(t.word,
                col("count").count.alias("count"),
                col("count_java").count.alias("count_java")) \
        .execute_insert("Results")
Example #2
def process_json_data():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print').schema(
            Schema.new_builder().column('id', DataTypes.BIGINT()).column(
                'data', DataTypes.STRING()).build()).build())

    table = table.select(
        col('id'),
        col('data').json_value('$.addr.country', DataTypes.STRING()))

    # execute
    table.execute_insert('sink') \
         .wait()
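
json_value can follow any JSON path, so the select above could equally pull several nested fields at once; a variation for illustration only (the sink would then need matching columns):

table = table.select(
    col('id'),
    col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country'),
    col('data').json_value('$.addr.city', DataTypes.STRING()).alias('city'))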
Example #3
def sql_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    t_env.sql_query("SELECT * FROM %s" % table) \
         .execute().print()
    # +----+----------------------+--------------------------------+
    # | op |                   id |                           data |
    # +----+----------------------+--------------------------------+
    # | +I |                    1 | {"name": "Flink", "tel": 12... |
    # | +I |                    2 | {"name": "hello", "tel": 13... |
    # | +I |                    3 | {"name": "world", "tel": 12... |
    # | +I |                    4 | {"name": "PyFlink", "tel": ... |
    # +----+----------------------+--------------------------------+

    # define a UDTF to parse the JSON data, then use it in a SQL statement
    @udtf(
        result_types=[DataTypes.STRING(),
                      DataTypes.INT(),
                      DataTypes.STRING()])
    def parse_data(data: str):
        json_data = json.loads(data)
        yield json_data['name'], json_data['tel'], json_data['addr']['country']

    t_env.create_temporary_function('parse_data', parse_data)
    t_env.execute_sql("""
        SELECT *
        FROM %s, LATERAL TABLE(parse_data(`data`)) t(name, tel, country)
        """ % table).print()
    # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+
    # | op |                   id |                           data |                           name |         tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+
    # | +I |                    1 | {"name": "Flink", "tel": 12... |                          Flink |         123 |                        Germany |
    # | +I |                    2 | {"name": "hello", "tel": 13... |                          hello |         135 |                          China |
    # | +I |                    3 | {"name": "world", "tel": 12... |                          world |         124 |                            USA |
    # | +I |                    4 | {"name": "PyFlink", "tel": ... |                        PyFlink |          32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+

    # explain sql plan
    print(
        t_env.explain_sql("""
        SELECT *
        FROM %s, LATERAL TABLE(parse_data(`data`)) t(name, tel, country)
        """ % table))
Example #4
 def setUp(self):
     super(StreamDependencyTests, self).setUp()
     origin_execution_mode = os.environ.get('_python_worker_execution_mode')
     os.environ['_python_worker_execution_mode'] = "loopback"
     try:
         self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
     finally:
         if origin_execution_mode is not None:
             os.environ['_python_worker_execution_mode'] = origin_execution_mode
Example #5
def conversion_from_dataframe():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    t_env.get_config().set("parallelism.default", "1")

    # define the source from a Pandas DataFrame
    pdf = pd.DataFrame(np.random.rand(1000, 2))
    table = t_env.from_pandas(pdf,
                              schema=DataTypes.ROW([
                                  DataTypes.FIELD("a", DataTypes.DOUBLE()),
                                  DataTypes.FIELD("b", DataTypes.DOUBLE())
                              ]))

    print(table.to_pandas())
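
to_pandas collects the entire table to the client, so for anything beyond a toy table it is prudent to bound the result first; a sketch:

print(table.limit(100).to_pandas())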
Example #6
def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .build())
                .option('path', input_path)
                .format('csv')
                .build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(
            map(lambda i: (i, ), word_count_data),
            DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem')
                .schema(Schema.new_builder()
                        .column('word', DataTypes.STRING())
                        .column('count', DataTypes.BIGINT())
                        .build())
                .option('path', output_path)
                .format(FormatDescriptor.for_format('canal-json').build())
                .build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print').schema(
                Schema.new_builder().column('word', DataTypes.STRING()).column(
                    'count', DataTypes.BIGINT()).build()).build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()
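
The --input/--output hints in the log messages suggest a small CLI wrapper around this function; a plausible sketch (flag names assumed):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='input file path')
    parser.add_argument('--output', dest='output', required=False,
                        help='output file path')
    args = parser.parse_args()
    word_count(args.input, args.output)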
Example #7
def word_count():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    with open(source_path, 'w') as f:
        for word in content.split(" "):
            f.write(",".join([word, "1"]))
            f.write("\n")
            f.flush()
        f.close()

    t_config = TableConfig.Builder().as_batch_execution().set_parallelism(1).build()
    t_env = TableEnvironment.create(t_config)

    field_names = ["word", "cout"]
    field_types = [DataTypes.STRING, DataTypes.LONG]

    # register Word table in table environment
    t_env.register_table_source(
        "Word",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)

    t_env.register_table_sink(
        "Results",
        field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Word") \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute()
Example #9
def multi_sink():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    table = t_env.from_elements(elements=[(1, 'Hello'), (2, 'World'),
                                          (3, "Flink"), (4, "PyFlink")],
                                schema=['id', 'data'])

    # define the sink tables
    t_env.execute_sql("""
        CREATE TABLE first_sink (
            id BIGINT,
            data VARCHAR
        ) WITH (
            'connector' = 'print'
        )
    """)

    t_env.execute_sql("""
        CREATE TABLE second_sink (
            id BIGINT,
            data VARCHAR
        ) WITH (
            'connector' = 'print'
        )
    """)

    # create a statement set
    statement_set = t_env.create_statement_set()

    # emit the data with id <= 3 to the "first_sink" via sql statement
    statement_set.add_insert_sql(
        "INSERT INTO first_sink SELECT * FROM %s WHERE id <= 3" % table)

    # emit the data which contains "Flink" to the "second_sink"
    @udf(result_type=DataTypes.BOOLEAN())
    def contains_flink(data):
        return "Flink" in data

    second_table = table.where(contains_flink(table.data))
    statement_set.add_insert("second_sink", second_table)

    # execute the statement set
    # remove .wait if submitting to a remote cluster, refer to
    # https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/python/faq/#wait-for-jobs-to-finish-when-executing-jobs-in-mini-cluster
    # for more details
    statement_set.execute().wait()
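
Both INSERT statements are bundled into a single job; before calling execute(), the combined plan can be inspected with explain(), e.g.:

print(statement_set.explain())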
Example #10
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance() \
        .in_batch_mode() \
        .use_blink_planner() \
        .build()
    t_env = TableEnvironment.create(environment_settings=env_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
         .select(table.word, expr.lit(1).count.alias('count')) \
         .execute_insert("Results")
Example #11
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig.Builder().as_batch_execution().build()
    t_env = TableEnvironment.create(t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute()
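
The connect()/OldCsv descriptor API used here was removed in later Flink releases; a hedged sketch of the equivalent sink registration with TableDescriptor and the builder-style Schema, as in Example #6:

t_env.create_temporary_table(
    'Results',
    TableDescriptor.for_connector('filesystem')
        .schema(Schema.new_builder()
                .column('word', DataTypes.STRING())
                .column('count', DataTypes.BIGINT())
                .build())
        .option('path', result_path)
        .format('csv')
        .build())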
Example #12
def test_end_to_end():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)
        f.close()
    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)

    t_env.register_table_sink(
        "Results",
        field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
         .where("a > 0") \
         .select("a + 1, b, c") \
         .insert_into("Results")

    t_env.execute()
    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log" % os.environ["FLINK_HOME"])
Example #13
def pandas_udf_demo():
    # Init environment
    environment_settings = EnvironmentSettings.new_instance() \
        .use_blink_planner() \
        .in_batch_mode() \
        .build()
    t_env = TableEnvironment.create(environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_string('parallelism.default',
                                                      '1')

    # Define and register UDF
    add = udf(lambda i, j: i + j,
              [DataTypes.BIGINT(), DataTypes.BIGINT()],
              DataTypes.BIGINT(),
              udf_type="pandas")

    # Register Source
    t_env.execute_sql("""
            CREATE TABLE mySource (
              a BIGINT,
              b BIGINT
            ) WITH (
              'connector' = 'filesystem',
              'format' = 'csv',
              'path' = '/opt/examples/table/input/udf_add_input'
            )
        """)

    # Register Sink
    t_env.execute_sql("""
            CREATE TABLE mySink (
              `sum` BIGINT
            ) WITH (
              'connector' = 'filesystem',
              'format' = 'csv',
              'path' = '/opt/examples/table/output/pandas_udf_add_output'
            )
        """)

    # Query
    t_env.from_path('mySource')\
        .select(add(col('a'), col('b'))) \
        .insert_into('mySink')

    t_env.execute("5-pandas_udf_add")
Example #14
def log_processing():
    env_settings = EnvironmentSettings.new_instance() \
        .use_blink_planner() \
        .in_streaming_mode() \
        .build()
    t_env = TableEnvironment.create(env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///my/jar/path/connector.jar;file:///my/jar/path/json.jar")

    source_ddl = """
            CREATE TABLE source_table(
                a VARCHAR,
                b INT
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'properties.group.id' = 'testGroup',
              'scan.startup.mode' = 'latest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                a VARCHAR
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'topic2',
              'properties.bootstrap.servers' = 'kafka:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.sql_query("SELECT a FROM source_table") \
        .execute_insert("sink_table").wait()
Example #15
def process_json_data_with_udf():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print').schema(
            Schema.new_builder().column('id', DataTypes.BIGINT()).column(
                'data', DataTypes.STRING()).build()).build())

    # update json columns
    @udf(result_type=DataTypes.STRING())
    def update_tel(data):
        json_data = json.loads(data)
        json_data['tel'] += 1
        return json.dumps(json_data)

    table = table.select(table.id, update_tel(table.data))

    # execute
    table.execute_insert('sink') \
         .wait()
Example #16
 def setUp(self):
     super(PyFlinkStreamTableTestCase, self).setUp()
     self.t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
     self.t_env = TableEnvironment.create(self.t_config)
Example #17
 def setUp(self):
     super(StreamDependencyTests, self).setUp()
     self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
     self.st_env._execution_mode = "loopback"
Example #18
 def setUp(self):
     super(PyFlinkBatchTableTestCase, self).setUp()
     self.t_config = TableConfig.Builder().as_batch_execution().set_parallelism(4).build()
     self.t_env = TableEnvironment.get_table_environment(self.t_config)
Example #19
def column_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    # add columns
    table = table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country',
                               DataTypes.STRING()).alias('country'))

    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           data |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 | {"name": "Flink", "tel": 12... |                          Flink |                            123 |                        Germany |
    # | +I |                    2 | {"name": "hello", "tel": 13... |                          hello |                            135 |                          China |
    # | +I |                    3 | {"name": "world", "tel": 12... |                          world |                            124 |                            USA |
    # | +I |                    4 | {"name": "PyFlink", "tel": ... |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+

    # drop columns
    table = table.drop_columns(col('data'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # rename columns
    table = table.rename_columns(col('tel').alias('telephone'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                      telephone |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # replace columns
    table = table.add_or_replace_columns(
        concat(col('id').cast(DataTypes.STRING()), '_',
               col('name')).alias('id'))
    table.execute().print()
Example #20
 def setUp(self):
     super(PyFlinkBatchTableTestCase, self).setUp()
     self.t_config = TableConfig.Builder().as_batch_execution().set_parallelism(1).build()
     self.t_env = TableEnvironment.create(self.t_config)
Example #21
def basic_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    right_table = t_env.from_elements(elements=[(1, 18), (2, 30), (3, 25),
                                                (4, 10)],
                                      schema=['id', 'age'])

    table = table.add_columns(
                    col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
                    col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
                    col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
                 .drop_columns(col('data'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # limit the number of outputs
    table.limit(3).execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    3 |                          world |                            124 |                            USA |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # filter
    table.filter(col('id') != 3).execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |
    # | +I |                    2 |                          hello |                            135 |                          China |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # aggregation
    table.group_by(col('country')) \
         .select(col('country'), col('id').count, col('tel').cast(DataTypes.BIGINT()).max) \
         .execute().print()
    # +----+--------------------------------+----------------------+----------------------+
    # | op |                        country |               EXPR$0 |               EXPR$1 |
    # +----+--------------------------------+----------------------+----------------------+
    # | +I |                        Germany |                    1 |                  123 |
    # | +I |                            USA |                    1 |                  124 |
    # | +I |                          China |                    1 |                  135 |
    # | -U |                          China |                    1 |                  135 |
    # | +U |                          China |                    2 |                  135 |
    # +----+--------------------------------+----------------------+----------------------+

    # distinct
    table.select(col('country')).distinct() \
         .execute().print()
    # +----+--------------------------------+
    # | op |                        country |
    # +----+--------------------------------+
    # | +I |                        Germany |
    # | +I |                          China |
    # | +I |                            USA |
    # +----+--------------------------------+

    # join
    # Note: duplicate column names between the joined tables are not supported, hence the rename below
    table.join(right_table.rename_columns(col('id').alias('r_id')), col('id') == col('r_id')) \
         .execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+
    # | op |                   id |                           name |                            tel |                        country |                 r_id |                  age |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+
    # | +I |                    4 |                        PyFlink |                             32 |                          China |                    4 |                   10 |
    # | +I |                    1 |                          Flink |                            123 |                        Germany |                    1 |                   18 |
    # | +I |                    2 |                          hello |                            135 |                          China |                    2 |                   30 |
    # | +I |                    3 |                          world |                            124 |                            USA |                    3 |                   25 |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+

    # join lateral
    @udtf(result_types=[DataTypes.STRING()])
    def split(r: Row):
        for s in r.name.split("i"):
            yield s

    table.join_lateral(split.alias('a')) \
         .execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op |                   id |                           name |                            tel |                        country |                              a |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I |                    1 |                          Flink |                            123 |                        Germany |                             Fl |
    # | +I |                    1 |                          Flink |                            123 |                        Germany |                             nk |
    # | +I |                    2 |                          hello |                            135 |                          China |                          hello |
    # | +I |                    3 |                          world |                            124 |                            USA |                          world |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |                           PyFl |
    # | +I |                    4 |                        PyFlink |                             32 |                          China |                             nk |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+

    # show schema
    table.print_schema()
    # (
    #   `id` BIGINT,
    #   `name` STRING,
    #   `tel` STRING,
    #   `country` STRING
    # )

    # show execute plan
    print(table.join_lateral(split.alias('a')).explain())
Example #22
def row_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "China", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    # map operation
    @udf(result_type=DataTypes.ROW([
        DataTypes.FIELD("id", DataTypes.BIGINT()),
        DataTypes.FIELD("country", DataTypes.STRING())
    ]))
    def extract_country(input_row: Row):
        data = json.loads(input_row.data)
        return Row(input_row.id, data['addr']['country'])

    table.map(extract_country) \
         .execute().print()
    # +----+----------------------+--------------------------------+
    # | op |                   id |                        country |
    # +----+----------------------+--------------------------------+
    # | +I |                    1 |                        Germany |
    # | +I |                    2 |                          China |
    # | +I |                    3 |                          China |
    # | +I |                    4 |                          China |
    # +----+----------------------+--------------------------------+

    # flat_map operation
    @udtf(result_types=[DataTypes.BIGINT(), DataTypes.STRING()])
    def extract_city(input_row: Row):
        data = json.loads(input_row.data)
        yield input_row.id, data['addr']['city']

    table.flat_map(extract_city) \
         .execute().print()

    # +----+----------------------+--------------------------------+
    # | op |                   f0 |                             f1 |
    # +----+----------------------+--------------------------------+
    # | +I |                    1 |                         Berlin |
    # | +I |                    2 |                       Shanghai |
    # | +I |                    3 |                        NewYork |
    # | +I |                    4 |                       Hangzhou |
    # +----+----------------------+--------------------------------+

    # aggregate operation
    class CountAndSumAggregateFunction(AggregateFunction):
        def get_value(self, accumulator):
            return Row(accumulator[0], accumulator[1])

        def create_accumulator(self):
            return Row(0, 0)

        def accumulate(self, accumulator, input_row):
            accumulator[0] += 1
            accumulator[1] += int(input_row.tel)

        def retract(self, accumulator, input_row):
            accumulator[0] -= 1
            accumulator[1] -= int(input_row.tel)

        def merge(self, accumulator, accumulators):
            for other_acc in accumulators:
                accumulator[0] += other_acc[0]
                accumulator[1] += other_acc[1]

        def get_accumulator_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())
            ])

        def get_result_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())
            ])

    count_sum = udaf(CountAndSumAggregateFunction())
    table.add_columns(
            col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
            col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
            col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
         .group_by(col('country')) \
         .aggregate(count_sum.alias("cnt", "sum")) \
         .select(col('country'), col('cnt'), col('sum')) \
         .execute().print()

    # +----+--------------------------------+----------------------+----------------------+
    # | op |                        country |                  cnt |                  sum |
    # +----+--------------------------------+----------------------+----------------------+
    # | +I |                          China |                    3 |                  291 |
    # | +I |                        Germany |                    1 |                  123 |
    # +----+--------------------------------+----------------------+----------------------+

    # flat_aggregate operation
    class Top2(TableAggregateFunction):
        def emit_value(self, accumulator):
            for v in accumulator:
                if v:
                    yield Row(v)

        def create_accumulator(self):
            return [None, None]

        def accumulate(self, accumulator, input_row):
            tel = int(input_row.tel)
            if accumulator[0] is None or tel > accumulator[0]:
                accumulator[1] = accumulator[0]
                accumulator[0] = tel
            elif accumulator[1] is None or tel > accumulator[1]:
                accumulator[1] = tel

        def get_accumulator_type(self):
            return DataTypes.ARRAY(DataTypes.BIGINT())

        def get_result_type(self):
            return DataTypes.ROW([DataTypes.FIELD("tel", DataTypes.BIGINT())])

    top2 = udtaf(Top2())
    table.add_columns(
            col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
            col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
            col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .group_by(col('country')) \
        .flat_aggregate(top2) \
        .select(col('country'), col('tel')) \
        .execute().print()
Example #23
from pyflink.table import DataTypes, EnvironmentSettings, TableEnvironment
from pyflink.table.expressions import col
from pyflink.table.udf import udf

env_settings = EnvironmentSettings.new_instance() \
    .in_streaming_mode() \
    .use_blink_planner() \
    .build()
t_env = TableEnvironment.create(environment_settings=env_settings)
t_env.get_config().get_configuration().set_string('parallelism.default', '1')


@udf(input_types=[DataTypes.BIGINT(), DataTypes.BIGINT()],
     result_type=DataTypes.BIGINT())
def add(i, j):
    from mpmath import fadd  # add third-party dependency
    return int(fadd(1, 2))


t_env.set_python_requirements("/opt/examples/table/data/requirements.txt")

t_env.execute_sql("""
        CREATE TABLE mySource (
          a BIGINT,
          b BIGINT
        ) WITH (
          'connector' = 'filesystem',
          'format' = 'csv',
          'path' = '/opt/examples/table/input/udf_add_input'
        )
    """)

t_env.execute_sql("""
Example #24
 def setUp(self):
     super(PyFlinkStreamTableTestCase, self).setUp()
     self.t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(4).build()
     self.t_env = TableEnvironment.get_table_environment(self.t_config)