Example #1
    def __init__(self,
                 parallelism: int = 1,
                 checkpoint_interval: Optional[int] = None,
                 state_ttl: Optional[int] = None) -> None:
        # setting env
        env = StreamExecutionEnvironment.get_execution_environment()
        env.set_parallelism(parallelism)
        if checkpoint_interval:
            env.set_state_backend(
                RocksDBStateBackend(self.checkpoints_path,
                                    enable_incremental_checkpointing=True))
            env.enable_checkpointing(checkpoint_interval * 1000)
        t_config = TableConfig()
        if state_ttl:
            t_config.set_idle_state_retention_time(
                timedelta(seconds=state_ttl),
                timedelta(seconds=state_ttl + 300))
        table_env = StreamTableEnvironment.create(env, table_config=t_config)
        table_env.get_config().get_configuration().set_string(
            "pipeline.jars", self.flink_sql_connector_kafka_jar)

        # set up tables
        for ddl in self.tables:
            table_env.execute_sql(ddl)

        self.env = env
        self.table_env = table_env
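
The constructor above is an excerpt and leans on instance attributes defined elsewhere in its class. A minimal sketch of the imports and attributes it assumes; the class name, paths and jar location are illustrative placeholders, only the attribute names come from the snippet itself:

# Sketch of the surrounding context for Example #1; the class name,
# paths and jar location are placeholders.
from datetime import timedelta
from typing import Optional

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.state_backend import RocksDBStateBackend
from pyflink.table import StreamTableEnvironment, TableConfig

class KafkaSqlPipeline:
    checkpoints_path = "file:///tmp/checkpoints"
    flink_sql_connector_kafka_jar = "file:///opt/flink/lib/flink-sql-connector-kafka.jar"
    tables = []  # CREATE TABLE DDL strings executed in __init__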
Example #2
    def test_get_configuration(self):
        table_config = TableConfig.get_default()

        table_config.get_configuration().set_string("k1", "v1")

        self.assertEqual(table_config.get_configuration().get_string("k1", ""),
                         "v1")
Example #3
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.sql_update(sink_ddl)

    t_env.sql_update(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
    t_env.register_java_function("add_one_java",
                                 "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .select("word, add_one(count) as count, add_one_java(count) as count_java") \
        .group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java") \
        .insert_into("Results")

    t_env.execute("word_count")
Example #4
 def setUp(self):
     super(PyFlinkOldBatchTableTestCase, self).setUp()
     self.env = ExecutionEnvironment.get_execution_environment()
     self.env.set_parallelism(2)
     self.t_env = BatchTableEnvironment.create(self.env, TableConfig())
     self.t_env.get_config().get_configuration().set_string(
         "python.fn-execution.bundle.size", "1")
Example #5
 def create_table_env(self):
     exec_env = ExecutionEnvironment.get_execution_environment()
     exec_env.set_parallelism(1)
     t_config = TableConfig()
     t_env = BatchTableEnvironment.create(exec_env, t_config)
     statement_set = t_env.create_statement_set()
     return exec_env, t_env, statement_set
Example #6
    def test_get_set_local_timezone(self):
        table_config = TableConfig.get_default()

        table_config.set_local_timezone("Asia/Shanghai")
        timezone = table_config.get_local_timezone()

        self.assertEqual(timezone, "Asia/Shanghai")
Example #7
 def setUp(self):
     super(PyFlinkBatchTableTestCase, self).setUp()
     self.env = ExecutionEnvironment.get_execution_environment()
     self.env.set_parallelism(2)
     self.t_env = BatchTableEnvironment.create(self.env, TableConfig())
     self.t_env.get_config().get_configuration().set_string(
         "taskmanager.memory.task.off-heap.size", "80mb")
Example #8
    def test_get_set_idle_state_retention(self):
        table_config = TableConfig.get_default()

        table_config.set_idle_state_retention(datetime.timedelta(days=1))

        self.assertEqual(datetime.timedelta(days=1),
                         table_config.get_idle_state_retention())
Example #9
def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # StreamExecutionEnvironment

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """

    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """

    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
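
demo02 combines execute_sql DDL with the expression DSL (lit). A sketch of the imports it assumes; the Windows paths in the DDL are the author's own:

# Imports assumed by demo02.
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig
from pyflink.table.expressions import lit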
Example #10
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # StreamExecutionEnvironment

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    # raises an error if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
Example #11
    def test_get_set_max_generated_code_length(self):
        table_config = TableConfig.get_default()

        table_config.set_max_generated_code_length(32000)
        max_generated_code_length = table_config.get_max_generated_code_length()

        self.assertEqual(max_generated_code_length, 32000)
Example #12
    def test_add_configuration(self):
        table_config = TableConfig.get_default()
        configuration = Configuration()
        configuration.set_string("k1", "v1")

        table_config.add_configuration(configuration)

        self.assertEqual(table_config.get("k1", ""), "v1")
Example #13
    def test_get_set_idle_state_retention_time(self):
        table_config = TableConfig.get_default()

        table_config.set_idle_state_retention_time(
            datetime.timedelta(days=1), datetime.timedelta(days=2))
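        # Note: the planner derives the max idle state retention as
        # 1.5x the configured min, which is where the 3/2 factor in
        # the first assertion below comes from.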

        self.assertEqual(3 * 24 * 3600 * 1000 / 2, table_config.get_max_idle_state_retention_time())
        self.assertEqual(24 * 3600 * 1000, table_config.get_min_idle_state_retention_time())
Example #14
    def test_get_set_sql_dialect(self):
        table_config = TableConfig.get_default()

        sql_dialect = table_config.get_sql_dialect()
        self.assertEqual(sql_dialect, SqlDialect.DEFAULT)

        table_config.set_sql_dialect(SqlDialect.HIVE)
        sql_dialect = table_config.get_sql_dialect()
        self.assertEqual(sql_dialect, SqlDialect.HIVE)
Example #15
 def create_env(
         self) -> (StreamExecutionEnvironment, StreamTableEnvironment, StatementSet):
     exec_env = StreamExecutionEnvironment.get_execution_environment()
     exec_env.set_parallelism(1)
     t_config = TableConfig()
     t_env = StreamTableEnvironment.create(exec_env, t_config)
     t_env.get_config().get_configuration().set_string(
         "taskmanager.memory.task.off-heap.size", '80m')
     statement_set = t_env.create_statement_set()
     return exec_env, t_env, statement_set
Example #16
    def test_get_set_null_check(self):
        table_config = TableConfig.get_default()

        null_check = table_config.get_null_check()
        self.assertTrue(null_check)

        table_config.set_null_check(False)
        null_check = table_config.get_null_check()

        self.assertFalse(null_check)
Example #17
def word_count():
    result = wikipedia.page("New York City")
    content = result.summary


    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)
    print(add.add(10, 5))
    print("Word Count")
    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    #sink_ddl = """
    #    create table Results(
    #        word VARCHAR,
    #        `count` BIGINT
    #    ) with (
    #        'connector.type' = 'filesystem',
    #        'format.type' = 'csv',
    #        'connector.path' = '{}'
    #   )
    #    """.format(result_path)
    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')
    #t_env.sql_update(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Example #18
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    #t_config.set_python_executable("/opt/python38/bin/python3")
    # or add python.client.executable: /usr/bin/python3 to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])


    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results").wait()
Example #19
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, len(word), count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Example #20
def word_count():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    with open(source_path, 'w') as f:
        for word in content.split(" "):
            f.write(",".join([word, "1"]))
            f.write("\n")

    t_config = TableConfig.Builder().as_batch_execution().set_parallelism(
        1).build()
    t_env = TableEnvironment.create(t_config)

    field_names = ["word", "cout"]
    field_types = [DataTypes.STRING, DataTypes.LONG]

    # register Orders table in table environment
    t_env.register_table_source(
        "Word", CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)

    t_env.register_table_sink("Results", field_names, field_types,
                              CsvTableSink(tmp_csv))

    t_env.scan("Word") \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute()
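
TableConfig.Builder and TableEnvironment.create(t_config) come from an early PyFlink preview and do not exist in released versions. A rough sketch of the equivalent setup against the released API, using EnvironmentSettings:

# Sketch: released-API equivalent of the preview-era setup above.
from pyflink.table import EnvironmentSettings, TableEnvironment

settings = EnvironmentSettings.new_instance().in_batch_mode().build()
t_env = TableEnvironment.create(settings)
t_env.get_config().get_configuration().set_string("parallelism.default", "1")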
Example #21
def word_count():
    # read and concatenate the five input files
    content = ""
    for i in range(1, 6):
        with open("/home/mnm/flink-1.9.1/%d" % i, "r") as f:
            content += f.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("Python batch word count")
Example #22
    def test_get_set_decimal_context(self):
        table_config = TableConfig.get_default()

        table_config.set_decimal_context(20, "UNNECESSARY")
        self.assertEqual((20, "UNNECESSARY"), table_config.get_decimal_context())
        table_config.set_decimal_context(20, "HALF_EVEN")
        self.assertEqual((20, "HALF_EVEN"), table_config.get_decimal_context())
        table_config.set_decimal_context(20, "HALF_DOWN")
        self.assertEqual((20, "HALF_DOWN"), table_config.get_decimal_context())
        table_config.set_decimal_context(20, "HALF_UP")
        self.assertEqual((20, "HALF_UP"), table_config.get_decimal_context())
        table_config.set_decimal_context(20, "FLOOR")
        self.assertEqual((20, "FLOOR"), table_config.get_decimal_context())
        table_config.set_decimal_context(20, "CEILING")
        self.assertEqual((20, "CEILING"), table_config.get_decimal_context())
        table_config.set_decimal_context(20, "DOWN")
        self.assertEqual((20, "DOWN"), table_config.get_decimal_context())
        table_config.set_decimal_context(20, "UP")
        self.assertEqual((20, "UP"), table_config.get_decimal_context())
Example #23
def test_end_to_end():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)
    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)

    t_env.register_table_sink(
        "Results",
        field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
         .where("a > 0") \
         .select("a + 1, b, c") \
         .insert_into("Results")

    t_env.execute()
    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log" % os.environ["FLINK_HOME"])
Example #24
def word_count():
    # declare a table environment, set configurations.
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    output_file = os.path.abspath('.') + '/out.txt'
    if os.path.exists(output_file):
        try:
            if os.path.isfile(output_file):
                os.remove(output_file)
        except OSError as e:
            print("Error", e.filename, e.strerror)
    print("Results:", output_file)

    sink_ddl = """
            create table Results(
                word VARCHAR,
                `count` BIGINT
            ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
            )
        """.format(output_file)
    t_env.sql_update(sink_ddl)

    # create the source table from a single string,
    # perform some transformations, and write the results to table Results
    content = "Who's there? I think I hear them. Stand, ho! Who's there?"
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    # execute the Flink Python Table API job
    t_env.execute("word_count")
Example #25
    def run(self):
        exec_env = StreamExecutionEnvironment.get_execution_environment()
        exec_env.set_parallelism(1)
        t_config = TableConfig()
        t_env = StreamTableEnvironment.create(exec_env, t_config)

        t_env.connect(FileSystem().path('/tmp/input')) \
            .with_format(OldCsv()
                .field('word', DataTypes.STRING())) \
            .with_schema(Schema()
                .field('word', DataTypes.STRING())) \
            .create_temporary_table('mySource')

        t_env.connect(FileSystem().path('/tmp/output')) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        model = Model.fromFile('./../batch_ml/model.pmml')

        t_env.from_path('mySource') \
            .group_by('word') \
            .select('word, count(1)') \
            .insert_into('mySink')

        t_env.execute("tutorial_job")

        self.read_data()
        result = model.predict({
            "Sepal_Length": 5.1,
            "Sepal_Width": 3.5,
            "Petal_Length": 1.4,
            "Petal_Width": 0.2
        })
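
Model.fromFile matches the pypmml API, which this snippet appears to use; read_data is another method of the same class and is not shown. A sketch of the imports run() assumes:

# Imports assumed by run(); Model.fromFile is pypmml's entry point for
# loading a PMML file.
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import DataTypes, StreamTableEnvironment, TableConfig
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pypmml import Model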
Example #26
class main():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                 .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                 .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                    .field_delimiter('\t')
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    t_env.execute("tutorial_job")
Example #27
 def setUp(self):
     super(PyFlinkStreamTableTestCase, self).setUp()
     self.t_config = TableConfig.Builder().as_streaming_execution(
     ).set_parallelism(4).build()
     self.t_env = TableEnvironment.get_table_environment(self.t_config)
Example #28
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .with_schema(Schema().field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .register_table_sink('sink')

t_env.scan('inputSource').group_by('word').select('word, count(1)').insert_into('sink')

t_env.execute('my first job')
Example #29
    def test_set_get_python_executable(self):
        table_config = TableConfig()
        table_config.set_python_executable("/usr/bin/python3")

        self.assertEqual("/usr/bin/python3",
                         table_config.get_python_executable())
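
set_python_executable is shorthand for the python.executable option. The equivalent through the raw configuration, as a sketch:

# Equivalent to set_python_executable; "python.executable" is the
# underlying configuration key.
table_config = TableConfig()
table_config.get_configuration().set_string("python.executable", "/usr/bin/python3")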
Example #30
 def setUp(self):
     super(PyFlinkBatchTableTestCase, self).setUp()
     self.t_config = TableConfig.Builder().as_batch_execution(
     ).set_parallelism(1).build()
     self.t_env = TableEnvironment.create(self.t_config)