Code example #1
def main_flink():
    # Earlier steps fetch the raw file, preprocess it, and write the result to the 'input' file
    env = StreamExecutionEnvironment.get_execution_environment()
    parr_num = 4
    env.set_parallelism(parr_num)
    t_env = StreamTableEnvironment.create(env)
    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.STRING())
    def cut_extract(string):
        return cut_posseg.cut_extract(string)


    t_env.register_function("cut_extract",cut_extract)
    # Build the source table from the 'input' file. Question 1: is there a way to use a custom
    # in-memory list (e.g. [text_a, text_b]) as the source instead, to save the I/O overhead?
    t_env.connect(FileSystem().path('/home/sjtu/input')) \
        .with_format(OldCsv()
                     .field('text', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('text', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    t_env.connect(FileSystem().path('/home/sjtu/output')) \
        .with_format(OldCsv()
                     .field('result', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('result', DataTypes.STRING())) \
        .create_temporary_table('mySink')

    t_env.from_path('mySource')\
        .select("cut_extract(text)")\
        .insert_into('mySink')
    # Question 2: the result table is written out to a file here, but this script still needs to
    # keep processing the computed data. Is there a way to pull the mySink table back as
    # in-memory data instead of re-reading it from disk?
    t_env.execute("tutorial_job")
Code example #2
    def test_temporary_tables(self):
        t_env = self.t_env
        t_env.connect(FileSystem().path(os.path.join(self.tempdir + '/temp_1.csv'))) \
            .with_format(OldCsv()
                         .field_delimiter(',')
                         .field("a", DataTypes.INT())
                         .field("b", DataTypes.STRING())) \
            .with_schema(Schema()
                         .field("a", DataTypes.INT())
                         .field("b", DataTypes.STRING())) \
            .create_temporary_table("temporary_table_1")

        t_env.connect(FileSystem().path(os.path.join(self.tempdir + '/temp_2.csv'))) \
            .with_format(OldCsv()
                         .field_delimiter(',')
                         .field("a", DataTypes.INT())
                         .field("b", DataTypes.STRING())) \
            .with_schema(Schema()
                         .field("a", DataTypes.INT())
                         .field("b", DataTypes.STRING())) \
            .create_temporary_table("temporary_table_2")

        actual = t_env.list_temporary_tables()
        expected = ['temporary_table_1', 'temporary_table_2']
        self.assert_equals(actual, expected)

        t_env.drop_temporary_table("temporary_table_1")
        actual = t_env.list_temporary_tables()
        expected = ['temporary_table_2']
        self.assert_equals(actual, expected)
Code example #3
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # StreamExecutionEnvironment

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    # Writing fails if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
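Because the sink errors out when the output file already exists (see the comment above), repeated runs need the old output cleared first. A small guard sketch, reusing this example's path and the cleanup pattern from the word-count examples below:

    import os
    import shutil

    output_path = r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output'
    if os.path.exists(output_path):
        # remove a stale file or directory before the job writes to it
        if os.path.isdir(output_path):
            shutil.rmtree(output_path)
        else:
            os.remove(output_path)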
Code example #4
File: test_descriptor.py  Project: zhaoawd/flink
    def test_field_delimiter(self):
        csv = OldCsv().field_delimiter("|")

        properties = csv.to_properties()
        expected = {'format.field-delimiter': '|',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
Code example #5
File: test_descriptor.py  Project: zhaoawd/flink
    def test_ignore_parse_errors(self):
        csv = OldCsv().ignore_parse_errors()

        properties = csv.to_properties()
        expected = {'format.ignore-parse-errors': 'true',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
Code example #6
File: test_descriptor.py  Project: zhaoawd/flink
    def test_quote_character(self):
        csv = OldCsv().quote_character("*")

        properties = csv.to_properties()
        expected = {'format.quote-character': '*',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
Code example #7
File: test_descriptor.py  Project: zhaoawd/flink
    def test_comment_prefix(self):
        csv = OldCsv().comment_prefix("#")

        properties = csv.to_properties()
        expected = {'format.comment-prefix': '#',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
Code example #8
File: test_descriptor.py  Project: zhaoawd/flink
    def test_ignore_first_line(self):
        csv = OldCsv().ignore_first_line()

        properties = csv.to_properties()
        expected = {'format.ignore-first-line': 'true',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
Code example #9
File: test_descriptor.py  Project: zhaoawd/flink
    def test_line_delimiter(self):
        csv = OldCsv().line_delimiter(";")

        expected = {'format.type': 'csv',
                    'format.property-version': '1',
                    'format.line-delimiter': ';'}

        properties = csv.to_properties()
        self.assertEqual(expected, properties)
Code example #10
File: test_descriptor.py  Project: zhaoawd/flink
    def test_schema(self):
        csv = OldCsv()
        schema = TableSchema(["a", "b"], [DataTypes.INT(), DataTypes.STRING()])

        csv = csv.schema(schema)

        properties = csv.to_properties()
        expected = {'format.fields.0.name': 'a',
                    'format.fields.0.data-type': 'INT',
                    'format.fields.1.name': 'b',
                    'format.fields.1.data-type': 'VARCHAR(2147483647)',
                    'format.type': 'csv',
                    'format.property-version': '1'}

        self.assertEqual(expected, properties)
Code example #11
 def _local_execute_func(exec_func, write_func, pickle_func, python_path):
     table_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance(
         ).use_blink_planner().in_batch_mode().build())
     table_env.get_config().get_configuration().set_string(
         'parallelism.default', '1')
     table_env.get_config().set_python_executable(python_path)
     table_env.register_function(
         exec_func,
         udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
     table_env.connect(FileSystem().path(write_func)) \
         .with_format(OldCsv().field('func', DataTypes.STRING())) \
         .with_schema(Schema().field('func', DataTypes.STRING())) \
         .create_temporary_table(exec_func)
     table = table_env.from_elements([(1, 'Joblib')])
     table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
     table_env.execute(exec_func)
     # decode execution result from table sink file.
     execute_result = cloudpickle.loads(
         codecs.decode(
             pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
             'base64'))
     # remove table sink file to clear ineffective files.
     os.remove(write_func)
     return execute_result
Code example #12
 def execute(self, function_context: FlinkFunctionContext,
             input_list: List[Table]) -> List[Table]:
     t_env = function_context.get_table_env()
     statement_set = function_context.get_statement_set()
     table = input_list[0]
     Popen('rm -rf /root/debug', shell=True)
     t_env.register_function(
         "build_index",
         udf(BuildIndexUDF(self.path, self.element_type, self.dimension),
             [DataTypes.STRING(), DataTypes.STRING()], DataTypes.STRING()))
     dummy_output_path = '/tmp/indexed_key'
     if os.path.exists(dummy_output_path):
         if os.path.isdir(dummy_output_path):
             shutil.rmtree(dummy_output_path)
         else:
             os.remove(dummy_output_path)
     t_env.connect(FileSystem().path(dummy_output_path)) \
         .with_format(OldCsv()
                      .field('key', DataTypes.STRING())) \
         .with_schema(Schema()
                      .field('key', DataTypes.STRING())) \
         .create_temporary_table('train_sink')
     statement_set.add_insert(
         "train_sink", table.select("build_index(uuid, feature_data)"))
     return []
Code example #13
File: test_descriptor.py  Project: pinggle/flink
    def test_register_table_source_and_register_table_sink(self):
        self.env.set_parallelism(1)
        source_path = os.path.join(self.tempdir + '/streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        self.prepare_csv_source(source_path, data, field_types, field_names)
        sink_path = os.path.join(self.tempdir + '/streaming2.csv')
        if os.path.isfile(sink_path):
            os.remove(sink_path)

        t_env = self.t_env
        # register_table_source
        t_env.connect(FileSystem().path(source_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .register_table_source("source")

        # register_table_sink
        t_env.connect(FileSystem().path(sink_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .register_table_sink("sink")

        t_env.scan("source") \
             .select("a + 1, b, c") \
             .insert_into("sink")
        self.t_env.execute("test")

        with open(sink_path, 'r') as f:
            lines = f.read()
            assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'
Code example #14
 def process(self, execution_context: flink.ExecutionContext, input_list: List[Table] = None) -> List[Table]:
     input_file = os.path.join(os.getcwd(), 'resources', 'word_count.txt')
     t_env = execution_context.table_env
     t_env.connect(FileSystem().path(input_file)) \
         .with_format(OldCsv()
                      .field('word', DataTypes.STRING())) \
         .with_schema(Schema()
                      .field('word', DataTypes.STRING())) \
         .create_temporary_table('mySource')
     return [t_env.from_path('mySource')]
Code example #15
 def execute(self, function_context: FlinkFunctionContext) -> Table:
     example_meta: af.ExampleMeta = function_context.get_example_meta()
     t_env = function_context.get_table_env()
     t_env.connect(FileSystem().path(example_meta.batch_uri)) \
         .with_format(OldCsv()
                      .field('word', DataTypes.STRING())) \
         .with_schema(Schema()
                      .field('word', DataTypes.STRING())) \
         .create_temporary_table('mySource')
     return t_env.from_path('mySource')
Code example #16
    def test_register_temporary_table(self):
        self.t_env.get_config().get_configuration().set_string(
            "parallelism.default", "1")
        source_path = os.path.join(self.tempdir + '/streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        self.prepare_csv_source(source_path, data, field_types, field_names)
        sink_path = os.path.join(self.tempdir + '/streaming2.csv')
        if os.path.isfile(sink_path):
            os.remove(sink_path)
        t_env = self.t_env

        t_env.connect(FileSystem().path(source_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .create_temporary_table("source")
        t_env.connect(FileSystem().path(sink_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .create_temporary_table("sink")
        t_env.from_path("source").select("a + 1, b, c").execute_insert(
            "sink").wait()

        with open(sink_path, 'r') as f:
            lines = f.read()
            assert lines == '2,Hi,Hello\n' + "3,Hello,Hello\n"
Code example #17
File: word_count.py  Project: Bhola-B2C/PyFlink
def word_count():
    result = wikipedia.page("New York City")
    content = result.summary


    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)
    print(add.add(10, 5))
    print("Word Count")
    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    #sink_ddl = """
    #    create table Results(
    #        word VARCHAR,
    #        `count` BIGINT
    #    ) with (
    #        'connector.type' = 'filesystem',
    #        'format.type' = 'csv',
    #        'connector.path' = '{}'
    #   )
    #    """.format(result_path)
    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')
    #t_env.sql_update(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Code example #18
File: test_descriptor.py  Project: zhaoawd/flink
    def test_with_schema(self):
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor.with_format(OldCsv()).with_schema(Schema().field("a", "INT"))

        properties = descriptor.to_properties()
        expected = {'schema.0.name': 'a',
                    'schema.0.data-type': 'INT',
                    'format.type': 'csv',
                    'format.property-version': '1',
                    'connector.type': 'filesystem',
                    'connector.property-version': '1'}
        assert properties == expected
Code example #19
File: Stream.py  Project: nicolas2lee/fraud-detection
    def run(self):
        exec_env = ExecutionEnvironment.get_execution_environment()
        exec_env.set_parallelism(1)
        t_config = TableConfig()
        t_env = StreamTableEnvironment.create(exec_env, t_config)

        t_env.connect(FileSystem().path('/tmp/input')) \
            .with_format(OldCsv()
                .field('word', DataTypes.STRING())) \
            .with_schema(Schema()
                .field('word', DataTypes.STRING())) \
            .create_temporary_table('mySource')

        t_env.connect(FileSystem().path('/tmp/output')) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        model = Model.fromFile('./../batch_ml/model.pmml')

        t_env.from_path('mySource') \
            .group_by('word') \
            .select('word, count(1)') \
            .insert_into('mySink')

        t_env.execute("tutorial_job")

        self.read_data()
        result = model.predict({
            "Sepal_Length": 5.1,
            "Sepal_Width": 3.5,
            "Petal_Length": 1.4,
            "Petal_Width": 0.2
        })
Code example #20
File: test_descriptor.py  Project: Temitope-A/cog
    def test_in_append_mode(self):
        descriptor = self.t_env.connect(FileSystem())

        descriptor = descriptor\
            .with_format(OldCsv())\
            .in_append_mode()

        properties = descriptor.to_properties()
        expected = {'update-mode': 'append',
                    'format.type': 'csv',
                    'format.property-version': '1',
                    'connector.property-version': '1',
                    'connector.type': 'filesystem'}
        assert properties == expected
Code example #21
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, len(word), count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Code example #22
    def process(self, execution_context: flink.ExecutionContext, input_list: List[Table] = None) -> List[Table]:
        output_file = os.path.join(os.getcwd(), 'output')
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = execution_context.table_env
        statement_set = execution_context.statement_set
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_list[0])
        return []
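Returning an empty list here signals that the surrounding runner owns execution of the registered statement set. Outside such a runner, and assuming statement_set is the underlying PyFlink StatementSet (1.11+), triggering it directly is one line:

    statement_set.execute().wait()  # submits all add_insert() pipelines as one job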
Code example #23
    def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
        example_meta: af.ExampleMeta = function_context.get_example_meta()
        output_file = example_meta.batch_uri
        if os.path.exists(output_file):
            os.remove(output_file)

        t_env = function_context.get_table_env()
        statement_set = function_context.get_statement_set()
        t_env.connect(FileSystem().path(output_file)) \
            .with_format(OldCsv()
                         .field_delimiter('\t')
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .with_schema(Schema()
                         .field('word', DataTypes.STRING())
                         .field('count', DataTypes.BIGINT())) \
            .create_temporary_table('mySink')
        statement_set.add_insert('mySink', input_table)
Code example #24
def word_count():
    content = ""
    # concatenate the five input files, closing each handle properly
    for i in range(1, 6):
        with open("/home/mnm/flink-1.9.1/{}".format(i), "r") as f:
            content += f.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("Python batch word count")
Code example #25
File: test_descriptor.py  Project: zhaoawd/flink
    def test_field(self):
        csv = OldCsv()

        csv.field("a", DataTypes.BIGINT())
        csv.field("b", DataTypes.STRING())
        csv.field("c", "SQL_TIMESTAMP")

        properties = csv.to_properties()
        expected = {'format.fields.0.name': 'a',
                    'format.fields.0.data-type': 'BIGINT',
                    'format.fields.1.name': 'b',
                    'format.fields.1.data-type': 'VARCHAR(2147483647)',
                    'format.fields.2.name': 'c',
                    'format.fields.2.data-type': 'TIMESTAMP(3)',
                    'format.type': 'csv',
                    'format.property-version': '1'}
        self.assertEqual(expected, properties)
Code example #26
class main():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                 .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                 .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                    .field_delimiter('\t')
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    t_env.execute("tutorial_job")
Code example #27
File: 3-udf_add.py  Project: wupengbo125/penter
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.udf import udf

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)

add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())
t_env.register_function("add", add)

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_input')) \
    .with_format(OldCsv()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('a', DataTypes.BIGINT())
                 .field('b', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/opt/examples/data/udf_add_output')) \
    .with_format(OldCsv()
                 .field('sum', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sum', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

t_env.from_path('mySource')\
    .select("add(a, b)") \
    .insert_into('mySink')
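Note that with this legacy API, insert_into only registers the pipeline; nothing runs until the environment is executed, as the other examples here do. A one-line addition (the job name is illustrative):

    t_env.execute("udf_add_job")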
Code example #28
def word_count():
    environment_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    # set the Python executable explicitly, in case `python` does not point to Python 3
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ), ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ), ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ), ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ), ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ), ("AirPods Pro,40,1999,Shanghai", )
    ]
    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
Code example #29
File: wordcount.py  Project: imfht/flaskapps
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings = EnvironmentSettings.new_instance()
    .use_blink_planner()
    .build(),
)

result_path = '/notebooks/output.csv'

print('Results directory:', result_path)

t_env.connect(FileSystem().path(result_path)).with_format(
    OldCsv()
    .field_delimiter(',')
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).with_schema(
    Schema()
    .field('word', DataTypes.STRING())
    .field('count', DataTypes.BIGINT())
).register_table_sink(
    'Results'
)

elements = [(word, 1) for word in content.split(' ')]

t_env.from_elements(elements, ['word', 'count']).group_by('word').select(
    'word, count(1) as count'
).insert_into('Results')
Code example #30
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .with_schema(Schema().field('word', DataTypes.STRING()).field('count', DataTypes.BIGINT()))\
    .register_table_sink('sink')

t_env.scan('inputSource').group_by('word').select('word, count(1)').insert_into('sink')

t_env.execute('my first job')