    def test_aggregate(self):
        import pandas as pd
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.BIGINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        function = CountAndSumAggregateFunction()
        agg = udaf(function,
                   result_type=function.get_result_type(),
                   accumulator_type=function.get_accumulator_type(),
                   name=str(function.__class__.__name__))
        result = t.group_by(t.a) \
            .aggregate(agg(t.b).alias("c", "d")) \
            .select("a, c, d") \
            .to_pandas()
        assert_frame_equal(result, pd.DataFrame([[1, 3, 15], [2, 2, 4]], columns=['a', 'c', 'd']))
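A note on the function used above: CountAndSumAggregateFunction itself is not
shown in this listing. A minimal sketch of what such a function could look
like, assuming the general Python AggregateFunction interface of PyFlink 1.12+
(the real implementation may differ):

from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes

class CountAndSumAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # the accumulator holds (count, sum)
        return Row(0, 0)

    def accumulate(self, accumulator, value):
        accumulator[0] += 1
        accumulator[1] += value

    def get_value(self, accumulator):
        # emit (count, sum), matching the two-field result type below
        return Row(accumulator[0], accumulator[1])

    def get_result_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.BIGINT())])

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.BIGINT())])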
Example #2
    def test_from_origin_field(self):
        schema = Schema()

        schema = schema\
            .field("int_field", DataTypes.INT())\
            .field("long_field", DataTypes.BIGINT()).from_origin_field("origin_field_a")\
            .field("string_field", DataTypes.STRING())

        properties = schema.to_properties()
        expected = {
            'schema.0.name': 'int_field',
            'schema.0.type': 'INT',
            'schema.1.name': 'long_field',
            'schema.1.type': 'BIGINT',
            'schema.1.from': 'origin_field_a',
            'schema.2.name': 'string_field',
            'schema.2.type': 'VARCHAR'
        }
        assert properties == expected
Example #3
    def test_basic_type(self):
        test_types = [DataTypes.STRING(),
                      DataTypes.BOOLEAN(),
                      DataTypes.BYTES(),
                      DataTypes.TINYINT(),
                      DataTypes.SMALLINT(),
                      DataTypes.INT(),
                      DataTypes.BIGINT(),
                      DataTypes.FLOAT(),
                      DataTypes.DOUBLE(),
                      DataTypes.DATE(),
                      DataTypes.TIME(),
                      DataTypes.TIMESTAMP(3)]

        java_types = [_to_java_type(item) for item in test_types]

        converted_python_types = [_from_java_type(item) for item in java_types]

        self.assertEqual(test_types, converted_python_types)
Example #4
    def test_proctime(self):
        schema = Schema()

        schema = schema\
            .field("int_field", DataTypes.INT())\
            .field("ptime", DataTypes.BIGINT()).proctime()\
            .field("string_field", DataTypes.STRING())

        properties = schema.to_properties()
        expected = {
            'schema.0.name': 'int_field',
            'schema.0.type': 'INT',
            'schema.1.name': 'ptime',
            'schema.1.type': 'BIGINT',
            'schema.1.proctime': 'true',
            'schema.2.name': 'string_field',
            'schema.2.type': 'VARCHAR'
        }
        assert properties == expected
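The rowtime counterpart of proctime() goes through the Rowtime descriptor; a
minimal sketch using the same old descriptor API (field names hypothetical):

from pyflink.table.descriptors import Rowtime, Schema
from pyflink.table.types import DataTypes

schema = Schema() \
    .field("rtime", DataTypes.TIMESTAMP(3)) \
    .rowtime(Rowtime()
             .timestamps_from_field("origin_ts")
             .watermarks_periodic_ascending())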
Example #5
    def test_sql_update(self):
        t_env = self.t_env
        source = t_env.from_elements([(1, "Hi", "Hello"),
                                      (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env.register_table_sink("sinks", field_names, field_types,
                                  source_sink_utils.TestAppendSink())

        t_env.sql_update("insert into sinks select * from %s" % source)
        t_env.execute("test_sql_job")

        actual = source_sink_utils.results()
        expected = ['1,Hi,Hello', '2,Hello,Hello']
        self.assert_equals(actual, expected)
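sql_update() followed by a named execute() call is the pre-1.11 API. On
PyFlink 1.11+ the same insert is a single execute_sql() call; a minimal
sketch, registering the source Table under the hypothetical view name src:

t_env.create_temporary_view("src", source)
t_env.execute_sql("INSERT INTO sinks SELECT * FROM src").wait()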
Example #6
    def test_register_table_sink(self):
        t_env = self.t_env
        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env.register_table_sink(
            "Sinks", source_sink_utils.TestAppendSink(field_names,
                                                      field_types))

        t_env.from_elements([(1, "Hi", "Hello")],
                            ["a", "b", "c"]).insert_into("Sinks")
        self.env.execute()
        actual = source_sink_utils.results()

        expected = ['1,Hi,Hello']
        self.assert_equals(actual, expected)
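On newer PyFlink versions a sink is usually declared through DDL rather than
register_table_sink(); a minimal sketch using the built-in print connector:

t_env.execute_sql("""
    CREATE TABLE Sinks (a BIGINT, b STRING, c STRING)
    WITH ('connector' = 'print')
""")
t_env.from_elements([(1, "Hi", "Hello")], ["a", "b", "c"]) \
    .execute_insert("Sinks").wait()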
Example #7
    def test_field(self):
        csv = OldCsv()

        csv.field("a", DataTypes.BIGINT())
        csv.field("b", DataTypes.STRING())
        csv.field("c", "SQL_TIMESTAMP")

        properties = csv.to_properties()
        expected = {
            'format.fields.0.name': 'a',
            'format.fields.0.type': 'BIGINT',
            'format.fields.1.name': 'b',
            'format.fields.1.type': 'VARCHAR',
            'format.fields.2.name': 'c',
            'format.fields.2.type': 'SQL_TIMESTAMP',
            'format.type': 'csv',
            'format.property-version': '1'
        }
        self.assertEqual(expected, properties)
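OldCsv is normally combined with FileSystem and Schema in a connect()
pipeline (old descriptor API, removed in recent releases); a minimal sketch
with a hypothetical file path:

from pyflink.table.descriptors import FileSystem, OldCsv, Schema

t_env.connect(FileSystem().path("/tmp/input.csv")) \
    .with_format(OldCsv()
                 .field("a", DataTypes.BIGINT())
                 .field("b", DataTypes.STRING())) \
    .with_schema(Schema()
                 .field("a", DataTypes.BIGINT())
                 .field("b", DataTypes.STRING())) \
    .create_temporary_table("csv_source")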
Example #8
    def test_field(self):
        schema = Schema()

        schema = schema\
            .field("int_field", DataTypes.INT())\
            .field("long_field", DataTypes.BIGINT())\
            .field("string_field", DataTypes.STRING())\
            .field("timestamp_field", DataTypes.TIMESTAMP())\
            .field("time_field", DataTypes.TIME())\
            .field("date_field", DataTypes.DATE())\
            .field("double_field", DataTypes.DOUBLE())\
            .field("float_field", DataTypes.FLOAT())\
            .field("byte_field", DataTypes.TINYINT())\
            .field("short_field", DataTypes.SMALLINT())\
            .field("boolean_field", DataTypes.BOOLEAN())

        properties = schema.to_properties()
        expected = {
            'schema.0.name': 'int_field',
            'schema.0.type': 'INT',
            'schema.1.name': 'long_field',
            'schema.1.type': 'BIGINT',
            'schema.2.name': 'string_field',
            'schema.2.type': 'VARCHAR',
            'schema.3.name': 'timestamp_field',
            'schema.3.type': 'TIMESTAMP',
            'schema.4.name': 'time_field',
            'schema.4.type': 'TIME',
            'schema.5.name': 'date_field',
            'schema.5.type': 'DATE',
            'schema.6.name': 'double_field',
            'schema.6.type': 'DOUBLE',
            'schema.7.name': 'float_field',
            'schema.7.type': 'FLOAT',
            'schema.8.name': 'byte_field',
            'schema.8.type': 'TINYINT',
            'schema.9.name': 'short_field',
            'schema.9.type': 'SMALLINT',
            'schema.10.name': 'boolean_field',
            'schema.10.type': 'BOOLEAN'
        }
        assert properties == expected
Example #9
    def test_sql_query(self):
        t_env = self.t_env
        source = t_env.from_elements([(1, "Hi", "Hello"),
                                      (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env.register_table_sink("sinks", field_names, field_types,
                                  source_sink_utils.TestAppendSink())

        result = t_env.sql_query("select a + 1, b, c from %s" % source)
        result.insert_into("sinks")
        t_env.execute()
        actual = source_sink_utils.results()

        expected = ['2,Hi,Hello', '3,Hello,Hello']
        self.assert_equals(actual, expected)
Example #10
    def test_map_with_pandas_udf(self):
        t = self.t_env.from_elements(
            [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b",
                                 DataTypes.ROW([DataTypes.FIELD("c", DataTypes.INT()),
                                                DataTypes.FIELD("d", DataTypes.INT())]))]))

        sink_table_ddl = """
        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)

        def func(x):
            import pandas as pd
            res = pd.concat([x.a, x.c + x.d], axis=1)
            return res

        def func2(x):
            return x * 2

        def func3(x):
            assert isinstance(x, Row)
            return x

        pandas_udf = udf(func,
                         result_type=DataTypes.ROW(
                             [DataTypes.FIELD("c", DataTypes.BIGINT()),
                              DataTypes.FIELD("d", DataTypes.BIGINT())]),
                         func_type='pandas')

        pandas_udf_2 = udf(func2,
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("c", DataTypes.BIGINT()),
                                DataTypes.FIELD("d", DataTypes.BIGINT())]),
                           func_type='pandas')

        general_udf = udf(func3,
                          result_type=DataTypes.ROW(
                              [DataTypes.FIELD("c", DataTypes.BIGINT()),
                               DataTypes.FIELD("d", DataTypes.BIGINT())]))

        t.map(pandas_udf).map(pandas_udf_2).map(general_udf).execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual,
            ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
Example #11
    def test_rename_columns(self):
        t_env = self.t_env
        t = t_env.from_elements([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')],
                                ['a', 'b', 'c'])
        field_names = ["d", "e", "f"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env.register_table_sink("Results", field_names, field_types,
                                  source_sink_utils.TestAppendSink())

        result = t.select("a, b, c").rename_columns(
            "a as d, c as f, b as e").select("d, e, f")
        result.insert_into("Results")
        t_env.execute()
        actual = source_sink_utils.results()

        expected = ['1,Hi,Hello', '2,Hello,Hello']
        self.assert_equals(actual, expected)
Example #12
    def test_field(self):
        schema = Schema()\
            .field("int_field", DataTypes.INT())\
            .field("long_field", DataTypes.BIGINT())\
            .field("string_field", DataTypes.STRING())\
            .field("timestamp_field", DataTypes.TIMESTAMP(3))\
            .field("time_field", DataTypes.TIME())\
            .field("date_field", DataTypes.DATE())\
            .field("double_field", DataTypes.DOUBLE())\
            .field("float_field", DataTypes.FLOAT())\
            .field("byte_field", DataTypes.TINYINT())\
            .field("short_field", DataTypes.SMALLINT())\
            .field("boolean_field", DataTypes.BOOLEAN())

        properties = schema.to_properties()
        expected = {
            'schema.0.name': 'int_field',
            'schema.0.data-type': 'INT',
            'schema.1.name': 'long_field',
            'schema.1.data-type': 'BIGINT',
            'schema.2.name': 'string_field',
            'schema.2.data-type': 'VARCHAR(2147483647)',
            'schema.3.name': 'timestamp_field',
            'schema.3.data-type': 'TIMESTAMP(3)',
            'schema.4.name': 'time_field',
            'schema.4.data-type': 'TIME(0)',
            'schema.5.name': 'date_field',
            'schema.5.data-type': 'DATE',
            'schema.6.name': 'double_field',
            'schema.6.data-type': 'DOUBLE',
            'schema.7.name': 'float_field',
            'schema.7.data-type': 'FLOAT',
            'schema.8.name': 'byte_field',
            'schema.8.data-type': 'TINYINT',
            'schema.9.name': 'short_field',
            'schema.9.data-type': 'SMALLINT',
            'schema.10.name': 'boolean_field',
            'schema.10.data-type': 'BOOLEAN'
        }
        self.assertEqual(expected, properties)
Example #13
    def test_map_with_pandas_udf(self):
        t = self.t_env.from_elements(
            [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)),
             (2, Row(3, 4))],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD(
                    "b",
                    DataTypes.ROW([
                        DataTypes.FIELD("c", DataTypes.INT()),
                        DataTypes.FIELD("d", DataTypes.INT())
                    ]))
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        def func(x):
            import pandas as pd
            res = pd.concat([x.a, x.c + x.d], axis=1)
            return res

        def func2(x):
            return x * 2

        pandas_udf = udf(func,
                         result_type=DataTypes.ROW([
                             DataTypes.FIELD("c", DataTypes.BIGINT()),
                             DataTypes.FIELD("d", DataTypes.BIGINT())
                         ]),
                         func_type='pandas')

        pandas_udf_2 = udf(func2,
                           result_type=DataTypes.ROW([
                               DataTypes.FIELD("c", DataTypes.BIGINT()),
                               DataTypes.FIELD("d", DataTypes.BIGINT())
                           ]),
                           func_type='pandas')

        t.map(pandas_udf).map(pandas_udf_2).execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual,
            ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
Example #14
    def test_flat_map(self):
        t = self.t_env.from_elements(
            [(1, "2,3"), (2, "1"), (1, "5,6,7")],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.STRING())]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.STRING()])
        self.t_env.register_table_sink("Results", table_sink)

        @udtf(result_types=[DataTypes.INT(), DataTypes.STRING()])
        def split(x):
            for s in x[1].split(","):
                yield x[0], s

        t.flat_map(split) \
            .flat_map(split) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,2", "1,3", "2,1", "1,5", "1,6", "1,7"])
Example #15
    def test_sql_update_with_query_config(self):
        t_env = self.t_env
        source = t_env.from_elements([(1, "Hi", "Hello"),
                                      (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env.register_table_sink("sinks", field_names, field_types,
                                  source_sink_utils.TestAppendSink())
        query_config = t_env.query_config()
        query_config.with_idle_state_retention_time(datetime.timedelta(days=1),
                                                    datetime.timedelta(days=2))

        t_env.sql_update("insert into sinks select * from %s" % source,
                         query_config)
        t_env.execute("test_sql_job")

        actual = source_sink_utils.results()
        expected = ['1,Hi,Hello', '2,Hello,Hello']
        self.assert_equals(actual, expected)
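query_config() was removed in later releases; the idle state retention above
is configured on TableConfig in recent PyFlink versions. A minimal sketch
(method name assumed from recent releases):

import datetime

t_env.get_config().set_idle_state_retention(datetime.timedelta(days=1))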
Example #16
    def test_parquet_columnar_basic(self):
        parquet_file_name = tempfile.mktemp(suffix='.parquet',
                                            dir=self.tempdir)
        schema, records = _create_basic_avro_schema_and_records()
        FileSourceParquetAvroFormatTests._create_parquet_avro_file(
            parquet_file_name, schema, records)
        row_type = DataTypes.ROW([
            DataTypes.FIELD(
                'null',
                DataTypes.STRING()),  # DataTypes.NULL cannot be serialized
            DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
            DataTypes.FIELD('int', DataTypes.INT()),
            DataTypes.FIELD('long', DataTypes.BIGINT()),
            DataTypes.FIELD('float', DataTypes.FLOAT()),
            DataTypes.FIELD('double', DataTypes.DOUBLE()),
            DataTypes.FIELD('string', DataTypes.STRING()),
            DataTypes.FIELD('unknown', DataTypes.STRING())
        ])
        self._build_parquet_columnar_job(row_type, parquet_file_name)
        self.env.execute('test_parquet_columnar_basic')
        results = self.test_sink.get_results(True, False)
        _check_basic_avro_schema_results(self, results)
        self.assertIsNone(results[0]['unknown'])
        self.assertIsNone(results[1]['unknown'])
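The _build_parquet_columnar_job helper is test-internal and not shown here.
For reference, a parquet file can also be read through Table API DDL with the
filesystem connector, assuming the flink-parquet format jar is on the
classpath (path hypothetical):

t_env.execute_sql("""
    CREATE TABLE parquet_source (
        `boolean` BOOLEAN, `int` INT, `long` BIGINT,
        `float` FLOAT, `double` DOUBLE, `string` STRING
    ) WITH (
        'connector' = 'filesystem',
        'path' = '/tmp/data.parquet',
        'format' = 'parquet'
    )
""")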
Example #17
    def test_merge_type(self):
        self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.NULL()),
                         DataTypes.BIGINT())
        self.assertEqual(_merge_type(DataTypes.NULL(), DataTypes.BIGINT()),
                         DataTypes.BIGINT())

        self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.BIGINT()),
                         DataTypes.BIGINT())

        self.assertEqual(
            _merge_type(DataTypes.ARRAY(DataTypes.BIGINT()),
                        DataTypes.ARRAY(DataTypes.BIGINT())),
            DataTypes.ARRAY(DataTypes.BIGINT()))
        with self.assertRaises(TypeError):
            _merge_type(DataTypes.ARRAY(DataTypes.BIGINT()),
                        DataTypes.ARRAY(DataTypes.DOUBLE()))

        self.assertEqual(
            _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()))
        with self.assertRaises(TypeError):
            _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                        DataTypes.MAP(DataTypes.DOUBLE(), DataTypes.BIGINT()))
        with self.assertRaises(TypeError):
            _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                        DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()))

        self.assertEqual(
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.BIGINT()),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.BIGINT()),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ])),
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.BIGINT()),
                DataTypes.FIELD('f2', DataTypes.STRING())
            ]))
        with self.assertRaises(TypeError):
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.BIGINT()),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.DOUBLE()),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]))

        self.assertEqual(
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ROW(
                            [DataTypes.FIELD('f2', DataTypes.BIGINT())]))
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ROW(
                            [DataTypes.FIELD('f2', DataTypes.BIGINT())]))
                ])),
            DataTypes.ROW([
                DataTypes.FIELD(
                    'f1',
                    DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))
            ]))
        with self.assertRaises(TypeError):
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ROW(
                            [DataTypes.FIELD('f2', DataTypes.BIGINT())]))
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ROW(
                            [DataTypes.FIELD('f2', DataTypes.STRING())]))
                ]))

        self.assertEqual(
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ])),
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())
            ]))
        with self.assertRaises(TypeError):
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.DOUBLE())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]))

        self.assertEqual(
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ])),
            DataTypes.ROW([
                DataTypes.FIELD(
                    'f1', DataTypes.MAP(DataTypes.STRING(),
                                        DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())
            ]))
        with self.assertRaises(TypeError):
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE())),
                    DataTypes.FIELD('f2', DataTypes.STRING())
                ]))

        self.assertEqual(
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ARRAY(
                            DataTypes.MAP(DataTypes.STRING(),
                                          DataTypes.BIGINT())))
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ARRAY(
                            DataTypes.MAP(DataTypes.STRING(),
                                          DataTypes.BIGINT())))
                ])),
            DataTypes.ROW([
                DataTypes.FIELD(
                    'f1',
                    DataTypes.ARRAY(
                        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))
            ]))
        with self.assertRaises(TypeError):
            _merge_type(
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ARRAY(
                            DataTypes.MAP(DataTypes.STRING(),
                                          DataTypes.BIGINT())))
                ]),
                DataTypes.ROW([
                    DataTypes.FIELD(
                        'f1',
                        DataTypes.ARRAY(
                            DataTypes.MAP(DataTypes.DOUBLE(),
                                          DataTypes.BIGINT())))
                ]))
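_merge_type is the internal helper behind schema inference in from_elements;
a minimal sketch of the behaviour the test pins down (no explicit schema
passed, so the element types must merge):

# NULL merges into any type; conflicting element types raise TypeError
t = t_env.from_elements([(1,), (None,)])  # column inferred as nullable BIGINT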
Example #18
    def test_repr(self):
        schema = TableSchema(["a", "b", "c"],
                             [DataTypes.INT(), DataTypes.BIGINT(), DataTypes.STRING()])

        expected = "root\n |-- a: INT\n |-- b: BIGINT\n |-- c: STRING\n"
        self.assertEqual(expected, repr(schema))
    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("f0", DataTypes.LIST_VIEW(DataTypes.STRING())),
            DataTypes.FIELD("f1", DataTypes.BIGINT())
        ])

    def get_result_type(self):
        return DataTypes.BIGINT()

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())

    def get_result_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.BIGINT())
        ])

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.BIGINT())
        ])
def word_count():
    environment_settings = EnvironmentSettings.new_instance() \
        .in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(
        environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    # set the Python executable explicitly in case `python` does not point to
    # a Python 3 interpreter
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()),
                      DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    data = [
        ("iPhone 11,30,5499,Beijing", ), ("iPhone 11 Pro,20,8699,Guangzhou", ),
        ("MacBook Pro,10,9999,Beijing", ), ("AirPods Pro,50,1999,Beijing", ),
        ("MacBook Pro,10,11499,Shanghai", ), ("iPhone 11,30,5999,Shanghai", ),
        ("iPhone 11 Pro,20,9999,Shenzhen", ),
        ("MacBook Pro,10,13899,Hangzhou", ), ("iPhone 11,10,6799,Beijing", ),
        ("MacBook Pro,10,18999,Beijing", ),
        ("iPhone 11 Pro,10,11799,Shenzhen", ),
        ("MacBook Pro,10,22199,Shanghai", ), ("AirPods Pro,40,1999,Shanghai", )
    ]
    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
Example #25
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic, \
    CheckpointingMode, FsStateBackend, ExternalizedCheckpointCleanup, \
    CheckpointConfig
from pyflink.table import StreamTableEnvironment, EnvironmentSettings
from pyflink.table.types import DataTypes
from imos_ddl import source_kafka_tbl_face_image_record, sink_pgsql_tbl_face_image_record, source_test, sink_test

from pyflink.table.udf import udf

add = udf(lambda i, j: i + j,
          [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())

# create the Table Environment and choose which planner to use
env = StreamExecutionEnvironment.get_execution_environment()
config = env.get_checkpoint_config()

env.enable_checkpointing(5000, CheckpointingMode.EXACTLY_ONCE)
# env.get_checkpoint_config(config)
env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)

#checkPointPath = FileSystem.path("file:///home/flink/cdn_daemon_checkpoints")
stateBackend = FsStateBackend("file:///var/flink/face_image/")
env.set_state_backend(stateBackend)

config.enable_externalized_checkpoints(
    ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

t_env = StreamTableEnvironment.create(env)
Example #26
    def test_data_type_eq(self):
        lt = DataTypes.BIGINT()
        lt2 = pickle.loads(pickle.dumps(DataTypes.BIGINT()))
        self.assertEqual(lt, lt2)
Example #27
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig, WriteMode
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.types import DataTypes

t_config = TableConfig()
env = ExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = BatchTableEnvironment.create(env, t_config)

source_file = '/notebooks/big-text.txt'
sink_file = '/notebooks/sink.csv'

t_env.connect(FileSystem().path(source_file)).with_format(
    OldCsv().line_delimiter('\n').field(
        'word', DataTypes.STRING())).with_schema(Schema().field(
            'word', DataTypes.STRING())).register_table_source('mySource')

t_env.connect(FileSystem().path(sink_file)).with_format(
    OldCsv().field_delimiter(',').field('word', DataTypes.STRING()).field(
        'count', DataTypes.BIGINT())).with_schema(Schema().field(
            'word', DataTypes.STRING()).field(
                'count', DataTypes.BIGINT())).register_table_sink('mySink')

t_env.scan('mySource').group_by('word').select('word, count(1)').insert_into(
    'mySink')

t_env.execute('wordcount')
Example #28
    def test_verify_type_not_nullable(self):
        import array
        import datetime
        import decimal

        schema = DataTypes.ROW([
            DataTypes.FIELD('s', DataTypes.STRING(nullable=False)),
            DataTypes.FIELD('i', DataTypes.INT(True))
        ])

        class MyObj:
            def __init__(self, **kwargs):
                for k, v in kwargs.items():
                    setattr(self, k, v)

        # obj, data_type
        success_spec = [
            # String
            ("", DataTypes.STRING()),
            (u"", DataTypes.STRING()),

            # UDT
            (ExamplePoint(1.0, 2.0), ExamplePointUDT()),

            # Boolean
            (True, DataTypes.BOOLEAN()),

            # TinyInt
            (-(2**7), DataTypes.TINYINT()),
            (2**7 - 1, DataTypes.TINYINT()),

            # SmallInt
            (-(2**15), DataTypes.SMALLINT()),
            (2**15 - 1, DataTypes.SMALLINT()),

            # Int
            (-(2**31), DataTypes.INT()),
            (2**31 - 1, DataTypes.INT()),

            # BigInt
            (2**64, DataTypes.BIGINT()),

            # Float & Double
            (1.0, DataTypes.FLOAT()),
            (1.0, DataTypes.DOUBLE()),

            # Decimal
            (decimal.Decimal("1.0"), DataTypes.DECIMAL(10, 0)),

            # Binary
            (bytearray([1]), DataTypes.BINARY(1)),

            # Date/Time/Timestamp
            (datetime.date(2000, 1, 2), DataTypes.DATE()),
            (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.DATE()),
            (datetime.time(1, 1, 2), DataTypes.TIME()),
            (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.TIMESTAMP()),

            # Array
            ([], DataTypes.ARRAY(DataTypes.INT())),
            (["1", None], DataTypes.ARRAY(DataTypes.STRING(nullable=True))),
            ([1, 2], DataTypes.ARRAY(DataTypes.INT())),
            ((1, 2), DataTypes.ARRAY(DataTypes.INT())),
            (array.array('h', [1, 2]), DataTypes.ARRAY(DataTypes.INT())),

            # Map
            ({}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
            ({
                "a": 1
            }, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
            ({
                "a": None
            },
             DataTypes.MAP(DataTypes.STRING(nullable=False),
                           DataTypes.INT(True))),

            # Struct
            ({
                "s": "a",
                "i": 1
            }, schema),
            ({
                "s": "a",
                "i": None
            }, schema),
            ({
                "s": "a"
            }, schema),
            ({
                "s": "a",
                "f": 1.0
            }, schema),
            (Row(s="a", i=1), schema),
            (Row(s="a", i=None), schema),
            (Row(s="a", i=1, f=1.0), schema),
            (["a", 1], schema),
            (["a", None], schema),
            (("a", 1), schema),
            (MyObj(s="a", i=1), schema),
            (MyObj(s="a", i=None), schema),
            (MyObj(s="a"), schema),
        ]

        # obj, data_type, exception class
        failure_spec = [
            # Char/VarChar (match anything but None)
            (None, DataTypes.VARCHAR(1), ValueError),
            (None, DataTypes.CHAR(1), ValueError),

            # VarChar (length exceeds maximum length)
            ("abc", DataTypes.VARCHAR(1), ValueError),
            # Char (length exceeds length)
            ("abc", DataTypes.CHAR(1), ValueError),

            # UDT
            (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),

            # Boolean
            (1, DataTypes.BOOLEAN(), TypeError),
            ("True", DataTypes.BOOLEAN(), TypeError),
            ([1], DataTypes.BOOLEAN(), TypeError),

            # TinyInt
            (-(2**7) - 1, DataTypes.TINYINT(), ValueError),
            (2**7, DataTypes.TINYINT(), ValueError),
            ("1", DataTypes.TINYINT(), TypeError),
            (1.0, DataTypes.TINYINT(), TypeError),

            # SmallInt
            (-(2**15) - 1, DataTypes.SMALLINT(), ValueError),
            (2**15, DataTypes.SMALLINT(), ValueError),

            # Int
            (-(2**31) - 1, DataTypes.INT(), ValueError),
            (2**31, DataTypes.INT(), ValueError),

            # Float & Double
            (1, DataTypes.FLOAT(), TypeError),
            (1, DataTypes.DOUBLE(), TypeError),

            # Decimal
            (1.0, DataTypes.DECIMAL(10, 0), TypeError),
            (1, DataTypes.DECIMAL(10, 0), TypeError),
            ("1.0", DataTypes.DECIMAL(10, 0), TypeError),

            # Binary
            (1, DataTypes.BINARY(1), TypeError),
            # VarBinary (length exceeds maximum length)
            (bytearray([1, 2]), DataTypes.VARBINARY(1), ValueError),
            # Char (length exceeds length)
            (bytearray([1, 2]), DataTypes.BINARY(1), ValueError),

            # Date/Time/Timestamp
            ("2000-01-02", DataTypes.DATE(), TypeError),
            ("10:01:02", DataTypes.TIME(), TypeError),
            (946811040, DataTypes.TIMESTAMP(), TypeError),

            # Array
            (["1", None], DataTypes.ARRAY(DataTypes.VARCHAR(1,
                                                            nullable=False)),
             ValueError),
            ([1, "2"], DataTypes.ARRAY(DataTypes.INT()), TypeError),

            # Map
            ({
                "a": 1
            }, DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), TypeError),
            ({
                "a": "1"
            }, DataTypes.MAP(DataTypes.VARCHAR(1),
                             DataTypes.INT()), TypeError),
            ({
                "a": None
            }, DataTypes.MAP(DataTypes.VARCHAR(1),
                             DataTypes.INT(False)), ValueError),

            # Struct
            ({
                "s": "a",
                "i": "1"
            }, schema, TypeError),
            (Row(s="a"), schema, ValueError),  # Row can't have missing field
            (Row(s="a", i="1"), schema, TypeError),
            (["a"], schema, ValueError),
            (["a", "1"], schema, TypeError),
            (MyObj(s="a", i="1"), schema, TypeError),
            (MyObj(s=None, i="1"), schema, ValueError),
        ]

        # Check success cases
        for obj, data_type in success_spec:
            try:
                _create_type_verifier(data_type.not_null())(obj)
            except (TypeError, ValueError):
                self.fail("verify_type(%s, %s, nullable=False)" %
                          (obj, data_type))

        # Check failure cases
        for obj, data_type, exp in failure_spec:
            msg = "verify_type(%s, %s, nullable=False) == %s" % (
                obj, data_type, exp)
            with self.assertRaises(exp, msg=msg):
                _create_type_verifier(data_type.not_null())(obj)
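A minimal usage sketch of the verifier factory exercised above:

verifier = _create_type_verifier(DataTypes.INT().not_null())
verifier(42)    # passes silently
verifier(None)  # raises ValueError: the type is NOT NULL
verifier("42")  # raises TypeError: a str is not an INT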
Example #29
    def test_get_schema(self):
        t = self.t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c'])
        result = t.group_by("c").select("a.sum as a, c as b")
        schema = result.get_schema()

        assert schema == TableSchema(["a", "b"], [DataTypes.BIGINT(), DataTypes.STRING()])