def test_aggregate(self):
    import pandas as pd
    t = self.t_env.from_elements(
        [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.BIGINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT())]))
    function = CountAndSumAggregateFunction()
    agg = udaf(function,
               result_type=function.get_result_type(),
               accumulator_type=function.get_accumulator_type(),
               name=str(function.__class__.__name__))
    result = t.group_by(t.a) \
        .aggregate(agg(t.b).alias("c", "d")) \
        .select("a, c, d") \
        .to_pandas()
    assert_frame_equal(result, pd.DataFrame([[1, 3, 15], [2, 2, 4]],
                                            columns=['a', 'c', 'd']))
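
# CountAndSumAggregateFunction is referenced in the test above but not defined
# in this snippet. A minimal sketch consistent with the expected output (per
# group: count of rows and sum of column b); the upstream definition may
# differ in details such as the retract logic.
from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes


class CountAndSumAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # accumulator[0] holds the count, accumulator[1] the running sum
        return Row(0, 0)

    def accumulate(self, accumulator, value):
        accumulator[0] += 1
        accumulator[1] += value

    def retract(self, accumulator, value):
        accumulator[0] -= 1
        accumulator[1] -= value

    def get_value(self, accumulator):
        return Row(accumulator[0], accumulator[1])

    def get_result_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.BIGINT())])

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.BIGINT())])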
def test_from_origin_field(self):
    schema = Schema()
    schema = schema\
        .field("int_field", DataTypes.INT())\
        .field("long_field", DataTypes.BIGINT()).from_origin_field("origin_field_a")\
        .field("string_field", DataTypes.STRING())
    properties = schema.to_properties()
    expected = {'schema.0.name': 'int_field',
                'schema.0.type': 'INT',
                'schema.1.name': 'long_field',
                'schema.1.type': 'BIGINT',
                'schema.1.from': 'origin_field_a',
                'schema.2.name': 'string_field',
                'schema.2.type': 'VARCHAR'}
    assert properties == expected
def test_basic_type(self):
    test_types = [DataTypes.STRING(),
                  DataTypes.BOOLEAN(),
                  DataTypes.BYTES(),
                  DataTypes.TINYINT(),
                  DataTypes.SMALLINT(),
                  DataTypes.INT(),
                  DataTypes.BIGINT(),
                  DataTypes.FLOAT(),
                  DataTypes.DOUBLE(),
                  DataTypes.DATE(),
                  DataTypes.TIME(),
                  DataTypes.TIMESTAMP(3)]
    java_types = [_to_java_type(item) for item in test_types]
    converted_python_types = [_from_java_type(item) for item in java_types]
    self.assertEqual(test_types, converted_python_types)
def test_proctime(self):
    schema = Schema()
    schema = schema\
        .field("int_field", DataTypes.INT())\
        .field("ptime", DataTypes.BIGINT()).proctime()\
        .field("string_field", DataTypes.STRING())
    properties = schema.to_properties()
    expected = {'schema.0.name': 'int_field',
                'schema.0.type': 'INT',
                'schema.1.name': 'ptime',
                'schema.1.type': 'BIGINT',
                'schema.1.proctime': 'true',
                'schema.2.name': 'string_field',
                'schema.2.type': 'VARCHAR'}
    assert properties == expected
def test_sql_update(self):
    t_env = self.t_env
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")],
                                 ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sinks",
        field_names, field_types,
        source_sink_utils.TestAppendSink())

    t_env.sql_update("insert into sinks select * from %s" % source)
    t_env.execute("test_sql_job")

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hello,Hello']
    self.assert_equals(actual, expected)
def test_register_table_sink(self):
    t_env = self.t_env
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "Sinks",
        source_sink_utils.TestAppendSink(field_names, field_types))

    t_env.from_elements([(1, "Hi", "Hello")], ["a", "b", "c"]).insert_into("Sinks")
    self.env.execute()

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello']
    self.assert_equals(actual, expected)
def test_field(self):
    csv = OldCsv()
    csv.field("a", DataTypes.BIGINT())
    csv.field("b", DataTypes.STRING())
    csv.field("c", "SQL_TIMESTAMP")
    properties = csv.to_properties()
    expected = {'format.fields.0.name': 'a',
                'format.fields.0.type': 'BIGINT',
                'format.fields.1.name': 'b',
                'format.fields.1.type': 'VARCHAR',
                'format.fields.2.name': 'c',
                'format.fields.2.type': 'SQL_TIMESTAMP',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def test_field(self):
    schema = Schema()
    schema = schema\
        .field("int_field", DataTypes.INT())\
        .field("long_field", DataTypes.BIGINT())\
        .field("string_field", DataTypes.STRING())\
        .field("timestamp_field", DataTypes.TIMESTAMP())\
        .field("time_field", DataTypes.TIME())\
        .field("date_field", DataTypes.DATE())\
        .field("double_field", DataTypes.DOUBLE())\
        .field("float_field", DataTypes.FLOAT())\
        .field("byte_field", DataTypes.TINYINT())\
        .field("short_field", DataTypes.SMALLINT())\
        .field("boolean_field", DataTypes.BOOLEAN())
    properties = schema.to_properties()
    expected = {'schema.0.name': 'int_field',
                'schema.0.type': 'INT',
                'schema.1.name': 'long_field',
                'schema.1.type': 'BIGINT',
                'schema.2.name': 'string_field',
                'schema.2.type': 'VARCHAR',
                'schema.3.name': 'timestamp_field',
                'schema.3.type': 'TIMESTAMP',
                'schema.4.name': 'time_field',
                'schema.4.type': 'TIME',
                'schema.5.name': 'date_field',
                'schema.5.type': 'DATE',
                'schema.6.name': 'double_field',
                'schema.6.type': 'DOUBLE',
                'schema.7.name': 'float_field',
                'schema.7.type': 'FLOAT',
                'schema.8.name': 'byte_field',
                'schema.8.type': 'TINYINT',
                'schema.9.name': 'short_field',
                'schema.9.type': 'SMALLINT',
                'schema.10.name': 'boolean_field',
                'schema.10.type': 'BOOLEAN'}
    assert properties == expected
def test_sql_query(self):
    t_env = self.t_env
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")],
                                 ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sinks",
        field_names, field_types,
        source_sink_utils.TestAppendSink())

    result = t_env.sql_query("select a + 1, b, c from %s" % source)
    result.insert_into("sinks")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['2,Hi,Hello', '3,Hello,Hello']
    self.assert_equals(actual, expected)
def test_map_with_pandas_udf(self):
    t = self.t_env.from_elements(
        [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b",
                             DataTypes.ROW([DataTypes.FIELD("c", DataTypes.INT()),
                                            DataTypes.FIELD("d", DataTypes.INT())]))]))

    sink_table_ddl = """
        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    def func(x):
        import pandas as pd
        res = pd.concat([x.a, x.c + x.d], axis=1)
        return res

    def func2(x):
        return x * 2

    def func3(x):
        assert isinstance(x, Row)
        return x

    pandas_udf = udf(func,
                     result_type=DataTypes.ROW(
                         [DataTypes.FIELD("c", DataTypes.BIGINT()),
                          DataTypes.FIELD("d", DataTypes.BIGINT())]),
                     func_type='pandas')

    pandas_udf_2 = udf(func2,
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("c", DataTypes.BIGINT()),
                            DataTypes.FIELD("d", DataTypes.BIGINT())]),
                       func_type='pandas')

    general_udf = udf(func3,
                      result_type=DataTypes.ROW(
                          [DataTypes.FIELD("c", DataTypes.BIGINT()),
                           DataTypes.FIELD("d", DataTypes.BIGINT())]))

    t.map(pandas_udf).map(pandas_udf_2).map(general_udf).execute_insert("Results").wait()

    actual = source_sink_utils.results()
    self.assert_equals(
        actual,
        ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
def test_rename_columns(self):
    t_env = self.t_env
    t = t_env.from_elements([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')],
                            ['a', 'b', 'c'])
    field_names = ["d", "e", "f"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "Results",
        field_names, field_types,
        source_sink_utils.TestAppendSink())

    result = t.select("a, b, c") \
        .rename_columns("a as d, c as f, b as e") \
        .select("d, e, f")
    result.insert_into("Results")
    t_env.execute()

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hello,Hello']
    self.assert_equals(actual, expected)
def test_field(self):
    schema = Schema()\
        .field("int_field", DataTypes.INT())\
        .field("long_field", DataTypes.BIGINT())\
        .field("string_field", DataTypes.STRING())\
        .field("timestamp_field", DataTypes.TIMESTAMP(3))\
        .field("time_field", DataTypes.TIME())\
        .field("date_field", DataTypes.DATE())\
        .field("double_field", DataTypes.DOUBLE())\
        .field("float_field", DataTypes.FLOAT())\
        .field("byte_field", DataTypes.TINYINT())\
        .field("short_field", DataTypes.SMALLINT())\
        .field("boolean_field", DataTypes.BOOLEAN())
    properties = schema.to_properties()
    expected = {'schema.0.name': 'int_field',
                'schema.0.data-type': 'INT',
                'schema.1.name': 'long_field',
                'schema.1.data-type': 'BIGINT',
                'schema.2.name': 'string_field',
                'schema.2.data-type': 'VARCHAR(2147483647)',
                'schema.3.name': 'timestamp_field',
                'schema.3.data-type': 'TIMESTAMP(3)',
                'schema.4.name': 'time_field',
                'schema.4.data-type': 'TIME(0)',
                'schema.5.name': 'date_field',
                'schema.5.data-type': 'DATE',
                'schema.6.name': 'double_field',
                'schema.6.data-type': 'DOUBLE',
                'schema.7.name': 'float_field',
                'schema.7.data-type': 'FLOAT',
                'schema.8.name': 'byte_field',
                'schema.8.data-type': 'TINYINT',
                'schema.9.name': 'short_field',
                'schema.9.data-type': 'SMALLINT',
                'schema.10.name': 'boolean_field',
                'schema.10.data-type': 'BOOLEAN'}
    self.assertEqual(expected, properties)
def test_map_with_pandas_udf(self):
    t = self.t_env.from_elements(
        [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD(
                "b",
                DataTypes.ROW([
                    DataTypes.FIELD("c", DataTypes.INT()),
                    DataTypes.FIELD("d", DataTypes.INT())
                ]))
        ]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)

    def func(x):
        import pandas as pd
        res = pd.concat([x.a, x.c + x.d], axis=1)
        return res

    def func2(x):
        return x * 2

    pandas_udf = udf(func,
                     result_type=DataTypes.ROW([
                         DataTypes.FIELD("c", DataTypes.BIGINT()),
                         DataTypes.FIELD("d", DataTypes.BIGINT())
                     ]),
                     func_type='pandas')

    pandas_udf_2 = udf(func2,
                       result_type=DataTypes.ROW([
                           DataTypes.FIELD("c", DataTypes.BIGINT()),
                           DataTypes.FIELD("d", DataTypes.BIGINT())
                       ]),
                       func_type='pandas')

    t.map(pandas_udf).map(pandas_udf_2).execute_insert("Results").wait()

    actual = source_sink_utils.results()
    self.assert_equals(
        actual,
        ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
def test_flat_map(self):
    t = self.t_env.from_elements(
        [(1, "2,3"), (2, "1"), (1, "5,6,7")],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.STRING())]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.STRING()])
    self.t_env.register_table_sink("Results", table_sink)

    @udtf(result_types=[DataTypes.INT(), DataTypes.STRING()])
    def split(x):
        for s in x[1].split(","):
            yield x[0], s

    t.flat_map(split) \
        .flat_map(split) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual, ["1,2", "1,3", "2,1", "1,5", "1,6", "1,7"])
def test_sql_update_with_query_config(self):
    t_env = self.t_env
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")],
                                 ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sinks",
        field_names, field_types,
        source_sink_utils.TestAppendSink())

    query_config = t_env.query_config()
    query_config.with_idle_state_retention_time(
        datetime.timedelta(days=1), datetime.timedelta(days=2))

    t_env.sql_update("insert into sinks select * from %s" % source, query_config)
    t_env.execute("test_sql_job")

    actual = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hello,Hello']
    self.assert_equals(actual, expected)
def test_parquet_columnar_basic(self):
    parquet_file_name = tempfile.mktemp(suffix='.parquet', dir=self.tempdir)
    schema, records = _create_basic_avro_schema_and_records()
    FileSourceParquetAvroFormatTests._create_parquet_avro_file(
        parquet_file_name, schema, records)
    row_type = DataTypes.ROW([
        DataTypes.FIELD('null', DataTypes.STRING()),  # DataTypes.NULL cannot be serialized
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('long', DataTypes.BIGINT()),
        DataTypes.FIELD('float', DataTypes.FLOAT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('string', DataTypes.STRING()),
        DataTypes.FIELD('unknown', DataTypes.STRING())
    ])
    self._build_parquet_columnar_job(row_type, parquet_file_name)
    self.env.execute('test_parquet_columnar_basic')
    results = self.test_sink.get_results(True, False)
    _check_basic_avro_schema_results(self, results)
    self.assertIsNone(results[0]['unknown'])
    self.assertIsNone(results[1]['unknown'])
def test_merge_type(self):
    self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.NULL()),
                     DataTypes.BIGINT())
    self.assertEqual(_merge_type(DataTypes.NULL(), DataTypes.BIGINT()),
                     DataTypes.BIGINT())
    self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.BIGINT()),
                     DataTypes.BIGINT())

    self.assertEqual(_merge_type(
        DataTypes.ARRAY(DataTypes.BIGINT()),
        DataTypes.ARRAY(DataTypes.BIGINT())),
        DataTypes.ARRAY(DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.ARRAY(DataTypes.BIGINT()),
                    DataTypes.ARRAY(DataTypes.DOUBLE()))

    self.assertEqual(_merge_type(
        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.DOUBLE(), DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()))

    self.assertEqual(_merge_type(
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                       DataTypes.FIELD('f2', DataTypes.STRING())]),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                       DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                       DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.DOUBLE()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(_merge_type(
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))])),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.STRING())]))]))

    self.assertEqual(_merge_type(
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())]),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.DOUBLE())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(_merge_type(
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                           DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())]),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                           DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                           DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                               DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                               DataTypes.DOUBLE())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(_merge_type(
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))]),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))])),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.DOUBLE(), DataTypes.BIGINT())))]))
def test_repr(self):
    schema = TableSchema(["a", "b", "c"],
                         [DataTypes.INT(), DataTypes.BIGINT(), DataTypes.STRING()])
    expected = "root\n |-- a: INT\n |-- b: BIGINT\n |-- c: STRING\n"
    self.assertEqual(expected, repr(schema))
def get_accumulator_type(self): return DataTypes.ROW([ DataTypes.FIELD("f0", DataTypes.LIST_VIEW(DataTypes.STRING())), DataTypes.FIELD("f1", DataTypes.BIGINT()) ])
def get_result_type(self):
    return DataTypes.BIGINT()
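
# The two fragments above look like methods of a single AggregateFunction
# whose accumulator pairs a ListView of strings with a BIGINT counter. A
# minimal sketch of such a class, using a hypothetical name
# (CountWithListViewAggregateFunction); the upstream class these fragments
# were taken from may differ.
from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes, ListView


class CountWithListViewAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # f0: ListView buffering the observed strings, f1: running count.
        # ListView keeps large intermediate state out of the accumulator row.
        return Row(ListView(), 0)

    def accumulate(self, accumulator, value):
        if value is not None:
            accumulator[0].add(value)
            accumulator[1] += 1

    def get_value(self, accumulator):
        # the final result is just the count; the ListView is internal state
        return accumulator[1]

    def get_result_type(self):
        return DataTypes.BIGINT()

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("f0", DataTypes.LIST_VIEW(DataTypes.STRING())),
            DataTypes.FIELD("f1", DataTypes.BIGINT())
        ])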
def get_accumulator_type(self):
    return DataTypes.ARRAY(DataTypes.BIGINT())
def get_result_type(self):
    return DataTypes.ROW([
        DataTypes.FIELD("a", DataTypes.BIGINT()),
        DataTypes.FIELD("b", DataTypes.BIGINT())
    ])
def get_accumulator_type(self): return DataTypes.ROW([ DataTypes.FIELD("a", DataTypes.BIGINT()), DataTypes.FIELD("b", DataTypes.BIGINT()) ])
def word_count():
    environment_settings = EnvironmentSettings.new_instance() \
        .in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=environment_settings)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # explicitly set the Python executable, in case `python` does not point to Python 3
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()), DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)

    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    data = [("iPhone 11,30,5499,Beijing", ),
            ("iPhone 11 Pro,20,8699,Guangzhou", ),
            ("MacBook Pro,10,9999,Beijing", ),
            ("AirPods Pro,50,1999,Beijing", ),
            ("MacBook Pro,10,11499,Shanghai", ),
            ("iPhone 11,30,5999,Shanghai", ),
            ("iPhone 11 Pro,20,9999,Shenzhen", ),
            ("MacBook Pro,10,13899,Hangzhou", ),
            ("iPhone 11,10,6799,Beijing", ),
            ("MacBook Pro,10,18999,Beijing", ),
            ("iPhone 11 Pro,10,11799,Shenzhen", ),
            ("MacBook Pro,10,22199,Shanghai", ),
            ("AirPods Pro,40,1999,Shanghai", )]

    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic, \
    CheckpointingMode, FsStateBackend, ExternalizedCheckpointCleanup, CheckpointConfig
from pyflink.table import StreamTableEnvironment, EnvironmentSettings
from pyflink.table.types import DataTypes
from imos_ddl import source_kafka_tbl_face_image_record, sink_pgsql_tbl_face_image_record, \
    source_test, sink_test
from pyflink.table.udf import udf

add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT())

# Create the table environment and choose the planner to use
env = StreamExecutionEnvironment.get_execution_environment()
config = env.get_checkpoint_config()
env.enable_checkpointing(5000, CheckpointingMode.EXACTLY_ONCE)
# env.get_checkpoint_config(config)
env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
# checkPointPath = FileSystem.path("file:///home/flink/cdn_daemon_checkpoints")
stateBackend = FsStateBackend("file:///var/flink/face_image/")
env.set_state_backend(stateBackend)
config.enable_externalized_checkpoints(
    ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
# the original snippet breaks off mid-call here; completed minimally with the
# stream environment created above
t_env = StreamTableEnvironment.create(env)
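
# A hedged sketch of how the truncated job might continue, assuming the
# imos_ddl imports are CREATE TABLE DDL strings and that the job copies the
# Kafka source into the PostgreSQL sink. The table and column names below are
# hypothetical; they do not appear in the original.
t_env.register_function("add", add)
t_env.sql_update(source_kafka_tbl_face_image_record)  # assumed DDL string
t_env.sql_update(sink_pgsql_tbl_face_image_record)    # assumed DDL string
t_env.sql_update(
    "INSERT INTO tbl_face_image_record_sink "         # hypothetical sink name
    "SELECT add(field_a, field_b) AS ab "             # hypothetical columns, uses the add UDF
    "FROM tbl_face_image_record_source")              # hypothetical source name
t_env.execute("face_image_record_job")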
def test_data_type_eq(self):
    import pickle
    lt = DataTypes.BIGINT()
    lt2 = pickle.loads(pickle.dumps(DataTypes.BIGINT()))
    self.assertEqual(lt, lt2)
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, TableConfig, WriteMode
from pyflink.table.descriptors import FileSystem, OldCsv, Schema
from pyflink.table.types import DataTypes

t_config = TableConfig()
env = ExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = BatchTableEnvironment.create(env, t_config)

source_file = '/notebooks/big-text.txt'
sink_file = '/notebooks/sink.csv'

t_env.connect(FileSystem().path(source_file)) \
    .with_format(OldCsv()
                 .line_delimiter('\n')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path(sink_file)) \
    .with_format(OldCsv()
                 .field_delimiter(',')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .register_table_sink('mySink')

t_env.scan('mySource') \
    .group_by('word') \
    .select('word, count(1)') \
    .insert_into('mySink')

t_env.execute('wordcount')
def test_verify_type_not_nullable(self):
    import array
    import datetime
    import decimal

    schema = DataTypes.ROW([
        DataTypes.FIELD('s', DataTypes.STRING(nullable=False)),
        DataTypes.FIELD('i', DataTypes.INT(True))])

    class MyObj:
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    # obj, data_type
    success_spec = [
        # String
        ("", DataTypes.STRING()),
        (u"", DataTypes.STRING()),
        # UDT
        (ExamplePoint(1.0, 2.0), ExamplePointUDT()),
        # Boolean
        (True, DataTypes.BOOLEAN()),
        # TinyInt
        (-(2**7), DataTypes.TINYINT()),
        (2**7 - 1, DataTypes.TINYINT()),
        # SmallInt
        (-(2**15), DataTypes.SMALLINT()),
        (2**15 - 1, DataTypes.SMALLINT()),
        # Int
        (-(2**31), DataTypes.INT()),
        (2**31 - 1, DataTypes.INT()),
        # BigInt
        (2**64, DataTypes.BIGINT()),
        # Float & Double
        (1.0, DataTypes.FLOAT()),
        (1.0, DataTypes.DOUBLE()),
        # Decimal
        (decimal.Decimal("1.0"), DataTypes.DECIMAL(10, 0)),
        # Binary
        (bytearray([1]), DataTypes.BINARY(1)),
        # Date/Time/Timestamp
        (datetime.date(2000, 1, 2), DataTypes.DATE()),
        (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.DATE()),
        (datetime.time(1, 1, 2), DataTypes.TIME()),
        (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.TIMESTAMP()),
        # Array
        ([], DataTypes.ARRAY(DataTypes.INT())),
        (["1", None], DataTypes.ARRAY(DataTypes.STRING(nullable=True))),
        ([1, 2], DataTypes.ARRAY(DataTypes.INT())),
        ((1, 2), DataTypes.ARRAY(DataTypes.INT())),
        (array.array('h', [1, 2]), DataTypes.ARRAY(DataTypes.INT())),
        # Map
        ({}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
        ({"a": 1}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
        ({"a": None},
         DataTypes.MAP(DataTypes.STRING(nullable=False), DataTypes.INT(True))),
        # Struct
        ({"s": "a", "i": 1}, schema),
        ({"s": "a", "i": None}, schema),
        ({"s": "a"}, schema),
        ({"s": "a", "f": 1.0}, schema),
        (Row(s="a", i=1), schema),
        (Row(s="a", i=None), schema),
        (Row(s="a", i=1, f=1.0), schema),
        (["a", 1], schema),
        (["a", None], schema),
        (("a", 1), schema),
        (MyObj(s="a", i=1), schema),
        (MyObj(s="a", i=None), schema),
        (MyObj(s="a"), schema),
    ]

    # obj, data_type, exception class
    failure_spec = [
        # Char/VarChar (match anything but None)
        (None, DataTypes.VARCHAR(1), ValueError),
        (None, DataTypes.CHAR(1), ValueError),
        # VarChar (length exceeds maximum length)
        ("abc", DataTypes.VARCHAR(1), ValueError),
        # Char (length exceeds length)
        ("abc", DataTypes.CHAR(1), ValueError),
        # UDT
        (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),
        # Boolean
        (1, DataTypes.BOOLEAN(), TypeError),
        ("True", DataTypes.BOOLEAN(), TypeError),
        ([1], DataTypes.BOOLEAN(), TypeError),
        # TinyInt
        (-(2**7) - 1, DataTypes.TINYINT(), ValueError),
        (2**7, DataTypes.TINYINT(), ValueError),
        ("1", DataTypes.TINYINT(), TypeError),
        (1.0, DataTypes.TINYINT(), TypeError),
        # SmallInt
        (-(2**15) - 1, DataTypes.SMALLINT(), ValueError),
        (2**15, DataTypes.SMALLINT(), ValueError),
        # Int
        (-(2**31) - 1, DataTypes.INT(), ValueError),
        (2**31, DataTypes.INT(), ValueError),
        # Float & Double
        (1, DataTypes.FLOAT(), TypeError),
        (1, DataTypes.DOUBLE(), TypeError),
        # Decimal
        (1.0, DataTypes.DECIMAL(10, 0), TypeError),
        (1, DataTypes.DECIMAL(10, 0), TypeError),
        ("1.0", DataTypes.DECIMAL(10, 0), TypeError),
        # Binary
        (1, DataTypes.BINARY(1), TypeError),
        # VarBinary (length exceeds maximum length)
        (bytearray([1, 2]), DataTypes.VARBINARY(1), ValueError),
        # Binary (length exceeds length)
        (bytearray([1, 2]), DataTypes.BINARY(1), ValueError),
        # Date/Time/Timestamp
        ("2000-01-02", DataTypes.DATE(), TypeError),
        ("10:01:02", DataTypes.TIME(), TypeError),
        (946811040, DataTypes.TIMESTAMP(), TypeError),
        # Array
        (["1", None], DataTypes.ARRAY(DataTypes.VARCHAR(1, nullable=False)), ValueError),
        ([1, "2"], DataTypes.ARRAY(DataTypes.INT()), TypeError),
        # Map
        ({"a": 1}, DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), TypeError),
        ({"a": "1"}, DataTypes.MAP(DataTypes.VARCHAR(1), DataTypes.INT()), TypeError),
        ({"a": None}, DataTypes.MAP(DataTypes.VARCHAR(1), DataTypes.INT(False)), ValueError),
        # Struct
        ({"s": "a", "i": "1"}, schema, TypeError),
        (Row(s="a"), schema, ValueError),  # Row can't have missing field
        (Row(s="a", i="1"), schema, TypeError),
        (["a"], schema, ValueError),
        (["a", "1"], schema, TypeError),
        (MyObj(s="a", i="1"), schema, TypeError),
        (MyObj(s=None, i="1"), schema, ValueError),
    ]

    # Check success cases
    for obj, data_type in success_spec:
        try:
            _create_type_verifier(data_type.not_null())(obj)
        except (TypeError, ValueError):
            self.fail("verify_type(%s, %s, nullable=False)" % (obj, data_type))

    # Check failure cases
    for obj, data_type, exp in failure_spec:
        msg = "verify_type(%s, %s, nullable=False) == %s" % (obj, data_type, exp)
        with self.assertRaises(exp, msg=msg):
            _create_type_verifier(data_type.not_null())(obj)
def test_get_schema(self):
    t = self.t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c'])
    result = t.group_by("c").select("a.sum as a, c as b")
    schema = result.get_schema()
    assert schema == TableSchema(["a", "b"],
                                 [DataTypes.BIGINT(), DataTypes.STRING()])