def test_field_delimiter(self):
    csv = Csv().field_delimiter("|")
    properties = csv.to_properties()
    expected = {'format.field-delimiter': '|',
                'format.type': 'csv',
                'format.property-version': '1'}
    self.assertEqual(expected, properties)
def test_derive_schema(self):
    csv = Csv().derive_schema()
    expected = {'format.derive-schema': 'true',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_null_literal(self):
    csv = Csv().null_literal("null")
    expected = {'format.null-literal': 'null',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_escape_character(self):
    csv = Csv().escape_character("\\")
    expected = {'format.escape-character': '\\',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_array_element_delimiter(self):
    csv = Csv().array_element_delimiter("/")
    expected = {'format.array-element-delimiter': '/',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_ignore_parse_errors(self):
    csv = Csv().ignore_parse_errors()
    expected = {'format.ignore-parse-errors': 'true',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_allow_comments(self):
    csv = Csv().allow_comments()
    expected = {'format.allow-comments': 'true',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_quote_character(self):
    csv = Csv().quote_character("'")
    expected = {'format.quote-character': "'",
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_line_delimiter(self):
    csv = Csv().line_delimiter(";")
    expected = {'format.line-delimiter': ';',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
def test_schema(self):
    csv = Csv().schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT()),
                                      DataTypes.FIELD("b", DataTypes.STRING())]))
    expected = {'format.schema': 'ROW<a INT, b VARCHAR>',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)
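# A hedged sketch, not part of the original suite: the Csv descriptor's setters
# return the descriptor itself, so several options can be chained in one
# expression; the resulting property map is the union of the keys asserted in
# the individual tests above.
def test_chained_options(self):
    csv = Csv() \
        .field_delimiter("|") \
        .line_delimiter(";") \
        .ignore_parse_errors() \
        .derive_schema()
    expected = {'format.field-delimiter': '|',
                'format.line-delimiter': ';',
                'format.ignore-parse-errors': 'true',
                'format.derive-schema': 'true',
                'format.property-version': '1',
                'format.type': 'csv'}
    properties = csv.to_properties()
    self.assertEqual(expected, properties)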
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import DataTypes, TableConfig, StreamTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json, Csv

exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

# Source: Kafka 0.11 topic "test", CSV-formatted records with a single INT field.
t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_source('mySource')

# Sink: CSV file on the local filesystem with the same single INT column.
t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.INT())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_sink('mySink')

# Keep only the rows where tbd equals 1.
t_env.scan('mySource') \
    .select('tbd') \
    .where("tbd = 1")
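# Hedged completion (assumed, not part of the original snippet): the filtered
# rows would typically be emitted into the sink registered above and the job
# launched; the job name "kafka_filter" is illustrative only.
t_env.scan('mySource').select('tbd').where("tbd = 1").insert_into('mySink')
t_env.execute("kafka_filter")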
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import (
    DataTypes,
    TableConfig,
    BatchTableEnvironment,
    StreamTableEnvironment,
)
from pyflink.table.descriptors import Schema, Csv, OldCsv, FileSystem
from pathlib import Path

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

root = Path("vaccinations.csv").parent.resolve()
out_path = root / "output_vaccines.csv"
try:
    # Remove a previous run's output so the sink can write a fresh file.
    out_path.unlink()
except FileNotFoundError:
    pass

from pyflink.table.window import Tumble

# Source: the vaccinations CSV with a date column and a word column.
(t_env.connect(FileSystem().path(str(root / "vaccinations.csv")))
    .with_format(Csv())
    .with_schema(Schema()
                 .field("date", DataTypes.DATE(True))
                 .field("word", DataTypes.STRING()))
    .create_temporary_table("mySource"))

# Sink: CSV file holding each word and how often it occurs.
(t_env.connect(FileSystem().path(str(out_path)))
    .with_format(Csv())
    .with_schema(Schema()
                 .field("word", DataTypes.STRING())
                 .field("count", DataTypes.BIGINT()))
    .create_temporary_table("mySink"))

# Count occurrences per word and keep only words that appear more than once.
(t_env.from_path("mySource")
    .group_by("word")
    .select("word, count(1) as count")
    .filter("count > 1")
    .insert_into("mySink"))

t_env.execute("word_count_vaccine")
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import (
    DataTypes,
    TableConfig,
    BatchTableEnvironment,
)
from pyflink.table.descriptors import Schema, Csv, OldCsv, FileSystem
from pathlib import Path

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

root = Path("netflix_times.csv").parent.resolve()
out_path = root / "output_flix.csv"
try:
    # Remove a previous run's output so the sink can write a fresh file.
    out_path.unlink()
except FileNotFoundError:
    pass

from pyflink.table.window import Tumble

# Source: the Netflix CSV with the year a title was added and its release year.
(t_env.connect(FileSystem().path(str(root / "netflix_times.csv")))
    .with_format(Csv())
    .with_schema(Schema()
                 .field("year_added", DataTypes.INT())
                 .field("release_year", DataTypes.INT()))
    .create_temporary_table("mySource"))

# Sink: its columns must match the columns produced by the query below.
(t_env.connect(FileSystem().path(str(out_path)))
    .with_format(Csv())
    .with_schema(Schema()
                 .field("year_added", DataTypes.INT())
                 .field("release_year", DataTypes.INT()))
    .create_temporary_table("mySink"))

# Pass both year columns through to the sink.
(t_env.from_path("mySource")
    .select("year_added, release_year")
    .insert_into("mySink"))

t_env.execute("time_gap")
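# Hedged sketch (not in the original): with the same source table, the string
# expressions of the Table API also support grouping and aggregation, e.g.
# counting how many titles were released in each year, mirroring the word-count
# job above. The sink name "countSink", its output file, and the schema layout
# (release_year, count) are assumptions for illustration.
(t_env.connect(FileSystem().path(str(root / "output_flix_counts.csv")))
    .with_format(Csv())
    .with_schema(Schema()
                 .field("release_year", DataTypes.INT())
                 .field("count", DataTypes.BIGINT()))
    .create_temporary_table("countSink"))

(t_env.from_path("mySource")
    .group_by("release_year")
    .select("release_year, count(1) as count")
    .insert_into("countSink"))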