def get_java_class_methods(java_class):
    """
    Collects the names of the methods declared by a Java class.

    The class is resolved with ``Class.forName`` through the Py4J gateway and the result
    of ``getDeclaredMethods()`` is reduced to a set of names, so methods sharing a name
    appear only once.

    :param java_class: The class name handed to ``Class.forName``.
    :return: A set with the name of every declared method.
    """
    declared_methods = get_gateway().jvm.Class.forName(java_class).getDeclaredMethods()
    return {declared_methods[idx].getName() for idx in range(len(declared_methods))}
def over_window(self, *over_windows):
    """
    Defines over-windows on the records of a table.

    An over-window defines for each record an interval of records over which aggregation
    functions can be computed.

    Example:
    ::

        >>> ow = Over.partition_by("c").order_by("rowTime").preceding("10.seconds").alias("ow")
        >>> table.over_window(ow).select("c, b.count over ow, e.sum over ow")

    .. note::

        Computing over window aggregates on a streaming table is only a parallel
        operation if the window is partitioned. Otherwise, the whole stream will be
        processed by a single task, i.e., with parallelism 1.

    .. note::

        Over-windows for batch tables are currently not supported.

    :param over_windows: :class:`OverWindow`s created from :class:`Over`.
    :return: A :class:`OverWindowedTable`.
    """
    jvm = get_gateway().jvm
    # Unwrap the Python objects into their Java counterparts before building the array.
    j_windows = [window._java_over_window for window in over_windows]
    j_window_array = to_jarray(jvm.OverWindow, j_windows)
    return OverWindowedTable(self._j_table.window(j_window_array))
def __init__(self, j_batch_query_config=None):
    """
    Wraps a Java ``BatchQueryConfig``.

    :param j_batch_query_config: An existing Java ``BatchQueryConfig`` to wrap. When it
                                 is ``None`` a new one is created through the gateway.
    """
    self._jvm = get_gateway().jvm
    if j_batch_query_config is None:
        j_batch_query_config = self._jvm.BatchQueryConfig()
    self._j_batch_query_config = j_batch_query_config
    super(BatchQueryConfig, self).__init__(self._j_batch_query_config)
def collect(self, table):
    """
    Converts a table into a DataSet of ``Row`` and returns the collected rows as strings.

    :param table: The table to collect; its ``_j_table`` is handed to ``toDataSet``.
    :return: A list with the ``toString()`` representation of every collected row.
    """
    j_table = table._j_table
    j_row_class = get_gateway().jvm.Class.forName("org.apache.flink.types.Row")
    j_rows = self.t_env._j_tenv.toDataSet(j_table, j_row_class).collect()
    return [j_row.toString() for j_row in j_rows]
def retract_results():
    """
    Retrieves the results from a retract table sink.

    The collected values are fetched (and cleared) with
    ``RowCollector.getAndClearValues()`` and then passed to
    ``RowCollector.retractResults``.

    :return: Whatever ``RowCollector.retractResults`` returns for the fetched values.
    """
    j_row_collector = get_gateway().jvm.RowCollector
    collected_values = j_row_collector.getAndClearValues()
    return j_row_collector.retractResults(collected_values)
def with_idle_state_retention_time(self, min_time, max_time):
    # type: (timedelta, timedelta) -> StreamQueryConfig
    """
    Specifies a minimum and a maximum time interval for how long idle state, i.e., state
    which was not updated, will be retained.

    State is not cleared while it has been idle for less than the minimum time, and it is
    not kept once it has been idle for more than the maximum time.

    When new data arrives for previously cleaned-up state, the new data will be handled
    as if it was the first data. This can result in previous results being overwritten.

    Set to ``datetime.timedelta()`` (zero) to never clean-up the state.

    .. note::

        Cleaning up state requires additional bookkeeping which becomes less expensive
        for larger differences of minTime and maxTime. The difference between minTime and
        maxTime must be at least ``datetime.timedelta(minutes=5)`` (5 minutes).

    :param min_time: The minimum time interval for which idle state is retained. Set to
                     ``datetime.timedelta()`` (zero) to never clean-up the state.
    :param max_time: The maximum time interval for which idle state is retained. Must be
                     at least 5 minutes greater than minTime. Set to
                     ``datetime.timedelta()`` (zero) to never clean-up the state.
    :return: :class:`StreamQueryConfig`
    """
    j_time_class = get_gateway().jvm.org.apache.flink.api.common.time.Time

    def _to_j_time(interval):
        # The timedelta is rounded to whole milliseconds before it is handed to Java.
        return j_time_class.milliseconds(long(round(interval.total_seconds() * 1000)))

    j_min_time = _to_j_time(min_time)
    j_max_time = _to_j_time(max_time)
    j_updated_config = self._j_stream_query_config.withIdleStateRetentionTime(
        j_min_time, j_max_time)
    self._j_stream_query_config = j_updated_config
    return self
def __init__(self, keys, is_append_only):
    """
    Creates a sink wrapping a new Java ``TestUpsertSink``.

    :param keys: The keys; they are copied one by one into a Java ``String`` array.
    :param is_append_only: Passed unchanged to the Java ``TestUpsertSink`` constructor.
    """
    TestTableSink._ensure_initialized()

    gateway = get_gateway()
    j_key_array = gateway.new_array(gateway.jvm.String, len(keys))
    for position, key in enumerate(keys):
        j_key_array[position] = key

    j_sink = gateway.jvm.TestUpsertSink(j_key_array, is_append_only)
    super(TestUpsertSink, self).__init__(j_sink)
def __init__(self, source_path, field_names, field_types):
    # type: (str, list[str], list[DataType]) -> None
    """
    Creates a table source wrapping a new Java ``CsvTableSource``.

    :param source_path: Passed unchanged to the Java ``CsvTableSource`` constructor.
    :param field_names: The field names; converted to a Java ``String`` array.
    :param field_types: The field types; each one is converted with ``_to_java_type``
                        and the results are put into a Java ``TypeInformation`` array.
    """
    jvm = get_gateway().jvm
    j_names = utils.to_jarray(jvm.String, field_names)
    j_types = utils.to_jarray(
        jvm.TypeInformation,
        [_to_java_type(py_type) for py_type in field_types])
    j_csv_table_source = jvm.CsvTableSource(source_path, j_names, j_types)
    super(CsvTableSource, self).__init__(j_csv_table_source)
def upsert_results(keys):
    """
    Retrieves the results from an upsert table sink.

    :param keys: The keys; they are copied one by one into a Java ``int`` array that is
                 handed to ``RowCollector.upsertResults``.
    :return: Whatever ``RowCollector.upsertResults`` returns for the values fetched
             (and cleared) with ``RowCollector.getAndClearValues()``.
    """
    gateway = get_gateway()
    j_key_array = gateway.new_array(gateway.jvm.int, len(keys))
    for position, key in enumerate(keys):
        j_key_array[position] = key

    collected_values = gateway.jvm.RowCollector.getAndClearValues()
    return gateway.jvm.RowCollector.upsertResults(collected_values, j_key_array)
def to_jarray(j_type, arr):
    """
    Copies a Python sequence into a newly created Java array.

    :param j_type: The Java type of the array elements.
    :param arr: The Python sequence whose elements are copied in order.
    :return: The Java array, with the same length as ``arr``.
    """
    j_array = get_gateway().new_array(j_type, len(arr))
    for position, element in enumerate(arr):
        j_array[position] = element
    return j_array
def over(cls, size):
    # type: (str) -> TumbleWithSize
    """
    Creates a tumbling window.

    Tumbling windows are fixed-size, consecutive, non-overlapping windows of a specified
    fixed length. For example, a tumbling window of 5 minutes size groups elements in
    5 minutes intervals.

    :param size: The size of the window as time or row-count interval.
    :return: A partially defined tumbling window.
    """
    j_tumble_with_size = get_gateway().jvm.Tumble.over(size)
    return TumbleWithSize(j_tumble_with_size)
def partition_by(cls, partition_by):
    # type: (str) -> OverWindowPartitioned
    """
    Partitions the elements on some partition keys.

    Each partition is individually sorted and aggregate functions are applied to each
    partition separately.

    :param partition_by: List of field references.
    :return: An over window with defined partitioning.
    """
    j_partitioned = get_gateway().jvm.Over.partitionBy(partition_by)
    return OverWindowPartitioned(j_partitioned)
def order_by(cls, order_by):
    # type: (str) -> OverWindowPartitionedOrdered
    """
    Specifies the time attribute on which rows are ordered.

    For streaming tables, reference a rowtime or proctime time attribute here to specify
    the time mode.

    :param order_by: Field reference.
    :return: An over window with defined order.
    """
    j_ordered = get_gateway().jvm.Over.orderBy(order_by)
    return OverWindowPartitionedOrdered(j_ordered)
def with_gap(cls, gap):
    # type: (str) -> SessionWithGap
    """
    Creates a session window.

    The boundaries of session windows are defined by intervals of inactivity, i.e., a
    session window is closed if no event appears for a defined gap period.

    :param gap: Specifies how long (as interval of milliseconds) to wait for new data
                before closing the session window.
    :return: A partially defined session window.
    """
    j_session_with_gap = get_gateway().jvm.Session.withGap(gap)
    return SessionWithGap(j_session_with_gap)
def properties(self, property_dict):
    """
    Sets the configuration properties for the Kafka consumer. Resets previously set
    properties.

    :param property_dict: The dict object containing the configuration properties for
                          the Kafka consumer. Both the keys and values should be strings.
    :return: This object.
    """
    # The entries are copied into a fresh java.util.Properties instance.
    j_properties = get_gateway().jvm.java.util.Properties()
    for name, value in property_dict.items():
        j_properties.setProperty(name, value)

    self._j_kafka = self._j_kafka.properties(j_properties)
    return self
def watermarks_from_strategy(self, strategy):
    """
    Sets a custom watermark strategy to be used for the rowtime attribute.

    :param strategy: The java canonical class name of the WatermarkStrategy. The
                     WatermarkStrategy must have a public no-argument constructor and
                     must be found by the current Java context class loader.
    :return: This rowtime descriptor.
    """
    class_loader = get_gateway().jvm.Thread.currentThread().getContextClassLoader()
    # The class is instantiated on the Java side via its no-argument constructor.
    j_strategy = class_loader.loadClass(strategy).newInstance()
    self._j_rowtime = self._j_rowtime.watermarksFromStrategy(j_strategy)
    return self
def failure_handler_custom(self, failure_handler_class_name):
    """
    Configures a failure handling strategy in case a request to Elasticsearch fails.

    This strategy allows for custom failure handling using a
    ``ActionRequestFailureHandler``.

    :param failure_handler_class_name: The name of the Java class to use; it is loaded
                                       with the current thread's context class loader
                                       and the loaded class (not an instance) is passed
                                       on.
    :return: This object.
    """
    class_loader = get_gateway().jvm.Thread.currentThread().getContextClassLoader()
    j_handler_class = class_loader.loadClass(failure_handler_class_name)
    self._j_elasticsearch = self._j_elasticsearch.failureHandlerCustom(j_handler_class)
    return self
def timestamps_from_extractor(self, extractor):
    """
    Sets a custom timestamp extractor to be used for the rowtime attribute.

    :param extractor: The java canonical class name of the TimestampExtractor to extract
                      the rowtime attribute from the physical type. The
                      TimestampExtractor must have a public no-argument constructor and
                      must be found by the current Java context class loader.
    :return: This rowtime descriptor.
    """
    class_loader = get_gateway().jvm.Thread.currentThread().getContextClassLoader()
    # The class is instantiated on the Java side via its no-argument constructor.
    j_extractor = class_loader.loadClass(extractor).newInstance()
    self._j_rowtime = self._j_rowtime.timestampsFromExtractor(j_extractor)
    return self
def over(cls, size):
    # type: (str) -> SlideWithSize
    """
    Creates a sliding window.

    Sliding windows have a fixed size and slide by a specified slide interval. If the
    slide interval is smaller than the window size, sliding windows are overlapping.
    Thus, an element can be assigned to multiple windows.

    For example, a sliding window of size 15 minutes with 5 minutes sliding interval
    groups elements of 15 minutes and evaluates every five minutes. Each element is
    contained in three consecutive window evaluations.

    :param size: The size of the window as time or row-count interval.
    :return: A partially specified sliding window.
    """
    j_slide_with_size = get_gateway().jvm.Slide.over(size)
    return SlideWithSize(j_slide_with_size)
def register_table_sink(self, name, field_names, field_types, table_sink):
    """
    Registers an external :class:`TableSink` with given field names and types in this
    :class:`TableEnvironment`'s catalog.

    Registered sink tables can be referenced in SQL DML statements.

    :param name: The name under which the :class:`TableSink` is registered.
    :param field_names: The field names to register with the :class:`TableSink`.
    :param field_types: The field types to register with the :class:`TableSink`.
    :param table_sink: The :class:`TableSink` to register.
    """
    jvm = get_gateway().jvm
    j_names = utils.to_jarray(jvm.String, field_names)
    j_types = utils.to_jarray(
        jvm.TypeInformation,
        [type_utils.to_java_type(py_type) for py_type in field_types])
    self._j_tenv.registerTableSink(name, j_names, j_types, table_sink._j_table_sink)
def sink_partitioner_custom(self, partitioner_class_name):
    """
    Configures how to partition records from Flink's partitions into Kafka's partitions.

    This strategy allows for a custom partitioner by providing an implementation of
    ``FlinkKafkaPartitioner``.

    :param partitioner_class_name: The java canonical class name of the
                                   FlinkKafkaPartitioner. The FlinkKafkaPartitioner must
                                   have a public no-argument constructor and must be
                                   found by the current Java context class loader.
    :return: This object.
    """
    class_loader = get_gateway().jvm.Thread.currentThread().getContextClassLoader()
    # The loaded class itself (not an instance) is handed to the descriptor.
    j_partitioner_class = class_loader.loadClass(partitioner_class_name)
    self._j_kafka = self._j_kafka.sinkPartitionerCustom(j_partitioner_class)
    return self
def __init__(self, path, field_delimiter=',', num_files=1, write_mode=None):
    # type: (str, str, int, int) -> None
    """
    Creates a table sink wrapping a new Java ``CsvTableSink``.

    :param path: Passed unchanged to the Java ``CsvTableSink`` constructor.
    :param field_delimiter: The field delimiter, ``','`` by default; wrapped in a Scala
                            ``Option``.
    :param num_files: The number of files, ``1`` by default; wrapped in a Scala
                      ``Option``.
    :param write_mode: ``WriteMode.NO_OVERWRITE``, ``WriteMode.OVERWRITE`` or ``None``.
                       ``None`` is translated to an empty Scala ``Option``.
    :raises Exception: If ``write_mode`` is none of the supported values.
    """
    jvm = get_gateway().jvm

    # Translate the Python write mode into a scala.Option of the Java enum constant.
    if write_mode == WriteMode.NO_OVERWRITE:
        j_write_mode_option = jvm.scala.Option.apply(
            jvm.org.apache.flink.core.fs.FileSystem.WriteMode.NO_OVERWRITE)
    elif write_mode == WriteMode.OVERWRITE:
        j_write_mode_option = jvm.scala.Option.apply(
            jvm.org.apache.flink.core.fs.FileSystem.WriteMode.OVERWRITE)
    elif write_mode is None:
        j_write_mode_option = jvm.scala.Option.empty()
    else:
        raise Exception('Unsupported write_mode: %s' % write_mode)

    j_delimiter_option = jvm.scala.Option.apply(field_delimiter)
    j_num_files_option = jvm.scala.Option.apply(num_files)
    super(CsvTableSink, self).__init__(
        jvm.CsvTableSink(path, j_delimiter_option, j_num_files_option, j_write_mode_option))
def _ensure_initialized(cls):
    """
    Imports the Java test sink classes into the gateway's JVM view, at most once.

    Nothing is done when ``TestTableSink._inited`` is already set. Otherwise the Flink
    source tree is searched for the ``flink-table-planner`` tests jar, the Java test
    classes are imported and ``TestTableSink._inited`` is set.

    :raises unittest.SkipTest: If no ``flink-table-planner*-tests.jar`` is found below
                               the Flink source root directory.
    """
    if TestTableSink._inited:
        return

    flink_source_root = _find_flink_source_root()
    jar_pattern = os.path.join(
        flink_source_root,
        "flink-table/flink-table-planner/target/"
        "flink-table-planner*-tests.jar")
    if not glob.glob(jar_pattern):
        raise unittest.SkipTest(
            "'flink-table-planner*-tests.jar' is not available. Will skip the related tests.")

    jvm = get_gateway().jvm
    for java_class_name in (
            "org.apache.flink.table.runtime.stream.table.TestAppendSink",
            "org.apache.flink.table.runtime.stream.table.TestRetractSink",
            "org.apache.flink.table.runtime.stream.table.TestUpsertSink",
            "org.apache.flink.table.runtime.stream.table.RowCollector"):
        java_import(jvm, java_class_name)

    TestTableSink._inited = True
def insert_into(self, table_path, *table_path_continued):
    """
    Writes the :class:`Table` to a :class:`TableSink` that was registered under the
    specified name. For the path resolution algorithm see
    :func:`~TableEnvironment.useDatabase`.

    Example:
    ::

        >>> tab.insert_into("print")

    :param table_path: The first part of the path of the registered :class:`TableSink`
                       to which the :class:`Table` is written. It is a separate
                       parameter to ensure that at least the name of the
                       :class:`Table` is provided.
    :param table_path_continued: The remaining part of the path of the registered
                                 :class:`TableSink` to which the :class:`Table` is
                                 written.
    """
    # The remaining path parts are passed to Java as a String array.
    j_remaining_path = to_jarray(get_gateway().jvm.String, table_path_continued)
    self._j_table.insertInto(table_path, j_remaining_path)
def get_table_environment(cls, table_config):
    """
    Returns a :class:`StreamTableEnvironment` or a :class:`BatchTableEnvironment`
    which matches the :class:`TableConfig`'s content.

    When the config carries a parallelism, it is set on the execution environment of the
    created table environment.

    :param table_config: The TableConfig for the new TableEnvironment.
    :return: Desired :class:`TableEnvironment`.
    """
    jvm = get_gateway().jvm

    if table_config.is_stream:
        j_env = jvm.StreamExecutionEnvironment.getExecutionEnvironment()
        t_env = StreamTableEnvironment(jvm.StreamTableEnvironment.create(j_env))
    else:
        j_env = jvm.ExecutionEnvironment.getExecutionEnvironment()
        t_env = BatchTableEnvironment(jvm.BatchTableEnvironment.create(j_env))

    parallelism = table_config.parallelism
    if parallelism is not None:
        t_env._j_tenv.execEnv().setParallelism(parallelism)
    return t_env
def scan(self, *table_path):
    """
    Scans a registered table and returns the resulting :class:`Table`.

    A table to scan must be registered in the TableEnvironment. It can be either directly
    registered as TableSource or Table.

    Examples:

    Scanning a directly registered table
    ::

        >>> tab = t_env.scan("tableName")

    Scanning a table from a registered catalog
    ::

        >>> tab = t_env.scan("catalogName", "dbName", "tableName")

    :param table_path: The path of the table to scan.
    :throws: Exception if no table is found using the given table path.
    :return: The resulting :class:`Table`
    """
    # The path parts are passed to Java as a String array.
    j_path_parts = utils.to_jarray(get_gateway().jvm.String, table_path)
    return Table(self._j_tenv.scan(j_path_parts))
def to_java_type(py_type):
    """
    Returns the Java ``Types`` constant that corresponds to a ``DataTypes`` member.

    The lookup table needs the Py4J gateway, so it is built lazily on the first call and
    cached in the module-level ``_data_types_mapping`` afterwards. Construction is
    guarded by ``_lock`` so that it happens only once.

    :param py_type: One of the ``DataTypes`` members listed in the mapping below.
    :return: The matching ``org.apache.flink.api.common.typeinfo.Types`` constant.
    :raises KeyError: If ``py_type`` has no entry in the mapping.
    """
    global _data_types_mapping

    if _data_types_mapping is None:
        with _lock:
            # Re-check while holding the lock: another caller may have built the mapping
            # while this one was waiting, in which case it must not be built again.
            if _data_types_mapping is None:
                j_types = get_gateway().jvm.org.apache.flink.api.common.typeinfo.Types
                _data_types_mapping = {
                    DataTypes.STRING: j_types.STRING,
                    DataTypes.BOOLEAN: j_types.BOOLEAN,
                    DataTypes.BYTE: j_types.BYTE,
                    DataTypes.CHAR: j_types.CHAR,
                    DataTypes.SHORT: j_types.SHORT,
                    DataTypes.INT: j_types.INT,
                    DataTypes.LONG: j_types.LONG,
                    DataTypes.FLOAT: j_types.FLOAT,
                    DataTypes.DOUBLE: j_types.DOUBLE,
                    DataTypes.DATE: j_types.SQL_DATE,
                    DataTypes.TIME: j_types.SQL_TIME,
                    DataTypes.TIMESTAMP: j_types.SQL_TIMESTAMP,
                }

    return _data_types_mapping[py_type]
def __init__(self, part_prefix: str, part_suffix: str):
    """
    Creates the underlying Java ``OutputFileConfig`` and stores it in ``j_obj``.

    :param part_prefix: Passed as first argument to the Java constructor.
    :param part_suffix: Passed as second argument to the Java constructor.
    """
    j_output_file_config_class = get_gateway().jvm.org.apache.flink.streaming.api \
        .functions.sink.filesystem.OutputFileConfig
    self.j_obj = j_output_file_config_class(part_prefix, part_suffix)
def _ternary_op(op_name: str, first, second, third) -> Expression:
    """
    Calls the Java ``Expressions`` method named ``op_name`` with three operands.

    :param op_name: The name of the method to look up on the Java ``Expressions`` class.
    :param first: The first operand; converted with ``_get_java_expression``.
    :param second: The second operand; converted with ``_get_java_expression``.
    :param third: The third operand; converted with ``_get_java_expression``.
    :return: An :class:`Expression` wrapping the Java result.
    """
    j_operation = getattr(get_gateway().jvm.Expressions, op_name)
    j_operands = [_get_java_expression(operand) for operand in (first, second, third)]
    return Expression(j_operation(*j_operands))
def get_default() -> 'TableConfig':
    """
    Wraps the result of the Java ``TableConfig.getDefault()``.

    :return: A TableConfig object with default settings.
    """
    j_default_config = get_gateway().jvm.TableConfig.getDefault()
    return TableConfig(j_default_config)
def __init__(self):
    """
    Creates a new Java ``Rowtime`` descriptor, keeps it in ``_j_rowtime`` and passes it
    to the parent constructor.
    """
    self._j_rowtime = get_gateway().jvm.Rowtime()
    super(Rowtime, self).__init__(self._j_rowtime)
def defaults() -> 'JdbcExecutionOptions':
    """
    Wraps the result of the Java ``JdbcExecutionOptions.defaults()``.

    :return: A :class:`JdbcExecutionOptions` built from the Java defaults.
    """
    j_options_class = get_gateway().jvm.org.apache.flink.connector.jdbc.JdbcExecutionOptions
    return JdbcExecutionOptions(j_jdbc_execution_options=j_options_class.defaults())
def __init__(self):
    """
    Initializes the descriptor with the ``RETRACT_CSV`` constant of the Java
    ``CsvRetractTableSinkFactory`` as format type.
    """
    j_factory_class = \
        get_gateway().jvm.org.apache.flink.python.connector.CsvRetractTableSinkFactory
    # NOTE(review): the second argument is presumably the format version - confirm
    # against the parent constructor.
    super(RetractCsv, self).__init__(j_factory_class.RETRACT_CSV, 1)
def from_message_time(timestamp: int) -> 'StartCursor':
    """
    Wraps the result of the Java ``StartCursor.fromMessageTime``.

    :param timestamp: Passed unchanged to ``StartCursor.fromMessageTime``.
    :return: A :class:`StartCursor` wrapping the Java cursor.
    """
    j_start_cursor_class = get_gateway().jvm \
        .org.apache.flink.connector.pulsar.source.enumerator.cursor.StartCursor
    j_cursor = j_start_cursor_class.fromMessageTime(timestamp)
    return StartCursor(j_cursor)
def __init__(self):
    """
    Creates a new Java ``TableConfig`` and leaves the stream flag and the parallelism
    unset (``None``).
    """
    jvm = get_gateway().jvm
    self._jvm = jvm
    self._j_table_config = jvm.TableConfig()
    # Both settings start out unset.
    self._is_stream = None  # type: bool
    self._parallelism = None  # type: int
def setUpClass(cls):
    """
    Remembers the current thread's context class loader in ``_cxt_clz_loader`` and then
    loads the jars of the Kafka connector module.
    """
    super(KafkaDescriptorTests, cls).setUpClass()
    # The loader is captured before the module jars are loaded.
    current_thread = get_gateway().jvm.Thread.currentThread()
    cls._cxt_clz_loader = current_thread.getContextClassLoader()
    _load_specific_flink_module_jars('/flink-connectors/flink-connector-kafka')
def _to_j_subscription_type(self):
    """
    Looks up the attribute of the Java ``SubscriptionType`` named like ``self.name``.

    :return: The Java ``org.apache.pulsar.client.api.SubscriptionType`` attribute with
             the same name as this object.
    """
    j_subscription_type_class = \
        get_gateway().jvm.org.apache.pulsar.client.api.SubscriptionType
    return getattr(j_subscription_type_class, self.name)
def _leaf_op(op_name: str) -> Expression:
    """
    Calls the Java ``Expressions`` method named ``op_name`` without arguments.

    :param op_name: The name of the method to look up on the Java ``Expressions`` class.
    :return: An :class:`Expression` wrapping the Java result.
    """
    j_operation = getattr(get_gateway().jvm.Expressions, op_name)
    return Expression(j_operation())
def add_python_archive(self, archive_path: str, target_dir: str = None):
    """
    Adds a python archive file. The file will be extracted to the working directory of
    python UDF worker.

    If the parameter "target_dir" is specified, the archive file will be extracted to a
    directory named ${target_dir}. Otherwise, the archive file will be extracted to a
    directory with the same name of the archive file.

    If python UDF depends on a specific python version which does not exist in the
    cluster, this method can be used to upload the virtual environment. Note that the
    path of the python interpreter contained in the uploaded environment should be
    specified via the method :func:`pyflink.table.TableConfig.set_python_executable`.

    The files uploaded via this method are also accessible in UDFs via relative path.

    Example:
    ::

        # command executed in shell
        # assert the relative path of python interpreter is py_env/bin/python
        $ zip -r py_env.zip py_env

        # python code
        >>> stream_env.add_python_archive("py_env.zip")
        >>> stream_env.set_python_executable("py_env.zip/py_env/bin/python")

        # or
        >>> stream_env.add_python_archive("py_env.zip", "myenv")
        >>> stream_env.set_python_executable("myenv/py_env/bin/python")

        # the files contained in the archive file can be accessed in UDF
        >>> def my_udf():
        ...     with open("myenv/py_env/data/data.txt") as f:
        ...         ...

    .. note::

        Please make sure the uploaded python environment matches the platform that the
        cluster is running on and that the python version must be 3.6 or higher.

    .. note::

        Currently only zip-format is supported. i.e. zip, jar, whl, egg, etc.
        The other archive formats such as tar, tar.gz, 7z, rar, etc are not supported.

    :param archive_path: The archive file path.
    :param target_dir: Optional, the target dir name that the archive file extracted to.
    """
    jvm = get_gateway().jvm

    # An archive with a target directory is stored as "<path><PARAM_DELIMITER><dir>".
    archive_entry = archive_path
    if target_dir is not None:
        archive_entry = jvm.PythonDependencyUtils.PARAM_DELIMITER.join(
            [archive_path, target_dir])

    env_config = jvm.org.apache.flink.python.util.PythonConfigUtil \
        .getEnvironmentConfig(self._j_stream_execution_environment)
    option_key = jvm.PythonOptions.PYTHON_ARCHIVES.key()
    registered_archives = env_config.getString(option_key, None)

    # Append to the archives registered so far instead of replacing them.
    if registered_archives is None:
        updated_archives = archive_entry
    else:
        updated_archives = jvm.PythonDependencyUtils.FILE_DELIMITER.join(
            [registered_archives, archive_entry])
    env_config.setString(option_key, updated_archives)
def _to_j_time_characteristic(self):
    """
    Looks up the attribute of the Java ``TimeCharacteristic`` named like ``self.name``.

    :return: The Java ``org.apache.flink.streaming.api.TimeCharacteristic`` attribute
             with the same name as this object.
    """
    j_time_characteristic_class = \
        get_gateway().jvm.org.apache.flink.streaming.api.TimeCharacteristic
    return getattr(j_time_characteristic_class, self.name)
def __init__(self, database_name=None, object_name=None, j_object_path=None):
    """
    Wraps a Java ``ObjectPath``.

    :param database_name: The database name; only used when ``j_object_path`` is
                          ``None``.
    :param object_name: The object name; only used when ``j_object_path`` is ``None``.
    :param j_object_path: An existing Java ``ObjectPath`` to wrap. When it is ``None`` a
                          new one is created from ``database_name`` and ``object_name``.
    """
    if j_object_path is not None:
        self._j_object_path = j_object_path
    else:
        self._j_object_path = get_gateway().jvm.ObjectPath(database_name, object_name)
def of_mebi_bytes(mebi_bytes: int) -> 'MemorySize':
    """
    Wraps the result of the Java ``MemorySize.ofMebiBytes``.

    :param mebi_bytes: Passed unchanged to ``MemorySize.ofMebiBytes``.
    :return: A :class:`MemorySize` wrapping the Java object.
    """
    j_memory_size_class = get_gateway().jvm.org.apache.flink.configuration.MemorySize
    return MemorySize(j_memory_size_class.ofMebiBytes(mebi_bytes))
def __init__(self):
    """
    Obtains a Java builder from ``JdbcExecutionOptions.builder()`` and keeps it in
    ``_j_builder``.
    """
    j_options_class = get_gateway().jvm.org.apache.flink.connector.jdbc.JdbcExecutionOptions
    self._j_builder = j_options_class.builder()
def __init__(self, j_stream_query_config=None):
    """
    Wraps a Java ``StreamQueryConfig``.

    :param j_stream_query_config: An existing Java ``StreamQueryConfig`` to wrap. When it
                                  is ``None`` a new one is created through the gateway.
    """
    if j_stream_query_config is None:
        j_stream_query_config = get_gateway().jvm.StreamQueryConfig()
    self._j_stream_query_config = j_stream_query_config
    super(StreamQueryConfig, self).__init__(self._j_stream_query_config)
def __init__(self):
    """
    Instantiates the Java ``JdbcConnectionOptions.JdbcConnectionOptionsBuilder`` and
    keeps it in ``_j_options_builder``.
    """
    j_connection_options_class = \
        get_gateway().jvm.org.apache.flink.connector.jdbc.JdbcConnectionOptions
    self._j_options_builder = j_connection_options_class.JdbcConnectionOptionsBuilder()
def _from_file(self, filename, schema):
    """
    Creates a :class:`Table` from a file.

    The file is first turned into a Java DataSet with
    ``PythonBridgeUtils.createDataSetFromFile`` and then into a table with
    ``PythonTableUtils.fromDataSet``.

    :param filename: The file handed to ``createDataSetFromFile``.
    :param schema: The schema; converted with ``_to_java_type`` before it is handed to
                   ``fromDataSet``.
    :return: The resulting :class:`Table`.
    """
    jvm = get_gateway().jvm
    # NOTE(review): the meaning of the trailing ``True`` flag is defined by
    # PythonBridgeUtils.createDataSetFromFile - confirm there.
    j_data_set = jvm.PythonBridgeUtils.createDataSetFromFile(
        self._j_tenv.execEnv(), filename, True)
    j_table = jvm.PythonTableUtils.fromDataSet(
        self._j_tenv, j_data_set, _to_java_type(schema))
    return Table(j_table)
def __init__(self, j_function: Union[str, JavaObject]):
    """
    Stores the Java function in ``_j_function``.

    :param j_function: Either a Java object, which is stored as is, or a string, which is
                       looked up on the gateway's JVM view and called without arguments
                       to create the object.
    """
    if isinstance(j_function, str):
        # Resolve the name on the JVM view and instantiate it.
        j_function = get_gateway().jvm.__getattr__(j_function)()
    self._j_function = j_function
def setUpClass(cls):
    """
    Remembers the current thread's context class loader in ``_cxt_clz_loader`` and then
    loads the jars of the Elasticsearch base connector module.
    """
    super(ElasticsearchDescriptorTest, cls).setUpClass()
    # The loader is captured before the module jars are loaded.
    current_thread = get_gateway().jvm.Thread.currentThread()
    cls._cxt_clz_loader = current_thread.getContextClassLoader()
    _load_specific_flink_module_jars('/flink-connectors/flink-connector-elasticsearch-base')
def _to_j_semantic(self):
    """
    Looks up the attribute of the Java ``FlinkKafkaProducer.Semantic`` named like
    ``self.name``.

    :return: The Java ``FlinkKafkaProducer.Semantic`` attribute with the same name as
             this object.
    """
    j_semantic_class = get_gateway().jvm \
        .org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic
    return getattr(j_semantic_class, self.name)
def __init__(self, partition_spec):
    """
    Wraps a Java ``CatalogPartitionSpec``.

    :param partition_spec: Either a dict, from which a new Java ``CatalogPartitionSpec``
                           is created, or any other object, which is stored as is.
    """
    if not isinstance(partition_spec, dict):
        self._j_catalog_partition_spec = partition_spec
    else:
        self._j_catalog_partition_spec = \
            get_gateway().jvm.CatalogPartitionSpec(partition_spec)
def __init__(self, j_table_config=None):
    """
    Wraps a Java ``TableConfig``.

    :param j_table_config: An existing Java ``TableConfig`` to wrap. When it is ``None``
                           a new one is created through the gateway.
    """
    # The gateway is obtained in both cases, as before.
    jvm = get_gateway().jvm
    self._j_table_config = jvm.TableConfig() if j_table_config is None else j_table_config
def from_string(full_name):
    """
    Creates an :class:`ObjectPath` from the result of the Java
    ``ObjectPath.fromString``.

    :param full_name: Passed unchanged to ``ObjectPath.fromString``.
    :return: An :class:`ObjectPath` wrapping the Java object path.
    """
    j_object_path = get_gateway().jvm.ObjectPath.fromString(full_name)
    return ObjectPath(j_object_path=j_object_path)
def latest() -> 'StopCursor':
    """
    Wraps the result of the Java ``StopCursor.latest()``.

    :return: A :class:`StopCursor` wrapping the Java cursor.
    """
    j_stop_cursor_class = get_gateway().jvm \
        .org.apache.flink.connector.pulsar.source.enumerator.cursor.StopCursor
    j_cursor = j_stop_cursor_class.latest()
    return StopCursor(j_cursor)
def _unary_op(op_name: str, arg) -> Expression:
    """
    Calls the Java ``Expressions`` method named ``op_name`` with a single operand.

    :param op_name: The name of the method to look up on the Java ``Expressions`` class.
    :param arg: The operand; converted with ``_get_java_expression``.
    :return: An :class:`Expression` wrapping the Java result.
    """
    j_operation = getattr(get_gateway().jvm.Expressions, op_name)
    j_operand = _get_java_expression(arg)
    return Expression(j_operation(j_operand))
def test_set_sys_executable_for_local_mode(self):
    """
    Checks that the python executable option stored in the environment configuration of
    ``self.t_env`` equals ``sys.executable``.
    """
    option_key = get_gateway().jvm.PythonOptions.PYTHON_EXECUTABLE.key()
    configured_executable = get_j_env_configuration(self.t_env).getString(option_key, None)
    self.assertEqual(sys.executable, configured_executable)
def at_event_time(timestamp: int) -> 'StopCursor':
    """
    Wraps the result of the Java ``StopCursor.atEventTime``.

    :param timestamp: Passed unchanged to ``StopCursor.atEventTime``.
    :return: A :class:`StopCursor` wrapping the Java cursor.
    """
    j_stop_cursor_class = get_gateway().jvm \
        .org.apache.flink.connector.pulsar.source.enumerator.cursor.StopCursor
    j_cursor = j_stop_cursor_class.atEventTime(timestamp)
    return StopCursor(j_cursor)
def __init__(self):
    """
    Obtains a Java builder from ``PulsarSource.builder()`` and keeps it in
    ``_j_pulsar_source_builder``.
    """
    j_pulsar_source_class = \
        get_gateway().jvm.org.apache.flink.connector.pulsar.source.PulsarSource
    self._j_pulsar_source_builder = j_pulsar_source_class.builder()
def tearDownClass(cls):
    """
    Sets the current thread's context class loader to the one stored in
    ``cls._cxt_clz_loader``. Nothing is done when no loader is stored.
    """
    saved_loader = cls._cxt_clz_loader
    if saved_loader is None:
        return
    get_gateway().jvm.Thread.currentThread().setContextClassLoader(saved_loader)
def __init__(self, field_names, field_types):
    """
    Creates a sink wrapping a new Java ``TestRetractSink``.

    :param field_names: Passed unchanged to the parent constructor.
    :param field_types: Passed unchanged to the parent constructor.
    """
    TestTableSink._ensure_initialized()

    j_sink = get_gateway().jvm.TestRetractSink()
    super(TestRetractSink, self).__init__(j_sink, field_names, field_types)
def _get_python_env():
    """
    Creates a Java ``PythonEnv`` with the ``PROCESS`` execution type.

    :return: A new ``org.apache.flink.table.functions.python.PythonEnv``.
    """
    j_python_env_class = get_gateway().jvm.org.apache.flink.table.functions.python.PythonEnv
    return j_python_env_class(j_python_env_class.ExecType.PROCESS)