Code Example #1
 def startup_loopback_server():
     from pyflink.common import Configuration
     from pyflink.fn_execution.beam.beam_worker_pool_service import \
         BeamFnLoopbackWorkerPoolServicer
     config = Configuration(j_configuration=j_configuration)
     config.set_string("python.loopback-server.address",
                       BeamFnLoopbackWorkerPoolServicer().start())
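Note that `j_configuration` in this excerpt is captured from the enclosing scope rather than defined locally. A minimal sketch of how that Java configuration object is typically obtained, following the pattern of the test examples below that call `get_j_env_configuration` (the import path of that helper is assumed from recent PyFlink versions):

    # Sketch only: wrap the Java configuration of a local execution environment
    # so that loopback-related options can be set from Python.
    from pyflink.common import Configuration
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.util.java_utils import get_j_env_configuration  # assumed import path

    env = StreamExecutionEnvironment.get_execution_environment()
    j_configuration = get_j_env_configuration(env._j_stream_execution_environment)
    config = Configuration(j_configuration=j_configuration)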
Code Example #2
File: test_window.py  Project: tongcheng-elong/flink
    def test_side_output_late_data(self):
        self.env.set_parallelism(1)
        config = Configuration(j_configuration=get_j_env_configuration(
            self.env._j_stream_execution_environment))
        config.set_integer('python.fn-execution.bundle.size', 1)
        jvm = get_gateway().jvm
        watermark_strategy = WatermarkStrategy(
            jvm.org.apache.flink.api.common.eventtime.WatermarkStrategy.
            forGenerator(jvm.org.apache.flink.streaming.api.functions.python.
                         eventtime.PerElementWatermarkGenerator.getSupplier())
        ).with_timestamp_assigner(SecondColumnTimestampAssigner())

        tag = OutputTag('late-data',
                        type_info=Types.ROW([Types.STRING(),
                                             Types.INT()]))
        ds1 = self.env.from_collection(
            [('a', 0), ('a', 8), ('a', 4), ('a', 6)],
            type_info=Types.ROW([Types.STRING(), Types.INT()]))
        ds2 = ds1.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda e: e[0]) \
            .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
            .allowed_lateness(0) \
            .side_output_late_data(tag) \
            .process(CountWindowProcessFunction(),
                     Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
        main_sink = DataStreamTestSinkFunction()
        ds2.add_sink(main_sink)
        side_sink = DataStreamTestSinkFunction()
        ds2.get_side_output(tag).add_sink(side_sink)

        self.env.execute('test_side_output_late_data')
        main_expected = ['(a,0,5,1)', '(a,5,10,2)']
        self.assert_equals_sorted(main_expected, main_sink.get_results())
        side_expected = ['+I[a, 4]']
        self.assert_equals_sorted(side_expected, side_sink.get_results())
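The `CountWindowProcessFunction` used above is a test helper that is not part of this excerpt. Judging from the expected output format `(key, window_start, window_end, count)`, it is presumably a `ProcessWindowFunction` along the lines of the following hypothetical sketch (class body and import path are assumptions, not the original helper):

    from pyflink.datastream.functions import ProcessWindowFunction

    class CountWindowProcessFunction(ProcessWindowFunction):
        # Emit (key, window start, window end, number of elements) per fired window.
        def process(self, key, context, elements):
            return [(key, context.window().start, context.window().end,
                     len([e for e in elements]))]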
Code Example #3
 def startup_loopback_server():
     from pyflink.common import Configuration
     from pyflink.fn_execution.beam.beam_worker_pool_service import \
         BeamFnLoopbackWorkerPoolServicer
     config = Configuration(j_configuration=j_configuration)
     config.set_string("PYFLINK_LOOPBACK_SERVER_ADDRESS",
                       BeamFnLoopbackWorkerPoolServicer().start())
Code Example #4
    def test_from_configuration(self):

        config = Configuration()
        config.set_string("execution.runtime-mode", "batch")

        actual_setting = EnvironmentSettings.from_configuration(config)
        self.assertFalse(actual_setting.is_streaming_mode(), "Use batch mode.")
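Code Example #13 below shows the reverse direction (`to_configuration`). A minimal round-trip sketch, assuming the `in_batch_mode` factory method available in recent PyFlink versions:

    # Round trip: EnvironmentSettings -> Configuration -> EnvironmentSettings.
    settings = EnvironmentSettings.in_batch_mode()
    config = settings.to_configuration()
    restored = EnvironmentSettings.from_configuration(config)
    assert not restored.is_streaming_mode()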
Code Example #5
    def test_add_configuration(self):
        table_config = TableConfig.get_default()
        configuration = Configuration()
        configuration.set_string("k1", "v1")

        table_config.add_configuration(configuration)

        self.assertEqual(table_config.get("k1", ""), "v1")
Code Example #6
    def test_contains_key(self):
        conf = Configuration()
        conf.set_string("k1", "v1")

        contains_k1 = conf.contains_key("k1")
        contains_k2 = conf.contains_key("k2")

        self.assertTrue(contains_k1)
        self.assertFalse(contains_k2)
Code Example #7
    def test_deepcopy(self):
        conf = Configuration()
        conf.set_string("k1", "v1")

        conf2 = deepcopy(conf)

        self.assertEqual(conf2, conf)

        conf2.set_string("k1", "v2")

        self.assertNotEqual(conf2, conf)
Code Example #8
File: orc.py  Project: niaoUncle/flink
    def for_row_type(row_type: RowType,
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
        """
        Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into Orc
        files in a batch fashion.

        Example:
        ::

            >>> row_type = DataTypes.ROW([
            ...     DataTypes.FIELD('string', DataTypes.STRING()),
            ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
            ... ])
            >>> row_type_info = Types.ROW_NAMED(
            ...     ['string', 'int_array'],
            ...     [Types.STRING(), Types.LIST(Types.INT())]
            ... )
            >>> sink = FileSink.for_bulk_format(
            ...     OUTPUT_DIR, OrcBulkWriters.for_row_type(
            ...         row_type=row_type,
            ...         writer_properties=Configuration(),
            ...         hadoop_config=Configuration(),
            ...     )
            ... ).build()
            >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

        Note that in the above example, an identity map to indicate its RowTypeInfo is necessary
        before ``sink_to`` when ``ds`` is a source stream producing **RowData** records,
        because RowDataBulkWriterFactory assumes the input record type is Row.
        """
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()])
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()

        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(), orc_types),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)), row_type)
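As a usage note, `writer_properties` is converted to Java `Properties` via `create_java_properties` and handed to `OrcBulkWriterFactory`, so ORC writer options can be passed through it. A hedged sketch (`orc.compress` is a standard ORC writer option, not something defined by this excerpt; whether a particular key is honoured depends on the ORC version bundled with Flink):

    # Sketch: passing an ORC writer option through writer_properties.
    writer_properties = Configuration()
    writer_properties.set_string('orc.compress', 'SNAPPY')  # standard ORC option; assumed to pass through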
Code Example #9
File: test_utils.py  Project: apache/flink-ml
    def setUp(self):
        self.env = StreamExecutionEnvironment.get_execution_environment()
        self._load_dependency_jars()
        config = Configuration(
            j_configuration=get_j_env_configuration(self.env._j_stream_execution_environment))
        config.set_boolean("execution.checkpointing.checkpoints-after-tasks-finish.enabled", True)

        self.env.set_parallelism(4)
        self.env.enable_checkpointing(100)
        self.env.set_restart_strategy(RestartStrategies.no_restart())
        self.t_env = StreamTableEnvironment.create(self.env)
        self.temp_dir = tempfile.mkdtemp()
Code Example #10
    def setUp(self) -> None:
        from pyflink.datastream import StreamExecutionEnvironment

        super(DataStreamConversionTestCases, self).setUp()
        config = Configuration()
        config.set_string("akka.ask.timeout", "20 s")
        self.env = StreamExecutionEnvironment.get_execution_environment(config)
        self.t_env = StreamTableEnvironment.create(self.env)

        self.env.set_parallelism(2)
        self.t_env.get_config().set("python.fn-execution.bundle.size", "1")
        self.test_sink = DataStreamTestSinkFunction()
Code Example #11
    def test_get_execution_environment_with_config(self):
        configuration = Configuration()
        configuration.set_integer('parallelism.default', 12)
        configuration.set_string('pipeline.name', 'haha')
        env = StreamExecutionEnvironment.get_execution_environment(configuration)
        execution_config = env.get_config()

        self.assertEqual(execution_config.get_parallelism(), 12)
        config = Configuration(
            j_configuration=get_j_env_configuration(env._j_stream_execution_environment))
        self.assertEqual(config.get_string('pipeline.name', ''), 'haha')
Code Example #12
 def _build_parquet_columnar_job(self, row_type: RowType):
     source = FileSource.for_bulk_file_format(
         ParquetColumnarRowInputFormat(row_type, Configuration(), 10, True, False),
         self.parquet_file_name
     ).build()
     ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'parquet-source')
     ds.map(lambda e: e).add_sink(self.test_sink)
Code Example #13
File: environment_settings.py  Project: twalthr/flink
    def to_configuration(self) -> Configuration:
        """
        Convert to `pyflink.common.Configuration`.

        :return: Configuration with specified value.
        """
        return Configuration(j_configuration=self._j_environment_settings.toConfiguration())
Code Example #14
    def test_init(self):
        conf = Configuration()

        self.assertEqual(conf.to_dict(), dict())

        conf.set_string("k1", "v1")
        conf2 = Configuration(conf)

        self.assertEqual(conf2.to_dict(), {"k1": "v1"})
Code Example #15
File: test_file_system.py  Project: xinsmile/flink
 def _build_parquet_columnar_job(self, row_type: RowType,
                                 parquet_file_name: str):
     source = FileSource.for_bulk_file_format(
         ParquetColumnarRowInputFormat(Configuration(), row_type, 10, True,
                                       True), parquet_file_name).build()
     ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(),
                               'parquet-source')
     ds.map(PassThroughMapFunction()).add_sink(self.test_sink)
Code Example #16
    def get_configuration(self):
        """
        Gives direct access to the underlying key-value map for advanced configuration.

        :return: Entire key-value configuration.
        :rtype: Configuration
        """
        return Configuration(j_configuration=self._j_table_config.getConfiguration())
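A short usage sketch combining this accessor with Code Example #5 above (the key and value are illustrative):

    table_config = TableConfig.get_default()
    conf = Configuration()
    conf.set_string("pipeline.name", "demo")   # illustrative key/value
    table_config.add_configuration(conf)
    # Read the value back through the Configuration view of the TableConfig.
    assert table_config.get_configuration().get_string("pipeline.name", "") == "demo"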
Code Example #17
    def get_configuration(self) -> Configuration:
        """
        Get the underlying `pyflink.common.Configuration`.

        :return: Configuration with specified value.
        """
        return Configuration(
            j_configuration=self._j_environment_settings.getConfiguration())
Code Example #18
    def get_configuration(self):
        """
        Returns all key/value configuration.

        :return: All key/value configuration.
        :rtype: Configuration
        """
        return Configuration(
            j_configuration=self._j_table_config.getConfiguration())
Code Example #19
    def to_configuration(self) -> Configuration:
        """
        Convert to `pyflink.common.Configuration`.

        It sets the `table.planner` and `execution.runtime-mode` according to the current
        EnvironmentSetting.

        :return: Configuration with specified value.
        """
        return Configuration(j_configuration=self._j_environment_settings.toConfiguration())
Code Example #20
    def test_key_set(self):
        conf = Configuration()

        conf.set_string("k1", "v1")
        conf.set_string("k2", "v2")
        conf.set_string("k3", "v3")
        key_set = conf.key_set()

        self.assertEqual(key_set, {"k1", "k2", "k3"})
Code Example #21
    def to_configuration(self) -> Configuration:
        """
        Convert to `pyflink.common.Configuration`.

        :return: Configuration with specified value.

        .. note:: Deprecated in 1.15. Please use
                :func:`EnvironmentSettings.get_configuration` instead.
        """
        return Configuration(
            j_configuration=self._j_environment_settings.toConfiguration())
Code Example #22
    def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
            -> JavaObject:
        gateway = get_gateway()
        JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
        j_configuration = get_j_env_configuration(self._j_stream_execution_environment)
        if not self._remote_mode and is_local_deployment(j_configuration):
            from pyflink.common import Configuration
            from pyflink.fn_execution.beam.beam_worker_pool_service import \
                BeamFnLoopbackWorkerPoolServicer

            jvm = gateway.jvm
            env_config = JPythonConfigUtil.getEnvironmentConfig(
                self._j_stream_execution_environment)
            parallelism = self.get_parallelism()
            if parallelism > 1 and env_config.containsKey(jvm.PythonOptions.PYTHON_ARCHIVES.key()):
                import logging
                logging.warning("Lookback mode is disabled as python archives are used and the "
                                "parallelism of the job is greater than 1. The Python user-defined "
                                "functions will be executed in an independent Python process.")
            else:
                config = Configuration(j_configuration=j_configuration)
                config.set_string(
                    "loopback.server.address", BeamFnLoopbackWorkerPoolServicer().start())

        JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

        gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
            self._j_stream_execution_environment)

        JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
            get_field_value(self._j_stream_execution_environment, "transformations"))

        j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
        if job_name is not None:
            j_stream_graph.setJobName(job_name)
        return j_stream_graph
Code Example #23
    def for_row_type(row_type: 'RowType',
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
        """
        Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
        with a predefined schema into Orc files in a batch fashion.

        :param row_type: The RowType of records; it should match the RowTypeInfo of Row records.
        :param writer_properties: Orc writer options.
        :param hadoop_config: Hadoop configuration.
        """
        from pyflink.table.types import RowType
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        from pyflink.table.types import _to_java_data_type
        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()])
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()

        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(), orc_types),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)), row_type)
Code Example #24
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(Configuration()),
        True,
    ).create(j_output_stream)
    row_row_converter = flink.table.data.conversion.RowRowConverter.create(
        _to_java_data_type(row_type)
    )
    row_row_converter.open(row_row_converter.getClass().getClassLoader())
    for row in rows:
        j_bulk_writer.addElement(row_row_converter.toInternal(to_java_data_structure(row)))
    j_bulk_writer.finish()
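A hedged sketch of how this test helper might be called; the row type, file path, and values below are illustrative, not taken from the original test:

    from pyflink.common import Row
    from pyflink.table import DataTypes

    row_type = DataTypes.ROW([
        DataTypes.FIELD('name', DataTypes.STRING()),
        DataTypes.FIELD('num', DataTypes.INT()),
    ])
    _write_row_data_to_parquet_file('/tmp/demo.parquet', row_type,
                                    [Row('a', 1), Row('b', 2)])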
Code Example #25
    def __init__(self,
                 row_type: RowType,
                 hadoop_config: Optional[Configuration] = None,
                 batch_size: int = 2048,
                 is_utc_timestamp: bool = False,
                 is_case_sensitive: bool = True):
        if not hadoop_config:
            hadoop_config = Configuration()

        jvm = get_gateway().jvm
        j_row_type = _to_java_data_type(row_type).getLogicalType()
        produced_type_info = jvm.org.apache.flink.table.runtime.typeutils. \
            InternalTypeInfo.of(j_row_type)
        j_parquet_columnar_format = jvm.org.apache.flink.formats.parquet. \
            ParquetColumnarRowInputFormat(create_hadoop_configuration(hadoop_config),
                                          j_row_type, produced_type_info, batch_size,
                                          is_utc_timestamp, is_case_sensitive)
        super().__init__(j_parquet_columnar_format)
Code Example #26
    def for_row_type(row_type: RowType,
                     hadoop_config: Optional[Configuration] = None,
                     utc_timestamp: bool = False) -> 'BulkWriterFactory':
        """
        Create a RowDataBulkWriterFactory that writes Row records with a defined RowType into
        Parquet files in a batch fashion.

        Example:
        ::

            >>> row_type = DataTypes.ROW([
            ...     DataTypes.FIELD('string', DataTypes.STRING()),
            ...     DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
            ... ])
            >>> row_type_info = Types.ROW_NAMED(
            ...     ['string', 'int_array'],
            ...     [Types.STRING(), Types.LIST(Types.INT())]
            ... )
            >>> sink = FileSink.for_bulk_format(
            ...     OUTPUT_DIR, ParquetBulkWriter.for_row_type(
            ...         row_type,
            ...         hadoop_config=Configuration(),
            ...         utc_timestamp=True,
            ...     )
            ... ).build()
            >>> ds.map(lambda e: e, output_type=row_type_info).sink_to(sink)

        Note that in the above example, an identity map to indicate its RowTypeInfo is necessary
        before ``sink_to`` when ``ds`` is a source stream producing **RowData** records, because
        RowDataBulkWriterFactory assumes the input record type is **Row**.
        """
        if not hadoop_config:
            hadoop_config = Configuration()

        jvm = get_gateway().jvm
        JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
        return RowDataBulkWriterFactory(
            JParquetRowDataBuilder.createWriterFactory(
                _to_java_data_type(row_type).getLogicalType(),
                create_hadoop_configuration(hadoop_config), utc_timestamp),
            row_type)
Code Example #27
    def for_row_type(row_type: 'RowType',
                     hadoop_config: Optional[Configuration] = None,
                     utc_timestamp: bool = False) -> 'BulkWriterFactory':
        """
        Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
        with a predefined schema into Parquet files in a batch fashion.

        :param row_type: The RowType of records; it should match the RowTypeInfo of Row records.
        :param hadoop_config: Hadoop configuration.
        :param utc_timestamp: Whether to use the UTC timezone or the local timezone for the
            conversion between epoch time and LocalDateTime. Hive 0.x/1.x/2.x use the local
            timezone, while Hive 3.x uses UTC.
        """
        if not hadoop_config:
            hadoop_config = Configuration()

        from pyflink.table.types import _to_java_data_type
        jvm = get_gateway().jvm
        JParquetRowDataBuilder = jvm.org.apache.flink.formats.parquet.row.ParquetRowDataBuilder
        return RowDataBulkWriterFactory(JParquetRowDataBuilder.createWriterFactory(
            _to_java_data_type(row_type).getLogicalType(),
            create_hadoop_configuration(hadoop_config),
            utc_timestamp
        ), row_type)
Code Example #28
    def test_add_all(self):
        conf = Configuration()
        conf.set_string("k1", "v1")
        conf2 = Configuration()

        conf2.add_all(conf)
        value1 = conf2.get_string("k1", "")

        self.assertEqual(value1, "v1")

        conf2.add_all(conf, "conf_")
        value2 = conf2.get_string("conf_k1", "")

        self.assertEqual(value2, "v1")
Code Example #29
    def test_add_all_to_dict(self):
        conf = Configuration()

        conf.set_string("k1", "v1")
        conf.set_integer("k2", 1)
        conf.set_float("k3", 1.2)
        conf.set_boolean("k4", True)
        conf.set_bytearray("k5", bytearray([1, 2, 3]))
        target_dict = dict()
        conf.add_all_to_dict(target_dict)

        self.assertEqual(target_dict, {
            "k1": "v1",
            "k2": 1,
            "k3": 1.2,
            "k4": True,
            "k5": bytearray([1, 2, 3])
        })
Code Example #30
    def test_getters_and_setters(self):
        conf = Configuration()

        conf.set_string("str", "v1")
        conf.set_integer("int", 2)
        conf.set_boolean("bool", True)
        conf.set_float("float", 0.5)
        conf.set_bytearray("bytearray", bytearray([1, 2, 3]))

        str_value = conf.get_string("str", "")
        int_value = conf.get_integer("int", 0)
        bool_value = conf.get_boolean("bool", False)
        float_value = conf.get_float("float", 0)
        bytearray_value = conf.get_bytearray("bytearray", bytearray())

        self.assertEqual(str_value, "v1")
        self.assertEqual(int_value, 2)
        self.assertEqual(bool_value, True)
        self.assertEqual(float_value, 0.5)
        self.assertEqual(bytearray_value, bytearray([1, 2, 3]))