def test_is_unaligned_checkpointing_enabled(self):
    """Exercise the unaligned-checkpoint toggles and the alignment timeout."""
    cfg = self.checkpoint_config

    # A fresh config starts with the feature off, not forced, zero timeout.
    self.assertFalse(cfg.is_unaligned_checkpoints_enabled())
    self.assertFalse(cfg.is_force_unaligned_checkpoints())
    self.assertEqual(cfg.get_alignment_timeout(), Duration.of_millis(0))

    # Enabling with no argument switches the feature on.
    cfg.enable_unaligned_checkpoints()
    self.assertTrue(cfg.is_unaligned_checkpoints_enabled())

    # Disabling switches it back off.
    cfg.disable_unaligned_checkpoints()
    self.assertFalse(cfg.is_unaligned_checkpoints_enabled())

    # Enabling with an explicit True behaves like the no-argument form.
    cfg.enable_unaligned_checkpoints(True)
    self.assertTrue(cfg.is_unaligned_checkpoints_enabled())

    # Forcing unaligned checkpoints is tracked as a separate flag.
    cfg.set_force_unaligned_checkpoints(True)
    self.assertTrue(cfg.is_force_unaligned_checkpoints())

    # The alignment timeout round-trips through its setter.
    cfg.set_alignment_timeout(Duration.of_minutes(1))
    self.assertEqual(cfg.get_alignment_timeout(), Duration.of_minutes(1))
Example #2
0
    def test_file_source(self):
        """Build a continuously-monitoring FileSource and verify its settings
        and recorded input paths via Java reflection."""
        text_format = StreamFormat.text_line_format()
        input_files = ["/tmp/1.txt", "/tmp/2.txt"]

        # Configure the builder step by step instead of one chained expression.
        builder = FileSource.for_record_stream_format(text_format, *input_files)
        builder = builder.monitor_continuously(Duration.of_days(1))
        builder = builder.set_file_enumerator(
            FileEnumeratorProvider.default_splittable_file_enumerator())
        builder = builder.set_split_assigner(
            FileSplitAssignerProvider.locality_aware_split_assigner())
        file_source = builder.build()

        # The continuous-enumeration settings must carry the discovery interval.
        settings = file_source.get_java_function(
        ).getContinuousEnumerationSettings()
        self.assertIsNotNone(settings)
        self.assertEqual(Duration.of_days(1),
                         Duration(settings.getDiscoveryInterval()))

        # Reach into the private "inputPaths" field to check the recorded paths.
        input_paths_field = load_java_class(
            "org.apache.flink.connector.file.src.AbstractFileSource"
        ).getDeclaredField("inputPaths")
        input_paths_field.setAccessible(True)
        recorded = input_paths_field.get(file_source.get_java_function())
        self.assertEqual(len(recorded), len(input_files))
        for expected, actual in zip(input_files, recorded):
            self.assertEqual(str(actual), expected)
Example #3
0
def demo01():
    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection(
        collection=[(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))
    # 给Event添加水位
    # 1.内置水位生成策略
    # 1.1 延迟生成水印: 延迟10s
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(10))
    # 1.2 单调递增生成水印:这个也就是相当于上述的延迟策略去掉了延迟时间,以event中的时间戳充当了水印。
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps()
    # 2. event时间的获取
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps(
    ).with_timestamp_assigner(MyTimestampAssigner())
    """
    在某些情况下,由于数据产生的比较少,导致一段时间内没有数据产生,进而就没有水印的生成,导致下游依赖水印的一些操作就会出现问题,比如某一个算子的上游有多个算子,
    这种情况下,水印是取其上游两个算子的较小值,如果上游某一个算子因为缺少数据迟迟没有生成水印,就会出现eventtime倾斜问题,导致下游没法触发计算。

    所以filnk通过WatermarkStrategy.withIdleness()方法允许用户在配置的时间内(即超时时间内)没有记录到达时将一个流标记为空闲。这样就意味着下游的数据不需要等待水印的到来。

    当下次有水印生成并发射到下游的时候,这个数据流重新变成活跃状态。
    """
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(10)).with_idleness(Duration.of_seconds(30))
    ds.assign_timestamps_and_watermarks(watermark_strategy)

    ds.print()
Example #4
0
def event_timer_timer_demo():
    """Keyed event-time processing demo: assign watermarks to a small
    collection, key by name, and run a process function over each key."""
    env = StreamExecutionEnvironment.get_execution_environment()

    records = [
        (1000, 'Alice', 110.1),
        (4000, 'Bob', 30.2),
        (3000, 'Alice', 20.0),
        (2000, 'Bob', 53.1),
        (5000, 'Alice', 13.1),
        (3000, 'Bob', 3.1),
        (7000, 'Bob', 16.1),
        (10000, 'Alice', 20.1),
    ]
    source = env.from_collection(
        collection=records,
        type_info=Types.TUPLE([Types.LONG(), Types.STRING(), Types.FLOAT()]))

    # Tolerate up to 2 seconds of out-of-order events; timestamps come from
    # the custom assigner.
    strategy = WatermarkStrategy \
        .for_bounded_out_of_orderness(Duration.of_seconds(2)) \
        .with_timestamp_assigner(MyTimestampAssigner())
    timed = source.assign_timestamps_and_watermarks(strategy)

    # Key by the name field and apply the process function on the keyed stream.
    timed.key_by(lambda record: record[1]).process(Sum()).print()

    # Submit the job for execution.
    env.execute()
Example #5
0
def python_data_stream_example():
    """Consume JSON rows from Kafka, process them on a keyed stream with
    event-time timers, and write the results back to Kafka."""
    env = StreamExecutionEnvironment.get_execution_environment()
    # Parallelism 1 ensures one worker handles both fired-timer output and
    # normal records, so the collected result stays ordered — which makes the
    # downstream assertions reliable.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    row_type = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    deserialization_schema = \
        JsonRowDeserializationSchema.builder().type_info(row_type).build()
    properties = {'bootstrap.servers': 'localhost:9092',
                  'group.id': 'pyflink-e2e-source'}

    consumer = FlinkKafkaConsumer(
        "timer-stream-source", deserialization_schema, properties)
    consumer.set_start_from_earliest()
    producer = FlinkKafkaProducer(
        "timer-stream-sink", SimpleStringSchema(), properties)

    # Allow 5 seconds of out-of-orderness; timestamps come from the Kafka rows.
    strategy = WatermarkStrategy \
        .for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    stream = env.add_source(consumer).assign_timestamps_and_watermarks(strategy)
    keyed = stream.key_by(MyKeySelector(), key_type_info=Types.LONG())
    keyed.process(MyProcessFunction(), output_type=Types.STRING()) \
         .add_sink(producer)
    env.execute_async("test data stream timer")
Example #6
0
 def test_with_watermark_alignment(self):
     """Watermark-alignment settings should be propagated to the Java side."""
     jvm = get_gateway().jvm
     strategy = WatermarkStrategy.no_watermarks().with_watermark_alignment(
         "alignment-group-1", Duration.of_seconds(20),
         Duration.of_seconds(10))
     j_strategy = strategy._j_watermark_strategy
     aligned_class = (jvm.org.apache.flink.api.common.
                      eventtime.WatermarksWithWatermarkAlignment)
     self.assertTrue(is_instance_of(j_strategy, aligned_class))
     params = j_strategy.getAlignmentParameters()
     # Drift and update interval are reported in milliseconds.
     self.assertEqual("alignment-group-1", params.getWatermarkGroup())
     self.assertEqual(20000, params.getMaxAllowedWatermarkDrift())
     self.assertEqual(10000, params.getUpdateInterval())
Example #7
0
    def get_alignment_timeout(self) -> 'Duration':
        """
        Return the alignment timeout configured through
        :func:`set_alignment_timeout` or the
        ``org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions#ALIGNMENT_TIMEOUT``
        option.

        :return: the alignment timeout.
        """
        j_timeout = self._j_checkpoint_config.getAlignmentTimeout()
        return Duration(j_timeout)
Example #8
0
 def test_with_idleness(self):
     """with_idleness should wrap the strategy and record the timeout."""
     jvm = get_gateway().jvm
     strategy = WatermarkStrategy.no_watermarks().with_idleness(
         Duration.of_seconds(5))
     j_strategy = strategy._j_watermark_strategy
     idleness_class = (jvm.org.apache.flink.api.common.
                       eventtime.WatermarkStrategyWithIdleness)
     self.assertTrue(is_instance_of(j_strategy, idleness_class))
     # 5 seconds expressed in milliseconds.
     timeout = get_field_value(j_strategy, "idlenessTimeout").toMillis()
     self.assertEqual(5000, timeout)
Example #9
0
 def test_for_bounded_out_of_orderness(self):
     """The bound must reach the generated Java watermark generator."""
     jvm = get_gateway().jvm
     strategy = WatermarkStrategy.for_bounded_out_of_orderness(
         Duration.of_seconds(3))
     generator = strategy._j_watermark_strategy.createWatermarkGenerator(None)
     generator_class = (jvm.org.apache.flink.api.common.
                        eventtime.BoundedOutOfOrdernessWatermarks)
     self.assertTrue(is_instance_of(generator, generator_class))
     # 3 seconds expressed in milliseconds.
     self.assertEqual(
         3000, get_field_value(generator, "outOfOrdernessMillis"))
Example #10
0
    def test_pulsar_sink(self):
        """Build a fully-configured PulsarSink, attach it to a stream, and
        verify each builder setting via the sink's Java-side configuration
        and reflection into private fields."""
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))

        # Arbitrary boolean producer option used to test set_config round-trips.
        TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
        pulsar_sink = PulsarSink.builder() \
            .set_service_url('pulsar://localhost:6650') \
            .set_admin_url('http://localhost:8080') \
            .set_producer_name('fo') \
            .set_topics('ada') \
            .set_serialization_schema(
                PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
            .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
            .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
            .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
            .set_config(TEST_OPTION_NAME, True) \
            .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
            .build()

        ds.sink_to(pulsar_sink).name('pulsar sink')

        # NOTE(review): eval() on the execution-plan string works because the
        # plan is produced locally, but json.loads would be the safer parser.
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])
        # Pull the sink's private configuration holder via reflection.
        configuration = get_field_value(pulsar_sink.get_java_function(),
                                        "sinkConfiguration")
        # Service and admin URLs round-trip unchanged.
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.client.serviceUrl').string_type().
                no_default_value()._j_config_option),
            'pulsar://localhost:6650')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.admin.adminUrl').string_type().
                no_default_value()._j_config_option), 'http://localhost:8080')
        # Producer name 'fo' comes back as 'fo - %s' — presumably the connector
        # appends a placeholder suffix to the configured name; TODO confirm.
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.producer.producerName').string_type(
                ).no_default_value()._j_config_option), 'fo - %s')

        # The Flink SimpleStringSchema should be wrapped inside the Pulsar
        # serialization schema.
        j_pulsar_serialization_schema = get_field_value(
            pulsar_sink.get_java_function(), 'serializationSchema')
        j_serialization_schema = get_field_value(j_pulsar_serialization_schema,
                                                 'serializationSchema')
        self.assertTrue(
            is_instance_of(
                j_serialization_schema,
                'org.apache.flink.api.common.serialization.SimpleStringSchema')
        )

        # AT_LEAST_ONCE is stored under its string representation.
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.sink.deliveryGuarantee').string_type(
                ).no_default_value()._j_config_option), 'at-least-once')

        # ROUND_ROBIN routing materializes as a RoundRobinTopicRouter.
        j_topic_router = get_field_value(pulsar_sink.get_java_function(),
                                         "topicRouter")
        self.assertTrue(
            is_instance_of(
                j_topic_router,
                'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'
            ))

        # The fixed 12-second delay is stored in milliseconds.
        j_message_delayer = get_field_value(pulsar_sink.get_java_function(),
                                            'messageDelayer')
        delay_duration = get_field_value(j_message_delayer, 'delayDuration')
        self.assertEqual(delay_duration, 12000)

        # Both the single set_config option and the set_properties entry are
        # readable back with their typed accessors.
        test_option = ConfigOptions.key(
            TEST_OPTION_NAME).boolean_type().no_default_value()
        self.assertEqual(
            configuration.getBoolean(test_option._j_config_option), True)
        self.assertEqual(
            configuration.getLong(
                ConfigOptions.key('pulsar.producer.batchingMaxMessages').
                long_type().no_default_value()._j_config_option), 100)