# Shared imports for the demos in this listing (connector import paths may differ
# slightly between PyFlink releases; newer versions also expose these classes under
# pyflink.datastream.connectors.file_system and .number_seq):
from pyflink.common import Row, Types, WatermarkStrategy
from pyflink.common.serialization import Encoder
from pyflink.datastream import RuntimeExecutionMode, StreamExecutionEnvironment
from pyflink.datastream.connectors import (FileSink, FileSource, NumberSequenceSource,
                                           OutputFileConfig, StreamFormat)


def data_stream_word_count_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    input_path = '/opt/examples/datastream/input/word_count_input'
    output_path = '/opt/examples/datastream/output/data_stream_word_count'

    file_source = FileSource\
        .for_record_stream_format(
            StreamFormat.text_line_format(),
            input_path) \
        .process_static_file_set() \
        .build()

    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=file_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.STRING())

    # each input line is treated as one word; count occurrences per distinct line
    ds.map(lambda a: Row(a, 1), output_type=Types.ROW([Types.STRING(), Types.INT()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('9-data_stream_word_count')


def state_access_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 10)

    output_path = '/opt/examples/datastream/output/state_access'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .map(MyMapFunction(), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .process(MyKeyedProcessFunction(), Types.LONG()) \
        .sink_to(file_sink)

    env.execute('11-data_stream_state_access')
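

# The pipeline above references MyMapFunction and MyKeyedProcessFunction, which the
# listing does not define. Below is a minimal, hypothetical sketch of what they could
# look like: MyMapFunction keeps a per-key running count in ValueState, and
# MyKeyedProcessFunction emits the largest count it has seen for the current key.
from pyflink.datastream.functions import KeyedProcessFunction, MapFunction, RuntimeContext
from pyflink.datastream.state import ValueStateDescriptor


class MyMapFunction(MapFunction):

    def open(self, runtime_context: RuntimeContext):
        # keyed ValueState holding the running count for the current key
        self.counter_state = runtime_context.get_state(
            ValueStateDescriptor('counter', Types.LONG()))

    def map(self, value):
        current = self.counter_state.value() or 0
        current += value[1]
        self.counter_state.update(current)
        return Row(value[0], current)


class MyKeyedProcessFunction(KeyedProcessFunction):

    def open(self, runtime_context: RuntimeContext):
        # keyed ValueState remembering the largest count observed so far
        self.max_state = runtime_context.get_state(
            ValueStateDescriptor('max_count', Types.LONG()))

    def process_element(self, value, ctx):
        current_max = self.max_state.value()
        if current_max is None or value[1] > current_max:
            current_max = value[1]
            self.max_state.update(current_max)
        yield current_max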


def batch_seq_num_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 1000)

    output_path = '/opt/examples/output/batch_seq_num'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('9-data_stream_batch_seq_num')
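

# A minimal entry point for the demos above (hypothetical; the original listing does
# not show how these functions are invoked):
if __name__ == '__main__':
    data_stream_word_count_demo()
    state_access_demo()
    batch_seq_num_test()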
Example #4
    def test_stream_file_sink(self):
        self.env.set_parallelism(2)
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=Types.ROW([Types.STRING(), Types.INT()]))
        # roll part files at 1 GiB, every 15 minutes, or after 5 minutes of inactivity
        sink = StreamingFileSink \
            .for_row_format(self.tempdir, Encoder.simple_string_encoder()) \
            .with_rolling_policy(
                RollingPolicy.default_rolling_policy(
                    part_size=1024 * 1024 * 1024,
                    rollover_interval=15 * 60 * 1000,
                    inactivity_interval=5 * 60 * 1000)) \
            .with_output_file_config(
                OutputFileConfig.OutputFileConfigBuilder()
                .with_part_prefix("prefix")
                .with_part_suffix("suffix")
                .build()) \
            .build()

        ds.map(lambda a: a[0], Types.STRING()).add_sink(sink)

        self.env.execute("test_streaming_file_sink")

        results = []
        import os
        for root, dirs, files in os.walk(self.tempdir, topdown=True):
            for file in files:
                # in-progress/pending part files are hidden, hence the leading '.'
                # before the configured "prefix"
                self.assertTrue(file.startswith('.prefix'))
                self.assertTrue('suffix' in file)
                path = root + "/" + file
                with open(path) as infile:
                    for line in infile:
                        results.append(line)

        expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
Example #5
    def test_file_sink(self):
        base_path = "/tmp/1.txt"
        encoder = Encoder.simple_string_encoder()
        file_sink_builder = FileSink.for_row_format(base_path, encoder)
        file_sink = file_sink_builder\
            .with_bucket_check_interval(1000) \
            .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
            .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
            .with_output_file_config(
                OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build())\
            .build()

        # inspect the underlying Java FileSink via reflection to verify the builder settings
        buckets_builder_field = \
            load_java_class("org.apache.flink.connector.file.sink.FileSink"). \
            getDeclaredField("bucketsBuilder")
        buckets_builder_field.setAccessible(True)
        buckets_builder = buckets_builder_field.get(
            file_sink.get_java_function())

        self.assertEqual("DefaultRowFormatBuilder",
                         buckets_builder.getClass().getSimpleName())

        row_format_builder_clz = load_java_class(
            "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")
        encoder_field = row_format_builder_clz.getDeclaredField("encoder")
        encoder_field.setAccessible(True)
        self.assertEqual(
            "SimpleStringEncoder",
            encoder_field.get(buckets_builder).getClass().getSimpleName())

        interval_field = row_format_builder_clz.getDeclaredField(
            "bucketCheckInterval")
        interval_field.setAccessible(True)
        self.assertEqual(1000, interval_field.get(buckets_builder))

        bucket_assigner_field = row_format_builder_clz.getDeclaredField(
            "bucketAssigner")
        bucket_assigner_field.setAccessible(True)
        self.assertEqual(
            "BasePathBucketAssigner",
            bucket_assigner_field.get(
                buckets_builder).getClass().getSimpleName())

        rolling_policy_field = row_format_builder_clz.getDeclaredField(
            "rollingPolicy")
        rolling_policy_field.setAccessible(True)
        self.assertEqual(
            "OnCheckpointRollingPolicy",
            rolling_policy_field.get(
                buckets_builder).getClass().getSimpleName())

        output_file_config_field = row_format_builder_clz.getDeclaredField(
            "outputFileConfig")
        output_file_config_field.setAccessible(True)
        output_file_config = output_file_config_field.get(buckets_builder)
        self.assertEqual("pre", output_file_config.getPartPrefix())
        self.assertEqual("suf", output_file_config.getPartSuffix())
Example #6
import json

import requests
# `datasets` is assumed to come from the `river` online-ML library, which ships
# the CreditCard fraud dataset used below
from river import datasets

from pyflink.common import Types
from pyflink.common.serialization import Encoder
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FileSink


def run_consumer(output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    # write all the data to one file
    env.set_parallelism(1)

    # get the credit card data
    dataset = datasets.CreditCard()

    # create a small collection of items
    i = 0
    num_of_items = 2000
    items = []
    for x, y in dataset:
        if i == num_of_items:
            break
        i += 1
        # cast the label to a string so it matches the declared ROW([STRING, STRING]) type
        items.append((json.dumps(x), str(y)))

    credit_stream = env.from_collection(collection=items,
                                        type_info=Types.ROW(
                                            [Types.STRING(),
                                             Types.STRING()]))

    # detect fraud in transactions
    fraud_data = credit_stream.map(
        lambda data: json.dumps(
            requests.post('http://localhost:9000/predict',
                          json={'x': data[0], 'y': data[1]}).json()),
        output_type=Types.STRING())

    # save the results to a file
    fraud_data.sink_to(sink=FileSink.for_row_format(
        base_path=output_path,
        encoder=Encoder.simple_string_encoder()).build())

    # submit for execution
    env.execute()
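
# Example invocation (hypothetical path): run_consumer() assumes a model-serving
# endpoint is already listening on http://localhost:9000/predict and writes the
# returned JSON predictions to the given output directory.
if __name__ == '__main__':
    run_consumer('/opt/examples/output/fraud_detection')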
Example #7
    def test_file_sink(self):
        base_path = "/tmp/1.txt"
        encoder = Encoder.simple_string_encoder()
        file_sink_builder = FileSink.for_row_format(base_path, encoder)
        # configure bucketing, rolling, and file naming, and enable automatic compaction
        # of small part files: compact every 3 checkpoints or once 1024 bytes accumulate,
        # using 2 compactor threads and concatenating parts with a '\n' delimiter
        file_sink = file_sink_builder\
            .with_bucket_check_interval(1000) \
            .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
            .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
            .with_output_file_config(
                OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build())\
            .enable_compact(FileCompactStrategy.builder()
                            .enable_compaction_on_checkpoint(3)
                            .set_size_threshold(1024)
                            .set_num_compact_threads(2)
                            .build(),
                            FileCompactor.concat_file_compactor(b'\n')) \
            .build()

        # inspect the underlying Java FileSink via reflection to verify the builder settings
        buckets_builder_field = \
            load_java_class("org.apache.flink.connector.file.sink.FileSink"). \
            getDeclaredField("bucketsBuilder")
        buckets_builder_field.setAccessible(True)
        buckets_builder = buckets_builder_field.get(
            file_sink.get_java_function())

        self.assertEqual("DefaultRowFormatBuilder",
                         buckets_builder.getClass().getSimpleName())

        row_format_builder_clz = load_java_class(
            "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")
        encoder_field = row_format_builder_clz.getDeclaredField("encoder")
        encoder_field.setAccessible(True)
        self.assertEqual(
            "SimpleStringEncoder",
            encoder_field.get(buckets_builder).getClass().getSimpleName())

        interval_field = row_format_builder_clz.getDeclaredField(
            "bucketCheckInterval")
        interval_field.setAccessible(True)
        self.assertEqual(1000, interval_field.get(buckets_builder))

        bucket_assigner_field = row_format_builder_clz.getDeclaredField(
            "bucketAssigner")
        bucket_assigner_field.setAccessible(True)
        self.assertEqual(
            "BasePathBucketAssigner",
            bucket_assigner_field.get(
                buckets_builder).getClass().getSimpleName())

        rolling_policy_field = row_format_builder_clz.getDeclaredField(
            "rollingPolicy")
        rolling_policy_field.setAccessible(True)
        self.assertEqual(
            "OnCheckpointRollingPolicy",
            rolling_policy_field.get(
                buckets_builder).getClass().getSimpleName())

        output_file_config_field = row_format_builder_clz.getDeclaredField(
            "outputFileConfig")
        output_file_config_field.setAccessible(True)
        output_file_config = output_file_config_field.get(buckets_builder)
        self.assertEqual("pre", output_file_config.getPartPrefix())
        self.assertEqual("suf", output_file_config.getPartSuffix())

        compact_strategy_field = row_format_builder_clz.getDeclaredField(
            "compactStrategy")
        compact_strategy_field.setAccessible(True)
        compact_strategy = compact_strategy_field.get(buckets_builder)
        self.assertEqual(3,
                         compact_strategy.getNumCheckpointsBeforeCompaction())
        self.assertEqual(1024, compact_strategy.getSizeThreshold())
        self.assertEqual(2, compact_strategy.getNumCompactThreads())

        file_compactor_field = row_format_builder_clz.getDeclaredField(
            "fileCompactor")
        file_compactor_field.setAccessible(True)
        file_compactor = file_compactor_field.get(buckets_builder)
        self.assertEqual("ConcatFileCompactor",
                         file_compactor.getClass().getSimpleName())
        concat_file_compactor_clz = load_java_class(
            "org.apache.flink.connector.file.sink.compactor.ConcatFileCompactor"
        )
        file_delimiter_field = concat_file_compactor_clz.getDeclaredField(
            "fileDelimiter")
        file_delimiter_field.setAccessible(True)
        file_delimiter = file_delimiter_field.get(file_compactor)
        self.assertEqual(b'\n', file_delimiter)