Example #1
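A test helper that serializes a Row through a KafkaRecordSerializationSchema configured with a per-record topic selector and a JSON value serializer, then asserts on the topic, key and value of the resulting record.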
        # Test helper (note: _select, MockDataStream and feed() are defined in the
        # enclosing test code and are not shown in this snippet).
        def _check_record(data, topic, serialized_data):
            input_type = Types.ROW([Types.STRING()])

            # The target topic is chosen per record by the _select callable;
            # values are serialized as JSON rows.
            serialization_schema = KafkaRecordSerializationSchema.builder() \
                .set_topic_selector(_select) \
                .set_value_serialization_schema(
                    JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
                .build()
            # Open the wrapped Java serialization schema directly through the
            # Py4J gateway, using dummy initialization and sink contexts.
            jvm = get_gateway().jvm
            serialization_schema._j_serialization_schema.open(
                jvm.org.apache.flink.connector.testutils.formats.
                DummyInitializationContext(),
                jvm.org.apache.flink.connector.kafka.sink.
                DefaultKafkaSinkContext(0, 1, jvm.java.util.Properties()))
            sink = KafkaSink.builder() \
                .set_bootstrap_servers('localhost:9092') \
                .set_record_serializer(serialization_schema) \
                .build()

            ds = MockDataStream(Types.ROW([Types.STRING()]))
            ds.sink_to(sink)
            row = Row(data)
            topic_row = ds.feed(row)  # type: Row
            # Convert the Python Row to a Java data structure, serialize it,
            # then assert on topic, key and serialized value.
            j_record = serialization_schema._j_serialization_schema.serialize(
                to_java_data_structure(topic_row), None, None)
            self.assertEqual(j_record.topic(), topic)
            self.assertIsNone(j_record.key())
            self.assertEqual(j_record.value(), serialized_data)
Example #2
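A helper that builds a streaming job writing ORC files: it converts Python Rows to Java data structures, creates a source from the resulting Java collection and sinks it to a bulk-format FileSink backed by OrcBulkWriters.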
def _build_orc_job(self, row_type: RowType, row_type_info: RowTypeInfo, data: List[Row]):
    jvm = get_gateway().jvm
    # Bulk-encoded FileSink that writes ORC files for the given row type.
    sink = FileSink.for_bulk_format(
        self.orc_dir_name, OrcBulkWriters.for_row_type(row_type)
    ).build()
    # Convert the Python Rows to Java data structures and collect them in a
    # Java ArrayList, then build the source from that collection.
    j_list = jvm.java.util.ArrayList()
    for d in data:
        j_list.add(to_java_data_structure(d))
    ds = DataStream(self.env._j_stream_execution_environment.fromCollection(
        j_list,
        row_type_info.get_java_type_info()
    ))
    ds.sink_to(sink)
Example #3
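Writes Rows to a local Parquet file by calling the Java ParquetRowDataBuilder writer factory directly, converting each Row to Flink's internal RowData with a RowRowConverter before handing it to the bulk writer.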
def _write_row_data_to_parquet_file(path: str, row_type: RowType, rows: List[Row]):
    jvm = get_gateway().jvm
    flink = jvm.org.apache.flink

    # Create a Parquet bulk writer over a local file via the Java API.
    j_output_stream = flink.core.fs.local.LocalDataOutputStream(jvm.java.io.File(path))
    j_bulk_writer = flink.formats.parquet.row.ParquetRowDataBuilder.createWriterFactory(
        _to_java_data_type(row_type).getLogicalType(),
        create_hadoop_configuration(Configuration()),
        True,
    ).create(j_output_stream)
    # RowRowConverter turns external row structures into Flink's internal RowData.
    row_row_converter = flink.table.data.conversion.RowRowConverter.create(
        _to_java_data_type(row_type)
    )
    row_row_converter.open(row_row_converter.getClass().getClassLoader())
    # Convert each Python Row to a Java row, then to internal RowData, and write it.
    for row in rows:
        j_bulk_writer.addElement(row_row_converter.toInternal(to_java_data_structure(row)))
    j_bulk_writer.finish()
Example #4
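A unit test that configures a KafkaRecordSerializationSchema with a fixed topic and JSON value serialization, then serializes a Row and checks the produced topic, key and JSON payload.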
    def test_set_topic(self):
        input_type = Types.ROW([Types.STRING()])

        # Fixed target topic with JSON value serialization.
        serialization_schema = KafkaRecordSerializationSchema.builder() \
            .set_topic('test-topic') \
            .set_value_serialization_schema(
                JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
            .build()
        # Open the wrapped Java schema through the Py4J gateway, using dummy
        # initialization and sink contexts.
        jvm = get_gateway().jvm
        serialization_schema._j_serialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.
            DummyInitializationContext(),
            jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
                0, 1, jvm.java.util.Properties()))

        # Serialize a single row and verify topic, key and JSON payload.
        j_record = serialization_schema._j_serialization_schema.serialize(
            to_java_data_structure(Row('test')), None, None)
        self.assertEqual(j_record.topic(), 'test-topic')
        self.assertIsNone(j_record.key())
        self.assertEqual(j_record.value(), b'{"f0":"test"}')