Beispiel #1
0
    def test_primitive_array_type_info(self):
        """Send rows with a primitive float array through an identity map to the test sink.

        Each input record is an ``(int, [float, ...])`` pair typed as
        ``ROW(INT, PRIMITIVE_ARRAY(FLOAT))``. The results collected by the
        sink are compared with their expected string forms after sorting
        both sides, so the check does not depend on emission order.
        """

        def int_with_float_array_row():
            # Build a fresh type info per use: one for the source, one for the map output.
            return Types.ROW([Types.INT(), Types.PRIMITIVE_ARRAY(Types.FLOAT())])

        records = [
            (1, [1.1, 1.2, 1.30]),
            (2, [2.1, 2.2, 2.3]),
            (3, [3.1, 3.2, 3.3]),
        ]
        source = self.env.from_collection(records, type_info=int_with_float_array_row())
        mapped = source.map(lambda x: x, output_type=int_with_float_array_row())
        mapped.add_sink(self.test_sink)
        self.env.execute("test primitive array type info")

        collected = sorted(self.test_sink.get_results())
        wanted = sorted(['1,[1.1, 1.2, 1.3]', '2,[2.1, 2.2, 2.3]', '3,[3.1, 3.2, 3.3]'])
        self.assertEqual(wanted, collected)
    def test_json_row_serialization_deserialization_schema(self):
        """Deserialize JSON strings into rows and serialize them back, checking the output.

        Every case pairs an input JSON document with the JSON text expected
        after a deserialize -> serialize round trip through schemas built
        from the same named row type (``svt``, ``ops``, ``ids``). The
        expected strings show absent fields coming back as explicit
        ``null`` values and array whitespace being dropped.
        """
        # (input JSON, expected JSON after the round trip)
        cases = [
            (
                "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
                "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
            ),
            (
                "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
                "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
                "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
                "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
                "\"ids\":[1,2,3]}",
            ),
            (
                "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
                "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
            ),
        ]

        field_names = ["svt", "ops", "ids"]
        field_types = [
            Types.STRING(),
            Types.ROW_NAMED(['id'], [Types.STRING()]),
            Types.PRIMITIVE_ARRAY(Types.INT()),
        ]
        row_schema = Types.ROW_NAMED(field_names, field_types)

        serializer_builder = JsonRowSerializationSchema.builder()
        json_row_serialization_schema = serializer_builder.with_type_info(row_schema).build()
        deserializer_builder = JsonRowDeserializationSchema.builder()
        json_row_deserialization_schema = deserializer_builder.type_info(row_schema).build()

        # The test drives the wrapped Java schema objects directly.
        j_deserializer = json_row_deserialization_schema._j_deserialization_schema
        j_serializer = json_row_serialization_schema._j_serialization_schema

        for source_json, expected_json in cases:
            j_row = j_deserializer.deserialize(source_json.encode('utf-8'))
            round_tripped = str(j_serializer.serialize(j_row), encoding='utf-8')
            self.assertEqual(expected_json, round_tripped)
Beispiel #3
0
def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    """Build the basic-types ORC fixture.

    :return: A 3-tuple of the table ``RowType``, the matching named
        ``RowTypeInfo`` with the same field names in the same order, and a
        list holding one sample ``Row`` with a value for every field.
    """
    table_fields = [
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('bytes', DataTypes.BYTES()),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
    ]
    row_type = DataTypes.ROW(table_fields)

    # Field name paired with its type info, in the same order as the table fields above.
    named_type_infos = [
        ('char', Types.STRING()),
        ('varchar', Types.STRING()),
        ('bytes', Types.PRIMITIVE_ARRAY(Types.BYTE())),
        ('boolean', Types.BOOLEAN()),
        ('decimal', Types.BIG_DEC()),
        ('int', Types.INT()),
        ('bigint', Types.LONG()),
        ('double', Types.DOUBLE()),
        ('date', Types.SQL_DATE()),
        ('timestamp', Types.SQL_TIMESTAMP()),
    ]
    row_type_info = Types.ROW_NAMED(
        [field_name for field_name, _ in named_type_infos],
        [type_info for _, type_info in named_type_infos],
    )

    sample_row = Row(
        char='char',
        varchar='varchar',
        bytes=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=date(1970, 1, 1),
        timestamp=datetime(1970, 1, 2, 3, 4, 5, 600000),
    )
    return row_type, row_type_info, [sample_row]
Beispiel #4
0
def _create_parquet_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    """Build the basic-types Parquet fixture.

    :return: A 3-tuple of the table ``RowType``, the matching named
        ``RowTypeInfo`` with the same field names in the same order, and a
        list holding one sample ``Row`` with a value for every field.
    """
    table_fields = [
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('binary', DataTypes.BINARY(10)),
        DataTypes.FIELD('varbinary', DataTypes.VARBINARY(10)),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('time', DataTypes.TIME().bridged_to('java.sql.Time')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
        DataTypes.FIELD('timestamp_ltz', DataTypes.TIMESTAMP_LTZ(3)),
    ]
    row_type = DataTypes.ROW(table_fields)

    # Field name paired with its type info, in the same order as the table fields above.
    named_type_infos = [
        ('char', Types.STRING()),
        ('varchar', Types.STRING()),
        ('binary', Types.PRIMITIVE_ARRAY(Types.BYTE())),
        ('varbinary', Types.PRIMITIVE_ARRAY(Types.BYTE())),
        ('boolean', Types.BOOLEAN()),
        ('decimal', Types.BIG_DEC()),
        ('int', Types.INT()),
        ('bigint', Types.LONG()),
        ('double', Types.DOUBLE()),
        ('date', Types.SQL_DATE()),
        ('time', Types.SQL_TIME()),
        ('timestamp', Types.SQL_TIMESTAMP()),
        ('timestamp_ltz', Types.INSTANT()),
    ]
    row_type_info = Types.ROW_NAMED(
        [field_name for field_name, _ in named_type_infos],
        [type_info for _, type_info in named_type_infos],
    )

    utc_moment = datetime.datetime(1970, 2, 3, 4, 5, 6, 700000, tzinfo=pytz.timezone('UTC'))
    utc_epoch_seconds = calendar.timegm(utc_moment.utctimetuple())
    # timegm() applied to the local broken-down time of epoch 0 yields the local
    # zone's UTC offset (in seconds) at the epoch.
    # NOTE(review): presumably added to compensate for a local-time conversion
    # applied elsewhere - confirm against the consumer of this fixture.
    local_offset_seconds = calendar.timegm(time.localtime(0))
    epoch_millis = (
        (utc_epoch_seconds + local_offset_seconds) * 1000 + utc_moment.microsecond // 1000
    )
    timestamp_ltz = Instant.of_epoch_milli(epoch_millis)

    sample_row = Row(
        char='char',
        varchar='varchar',
        binary=b'binary',
        varbinary=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=datetime.date(1970, 1, 1),
        time=datetime.time(1, 1, 1),
        timestamp=datetime.datetime(1970, 1, 2, 3, 4, 5, 600000),
        timestamp_ltz=timestamp_ltz,
    )
    return row_type, row_type_info, [sample_row]
Beispiel #5
0
    def test_from_java_type(self):
        """Check that ``_from_java_type`` inverts ``get_java_type_info``.

        For each type info below, its Java type info is obtained and then
        converted back with ``_from_java_type``; the result must equal the
        original Python type info.
        """
        # Zero-argument factories, so each type info is only constructed right
        # before its own assertion, in the listed order.
        type_info_factories = (
            # Basic types.
            Types.INT,
            Types.SHORT,
            Types.LONG,
            Types.FLOAT,
            Types.DOUBLE,
            Types.CHAR,
            Types.BYTE,
            Types.BIG_INT,
            Types.BIG_DEC,
            Types.SQL_DATE,
            Types.SQL_TIME,
            Types.SQL_TIMESTAMP,
            # Composite and container types.
            lambda: Types.ROW([Types.INT(), Types.STRING()]),
            lambda: Types.TUPLE([Types.CHAR(), Types.INT()]),
            lambda: Types.PRIMITIVE_ARRAY(Types.INT()),
            lambda: Types.OBJECT_ARRAY(Types.SQL_DATE()),
            Types.PICKLED_BYTE_ARRAY,
            Types.SQL_DATE,
            lambda: Types.MAP(Types.INT(), Types.STRING()),
            lambda: Types.LIST(Types.INT()),
        )

        for make_type_info in type_info_factories:
            type_info = make_type_info()
            self.assertEqual(type_info, _from_java_type(type_info.get_java_type_info()))