Beispiel #1
0
    def test_transform_struct_map_array_nested_type(self) -> None:
        """Parse a struct that holds a map-of-arrays member and an array member."""
        hive_type = 'struct<nest1:map<string,array<int>>,nest2:array<string>>'
        column = ColumnMetadata('col1', None, hive_type, 0)
        column.set_column_key(self.column_key)

        # Build the expected tree top-down so every child can name its parent.
        expected = StructTypeMetadata(name='col1', parent=column, type_str=hive_type)
        nest1_map = MapTypeMetadata(
            name='nest1', parent=expected, type_str='map<string,array<int>>')
        nest1_key = ScalarTypeMetadata(
            name='_map_key', parent=nest1_map, type_str='string')
        nest1_value = ArrayTypeMetadata(
            name='_map_value', parent=nest1_map, type_str='array<int>')
        nest2_array = ArrayTypeMetadata(
            name='nest2', parent=expected, type_str='array<string>')

        # Both expected arrays hold scalars and are left without an inner
        # type, so only the map needs its key/value wired up.
        nest1_map.map_key_type = nest1_key
        nest1_map.map_value_type = nest1_value

        # Struct members are numbered in declaration order.
        nest1_map.sort_order = 0
        nest2_array.sort_order = 1
        expected.struct_items = {'nest1': nest1_map, 'nest2': nest2_array}

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Beispiel #2
0
    def test_transform_map_struct_nested_type(self) -> None:
        """Parse a map whose values are structs of two int members."""
        hive_type = 'map<string,struct<nest1:int,nest2:int>>'
        column = ColumnMetadata('col1', None, hive_type, 0)
        column.set_column_key(self.column_key)

        # Expected tree: map -> (string key, struct value -> two int scalars).
        expected = MapTypeMetadata(name='col1', parent=column, type_str=hive_type)
        key_type = ScalarTypeMetadata(
            name='_map_key', parent=expected, type_str='string')
        value_struct = StructTypeMetadata(
            name='_map_value', parent=expected, type_str='struct<nest1:int,nest2:int>')
        nest1 = ScalarTypeMetadata(name='nest1', parent=value_struct, type_str='int')
        nest2 = ScalarTypeMetadata(name='nest2', parent=value_struct, type_str='int')

        # Struct members are numbered in declaration order.
        nest1.sort_order = 0
        nest2.sort_order = 1
        value_struct.struct_items = {'nest1': nest1, 'nest2': nest2}

        expected.map_key_type = key_type
        expected.map_value_type = value_struct

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Beispiel #3
0
    def test_transform_non_alpha_only_types(self) -> None:
        """Parse scalar type names containing digits, parentheses, spaces or underscores."""
        hive_type = ('struct<nest1:decimal(10,2),nest2:double precision,'
                     'nest3:varchar(32),nest4:map<varchar(32),decimal(10,2)>,'
                     'nest5:interval_day_time>')
        column = ColumnMetadata('col1', None, hive_type, 0)
        column.set_column_key(self.column_key)

        expected = StructTypeMetadata(name='col1', parent=column, type_str=hive_type)
        nest1 = ScalarTypeMetadata(
            name='nest1', parent=expected, type_str='decimal(10,2)')
        nest2 = ScalarTypeMetadata(
            name='nest2', parent=expected, type_str='double precision')
        nest3 = ScalarTypeMetadata(
            name='nest3', parent=expected, type_str='varchar(32)')
        nest4 = MapTypeMetadata(
            name='nest4', parent=expected, type_str='map<varchar(32),decimal(10,2)>')
        nest4_key = ScalarTypeMetadata(
            name='_map_key', parent=nest4, type_str='varchar(32)')
        nest4_value = ScalarTypeMetadata(
            name='_map_value', parent=nest4, type_str='decimal(10,2)')
        nest5 = ScalarTypeMetadata(
            name='nest5', parent=expected, type_str='interval_day_time')

        nest4.map_key_type = nest4_key
        nest4.map_value_type = nest4_value

        # Struct members are numbered 0..4 in declaration order.
        members = [nest1, nest2, nest3, nest4, nest5]
        for position, member in enumerate(members):
            member.sort_order = position
        expected.struct_items = {
            'nest1': nest1,
            'nest2': nest2,
            'nest3': nest3,
            'nest4': nest4,
            'nest5': nest5,
        }

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Beispiel #4
0
    def test_transform_map_type(self) -> None:
        """Parse a map whose values are themselves maps of string to int."""
        hive_type = 'map<string,map<string,int>>'
        column = ColumnMetadata('col1', None, hive_type, 0)
        column.set_column_key(self.column_key)

        # Outer map and its two direct children.
        expected = MapTypeMetadata(name='col1', parent=column, type_str=hive_type)
        outer_key = ScalarTypeMetadata(
            name='_map_key', parent=expected, type_str='string')
        outer_value = MapTypeMetadata(
            name='_map_value', parent=expected, type_str='map<string,int>')

        # Inner map (the outer map's value) and its children.
        inner_key = ScalarTypeMetadata(
            name='_map_key', parent=outer_value, type_str='string')
        inner_value = ScalarTypeMetadata(
            name='_map_value', parent=outer_value, type_str='int')

        outer_value.map_key_type = inner_key
        outer_value.map_value_type = inner_value
        expected.map_key_type = outer_key
        expected.map_value_type = outer_value

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Beispiel #5
0
def parse_hive_type(type_str: str, name: str, parent: Union[ColumnMetadata, TypeMetadata]) -> TypeMetadata:
    """
    Convert a Hive type string into a tree of TypeMetadata objects.

    The type string is lower-cased and parsed with the ``complex_type``
    grammar (defined elsewhere in this module); nested types are then
    converted by calling this function recursively.

    :param type_str: Hive type string, e.g. ``map<string,array<int>>``
    :param name: name given to the metadata object created for this type
    :param parent: column or type metadata object that owns the new object
    :return: a scalar, array, map or struct type metadata object
    :raises Exception: if the parse result is not flagged as scalar, array,
        map or struct
    """
    type_str = type_str.lower()
    parsed_type = complex_type.parseString(type_str, parseAll=True)

    if parsed_type.scalar_type:
        return ScalarTypeMetadata(name=name, parent=parent, type_str=type_str)

    results = parsed_type[0]

    if parsed_type.array_type:
        array_node = ArrayTypeMetadata(name=name, parent=parent, type_str=type_str)
        element_type = parse_hive_type(results.type, '_inner_', array_node)
        # A scalar element type is parsed but not attached to the array;
        # only nested (non-scalar) element types are kept.
        if not isinstance(element_type, ScalarTypeMetadata):
            array_node.array_inner_type = element_type
        return array_node

    if parsed_type.map_type:
        map_node = MapTypeMetadata(name=name, parent=parent, type_str=type_str)
        map_node.map_key_type = parse_hive_type(results.key, '_map_key', map_node)
        map_node.map_value_type = parse_hive_type(results.type, '_map_value', map_node)
        return map_node

    if not parsed_type.struct_type:
        raise Exception(f"Unrecognized type: {type_str}")

    struct_node = StructTypeMetadata(name=name, parent=parent, type_str=type_str)
    members = {}
    for position, field in enumerate(results):
        member = parse_hive_type(field.type, field.name, struct_node)
        # Record the position of the member within the struct declaration.
        member.sort_order = position
        members[field.name] = member

    struct_node.struct_items = members
    return struct_node
Beispiel #6
0
    def test_transform_array_map_nested_type(self) -> None:
        """Parse an array whose elements are maps of string to int."""
        hive_type = 'array<map<string,int>>'
        column = ColumnMetadata('col1', None, hive_type, 0)
        column.set_column_key(self.column_key)

        # Expected tree: array -> map element -> (string key, int value).
        expected = ArrayTypeMetadata(name='col1', parent=column, type_str=hive_type)
        element_map = MapTypeMetadata(
            name='_inner_', parent=expected, type_str='map<string,int>')
        element_key = ScalarTypeMetadata(
            name='_map_key', parent=element_map, type_str='string')
        element_value = ScalarTypeMetadata(
            name='_map_value', parent=element_map, type_str='int')

        element_map.map_key_type = element_key
        element_map.map_value_type = element_value
        expected.array_inner_type = element_map

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Beispiel #7
0
    def test_serialize_map_struct_type_metadata(self) -> None:
        """Serialize a map-of-struct type tree and check its nodes and relations."""
        hive_type = 'map<string,struct<c1:map<string,string>,c2:string>>'
        column = ColumnMetadata('col1', None, hive_type, 0)
        column.set_column_key(self.column_key)

        # Type tree: map -> (string key, struct value -> (c1 map, c2 scalar)).
        root_map = MapTypeMetadata(name='col1', parent=column, type_str=hive_type)
        root_key = ScalarTypeMetadata(
            name='_map_key', parent=root_map, type_str='string')
        value_struct = StructTypeMetadata(
            name='_map_value',
            parent=root_map,
            type_str='struct<c1:map<string,string>,c2:string>')
        c1_map = MapTypeMetadata(
            name='c1', parent=value_struct, type_str='map<string,string>')
        c1_key = ScalarTypeMetadata(
            name='_map_key', parent=c1_map, type_str='string')
        c1_value = ScalarTypeMetadata(
            name='_map_value', parent=c1_map, type_str='string')
        c2_scalar = ScalarTypeMetadata(
            name='c2', parent=value_struct, type_str='string')

        c1_map.map_key_type = c1_key
        c1_map.map_value_type = c1_value
        c1_map.sort_order = 0
        c2_scalar.sort_order = 1
        value_struct.struct_items = {'c1': c1_map, 'c2': c2_scalar}
        root_map.map_key_type = root_key
        root_map.map_value_type = value_struct

        expected_nodes = [
            {
                'kind': 'map',
                'name': 'col1',
                'data_type': 'map<string,struct<c1:map<string,string>,c2:string>>',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
            },
            {
                'kind': 'scalar',
                'name': '_map_key',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_key',
            },
            {
                'kind': 'struct',
                'name': '_map_value',
                'data_type': 'struct<c1:map<string,string>,c2:string>',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value',
            },
            {
                'kind': 'map',
                'name': 'c1',
                'data_type': 'map<string,string>',
                'sort_order:UNQUOTED': 0,
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1',
            },
            {
                'kind': 'scalar',
                'name': '_map_key',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1/_map_key',
            },
            {
                'kind': 'scalar',
                'name': '_map_value',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1/_map_value',
            },
            {
                'kind': 'scalar',
                'name': 'c2',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'sort_order:UNQUOTED': 1,
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c2',
            },
        ]
        expected_rels = [
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Column',
                'TYPE': 'TYPE_METADATA',
                'REVERSE_TYPE': 'TYPE_METADATA_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_key',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1/_map_key',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1/_map_value',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value/c2',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_map_value',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
        ]

        # Drain every node first; the loop stops at the first falsy row.
        serialized_nodes = []
        while True:
            node = root_map.next_node()
            if not node:
                break
            serialized_nodes.append(neo4_serializer.serialize_node(node))
        # Compared position by position against the expected list only, so
        # rows serialized beyond the expected ones are not inspected here.
        for position, expected_node in enumerate(expected_nodes):
            self.assertEqual(serialized_nodes[position], expected_node)

        # Same procedure for the relationships.
        serialized_rels = []
        while True:
            relation = root_map.next_relation()
            if not relation:
                break
            serialized_rels.append(neo4_serializer.serialize_relationship(relation))
        for position, expected_rel in enumerate(expected_rels):
            self.assertEqual(serialized_rels[position], expected_rel)
Beispiel #8
0
    def test_serialize_array_map_type_metadata(self) -> None:
        """Serialize an array-of-map type tree and check its nodes and relations."""
        hive_type = 'array<map<string,array<string>>>'
        column = ColumnMetadata('col1', None, hive_type, 0)
        column.set_column_key(self.column_key)

        # Type tree: array -> map -> (string key, array value -> string).
        root_array = ArrayTypeMetadata(name='col1', parent=column, type_str=hive_type)
        element_map = MapTypeMetadata(
            name='_inner_', parent=root_array, type_str='map<string,array<string>>')
        element_key = ScalarTypeMetadata(
            name='_map_key', parent=element_map, type_str='string')
        value_array = ArrayTypeMetadata(
            name='_map_value', parent=element_map, type_str='array<string>')
        value_element = ScalarTypeMetadata(
            name='_inner_', parent=value_array, type_str='string')

        value_array.array_inner_type = value_element
        element_map.map_key_type = element_key
        element_map.map_value_type = value_array
        root_array.array_inner_type = element_map

        expected_nodes = [
            {
                'kind': 'array',
                'data_type': 'array<map<string,array<string>>>',
                'LABEL': 'Type_Metadata',
                'name': 'col1',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
            },
            {
                'kind': 'map',
                'data_type': 'map<string,array<string>>',
                'LABEL': 'Type_Metadata',
                'name': '_inner_',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
            },
            {
                'kind': 'scalar',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'name': '_map_key',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_key',
            },
            {
                'kind': 'array',
                'data_type': 'array<string>',
                'LABEL': 'Type_Metadata',
                'name': '_map_value',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_value',
            },
        ]
        expected_rels = [
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Column',
                'TYPE': 'TYPE_METADATA',
                'REVERSE_TYPE': 'TYPE_METADATA_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_key',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_value',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF',
            },
        ]

        # Drain every node first; the loop stops at the first falsy row.
        serialized_nodes = []
        while True:
            node = root_array.next_node()
            if not node:
                break
            serialized_nodes.append(neo4_serializer.serialize_node(node))
        # Compared position by position against the expected list only, so
        # rows serialized beyond the expected ones are not inspected here.
        for position, expected_node in enumerate(expected_nodes):
            self.assertEqual(serialized_nodes[position], expected_node)

        # Same procedure for the relationships.
        serialized_rels = []
        while True:
            relation = root_array.next_relation()
            if not relation:
                break
            serialized_rels.append(neo4_serializer.serialize_relationship(relation))
        for position, expected_rel in enumerate(expected_rels):
            self.assertEqual(serialized_rels[position], expected_rel)