def __parse_attributes(self, attributes: bytes) -> Dict[str, str]:
    """
    Given the raw bytes that follow a tag name, parse zero or more
    name="value" / name='value' pairs into a dictionary.

    Parameters:
        attributes - The attribute portion of a node, still encoded in the
                     XML document's encoding.

    Returns:
        A dictionary keyed by the attribute name and whose values are
        unescaped strings. If no attributes exist, this returns an empty
        dictionary.

    Raises:
        UnicodeDecodeError - if an attribute name is not ASCII or a value
                             cannot be decoded with self.encoding.
    """
    attr_stream = InputStream(attributes)
    parsed_attrs: Dict[str, str] = {}
    state = 'space'
    attr = b''
    val = b''

    def unescape(value: bytes) -> str:
        text = value.decode(self.encoding)
        # '&amp;' must be decoded last, otherwise an escaped ampersand
        # such as '&amp;lt;' would be unescaped twice and become '<'.
        for entity, char in (
            ('&lt;', '<'),
            ('&gt;', '>'),
            ('&apos;', '\''),
            ('&quot;', '\"'),
            ('&#13;', '\r'),
            ('&#10;', '\n'),
            ('&amp;', '&'),
        ):
            text = text.replace(entity, char)
        return text

    while True:
        c = attr_stream.read_byte()
        if c is None:
            # End of input; an unterminated attribute is silently dropped.
            return parsed_attrs

        if state == 'space':
            # Between attributes, skip whitespace until a name starts.
            if not c.isspace():
                state = 'attr'
                attr = c
        elif state == 'attr':
            # Accumulate the name until the '=' separator.
            if c == b'=':
                attr = attr.strip()
                state = 'valstart'
            else:
                attr = attr + c
        elif state == 'valstart':
            # Wait for the opening quote, remembering which kind it was.
            if c == b'"':
                state = 'valdouble'
                val = b''
            elif c == b'\'':
                state = 'valsingle'
                val = b''
        elif state == 'valdouble':
            if c == b'"':
                state = 'space'
                parsed_attrs[attr.decode('ascii')] = unescape(val)
            else:
                val = val + c
        elif state == 'valsingle':
            if c == b'\'':
                state = 'space'
                parsed_attrs[attr.decode('ascii')] = unescape(val)
            else:
                val = val + c
def __init__(self, data: bytes, encoding: str) -> None:
    """
    Set up the XML decoder over a raw document.

    Parameters:
        data - The XML document, as bytes, which should be decoded
               into Nodes.
        encoding - The expected text encoding of the XML.
    """
    # Wrap the raw bytes in a stream for reading.
    self.stream = InputStream(data)
    self.encoding = encoding

    # Nothing has been parsed yet: no open nodes and no root.
    self.current: List[Node] = []
    self.root: Optional[Node] = None
def __init__(self, data: bytes, encoding: str) -> None:
    """
    Set up the binary decoder over a raw blob.

    Parameters:
        data - A binary blob of data to be decoded.
        encoding - A string naming the text encoding used for string
                   elements. Should be either 'shift-jis', 'euc-jp'
                   or 'utf-8'.
    """
    # Decoding has not been run on this instance yet.
    self.executed = False
    self.encoding = encoding
    self.stream = InputStream(data)
def __split_node(self, content: bytes) -> Tuple[bytes, bytes]:
    """
    Split the contents of a node into its tag name and its attributes.

    Parameters:
        content - The node contents to split.

    Returns:
        A (tag, attributes) tuple. The tag is everything up to the first
        whitespace byte; the attributes are everything from the first
        non-whitespace byte after that to the end of the content. Either
        may be empty.
    """
    reader = InputStream(content)
    name = b''
    rest = b''
    mode = "tag"

    byte = reader.read_byte()
    while byte is not None:
        if mode == "attributes":
            # Once attributes have started, everything belongs to them.
            rest += byte
        elif byte.isspace():
            # Whitespace ends the tag and is skipped before attributes.
            mode = "space"
        elif mode == "tag":
            name += byte
        else:
            # First non-whitespace byte after the tag starts the attributes.
            rest = byte
            mode = "attributes"
        byte = reader.read_byte()

    return (name, rest)
class XmlDecoder:
    """
    A hand-rolled XML parser, suitable for parsing old-style XML documents
    in game data or from legacy game traffic. I did consider using lxml and
    other data stores, but they insist on mangling data inside binary/string
    blobs making them unsuitable for a protocol with exact specifications.
    """

    def __init__(self, data: bytes, encoding: str) -> None:
        """
        Initialize the XML decoder.

        Parameters:
            data - The XML document, as bytes, which should be decoded
                   into Nodes.
            encoding - The expected encoding of the XML. Replaced by the
                       encoding named in an <?xml ... ?> declaration if the
                       document has one.
        """
        self.stream = InputStream(data)
        self.root: Optional[Node] = None
        self.current: List[Node] = []
        self.encoding = encoding

    def __start_element(self, tag: bytes, attributes: Dict[str, str]) -> None:
        """
        Called when we encounter an element open tag. Also called when we
        encounter an empty element. Creates a new node with the specified
        name and attributes and pushes it onto the stack of open nodes.

        Parameters:
            tag - The tag name, as ASCII bytes.
            attributes - A dictionary keyed by attribute name and whose
                         values are the string attribute values. These
                         values should already be decoded from the XML's
                         encoding.

        Raises:
            XmlEncodingException - if the '__type' attribute names an
                                   unknown node type.
        """
        data_type = attributes.get('__type')
        # The presence of '__count' is what marks a node as an array.
        array = attributes.get('__count') is not None

        if data_type is None:
            # Special case for nodes that don't have a type
            node = Node(name=tag.decode('ascii'), type=Node.NODE_TYPE_VOID)
        else:
            # Get the data value
            type_int = Node.typename_to_type(data_type)
            if type_int is None:
                raise XmlEncodingException(
                    f'Invalid node type {data_type} for node {tag.decode("ascii")}'
                )

            node = Node(name=tag.decode('ascii'), type=type_int, array=array)

        # Now, do the attributes. '__type' and '__count' were consumed
        # above, so they are not copied onto the node.
        for attr in attributes:
            if attr == '__type' or attr == '__count':
                continue
            node.set_attribute(attr, attributes.get(attr))

        self.current.append(node)

    def __end_element(self, tag: bytes) -> None:
        """
        Called when we encounter an element close tag. Also called when we
        encounter an empty element, after __start_element is called. Does
        bookkeeping related to element order.

        Parameters:
            tag - The tag name, as ASCII bytes.

        Raises:
            Exception - if the closing tag does not match the innermost
                        open node.
        """
        node = self.current.pop()

        if node.name != tag.decode('ascii'):
            raise Exception(
                f'Logic error, expected {tag.decode("ascii")} but got {node.name}'
            )

        if not self.current:
            # Nothing left open, so this node is the document root.
            self.root = node
        else:
            parent = self.current[-1]
            parent.add_child(node)

    def __yield_values(self, text: str) -> Iterator[str]:
        """
        Yield each whitespace-separated, non-empty token found in text.
        """
        value = ''
        for c in text:
            if c.isspace():
                if len(value) > 0:
                    yield value
                value = ''
            else:
                value = value + c
        if len(value) > 0:
            yield value

    def __text(self, text: bytes) -> None:
        """
        Called when we finish parsing arbitrary non-element text. Note that
        the text passed in is in the XML document's encoding and it is this
        function's responsibility to decode it.

        Parameters:
            text - Text value of the node, as encoded by the XML document's
                   encoding.

        Raises:
            XmlEncodingException - if the text cannot be decoded with the
                                   current encoding.
        """
        try:
            value = text.decode(self.encoding)
        except UnicodeDecodeError as err:
            raise XmlEncodingException(
                'Failed to decode text node with given encoding') from err

        if not self.current:
            # Text outside of any element has nowhere to go.
            return

        node = self.current[-1]
        data_type = node.data_type
        composite = node.is_composite
        array = node.is_array

        if data_type == 'void':
            # We can't handle this
            return

        if data_type == 'str':
            # '&amp;' must be decoded last, otherwise an escaped ampersand
            # such as '&amp;lt;' would be unescaped twice and become '<'.
            for entity, char in (
                ('&lt;', '<'),
                ('&gt;', '>'),
                ('&apos;', '\''),
                ('&quot;', '\"'),
                ('&amp;', '&'),
            ):
                value = value.replace(entity, char)

            # Append to any value already set on this node.
            if node.value is None:
                node.set_value(value)
            else:
                node.set_value(node.value + value)
        elif data_type == 'bin':
            # Convert from a hex string
            def hex_to_bin(hexval: str) -> bytes:
                return struct.pack('>B', int(hexval, 16))

            # Remove any spaces first
            value = ''.join(c for c in value if not c.isspace())
            decoded = b''.join(
                hex_to_bin(value[i:(i + 2)])
                for i in range(0, len(value), 2)
            )
            if node.value is None:
                node.set_value(decoded)
            else:
                node.set_value(node.value + decoded)
        elif data_type == 'ip4':
            # Stored as the decoded text, unchanged.
            node.set_value(value)
        elif data_type == 'bool':
            def conv_bool(val: str) -> bool:
                # Only an explicit '0' or 'false' is False.
                if val and val.lower() in ['0', 'false']:
                    return False
                else:
                    return True

            if array or composite:
                node.set_value(
                    [conv_bool(v) for v in self.__yield_values(value)])
            else:
                node.set_value(conv_bool(value))
        elif data_type == 'float':
            if array or composite:
                node.set_value(
                    [float(v) for v in self.__yield_values(value)])
            else:
                node.set_value(float(value))
        else:
            # Every remaining type is parsed as an integer.
            if array or composite:
                node.set_value(
                    [int(v) for v in self.__yield_values(value)])
            else:
                node.set_value(int(value))

    def __parse_attributes(self, attributes: bytes) -> Dict[str, str]:
        """
        Given the raw bytes that follow a tag name, parse zero or more
        name="value" / name='value' pairs into a dictionary.

        Parameters:
            attributes - The attribute portion of a node, still encoded in
                         the XML document's encoding.

        Returns:
            A dictionary keyed by the attribute name and whose values are
            unescaped strings. If no attributes exist, this returns an
            empty dictionary.
        """
        attr_stream = InputStream(attributes)
        parsed_attrs: Dict[str, str] = {}
        state = 'space'
        attr = b''
        val = b''

        def unescape(value: bytes) -> str:
            text = value.decode(self.encoding)
            # '&amp;' must be decoded last, otherwise an escaped ampersand
            # such as '&amp;lt;' would be unescaped twice and become '<'.
            for entity, char in (
                ('&lt;', '<'),
                ('&gt;', '>'),
                ('&apos;', '\''),
                ('&quot;', '\"'),
                ('&#13;', '\r'),
                ('&#10;', '\n'),
                ('&amp;', '&'),
            ):
                text = text.replace(entity, char)
            return text

        while True:
            c = attr_stream.read_byte()
            if c is None:
                # End of input; an unterminated attribute is dropped.
                return parsed_attrs

            if state == 'space':
                # Between attributes, skip whitespace until a name starts.
                if not c.isspace():
                    state = 'attr'
                    attr = c
            elif state == 'attr':
                # Accumulate the name until the '=' separator.
                if c == b'=':
                    attr = attr.strip()
                    state = 'valstart'
                else:
                    attr = attr + c
            elif state == 'valstart':
                # Wait for the opening quote, remembering which kind.
                if c == b'"':
                    state = 'valdouble'
                    val = b''
                elif c == b'\'':
                    state = 'valsingle'
                    val = b''
            elif state == 'valdouble':
                if c == b'"':
                    state = 'space'
                    parsed_attrs[attr.decode('ascii')] = unescape(val)
                else:
                    val = val + c
            elif state == 'valsingle':
                if c == b'\'':
                    state = 'space'
                    parsed_attrs[attr.decode('ascii')] = unescape(val)
                else:
                    val = val + c

    def __split_node(self, content: bytes) -> Tuple[bytes, bytes]:
        """
        Split the contents of a node into its tag name and its attributes.

        Returns:
            A (tag, attributes) tuple. The tag is everything up to the
            first whitespace byte; the attributes are everything from the
            first non-whitespace byte after that to the end of the content.
        """
        node_stream = InputStream(content)
        tag = b''
        attributes = b''
        state = "tag"

        while True:
            c = node_stream.read_byte()
            if c is None:
                break

            if state == "tag":
                if c.isspace():
                    state = "space"
                else:
                    tag = tag + c
            elif state == "space":
                if not c.isspace():
                    attributes = c
                    state = "attributes"
            elif state == "attributes":
                attributes = attributes + c

        return (tag, attributes)

    def __handle_node(self, content: bytes) -> None:
        """
        Called whenever we encounter any node type. Filters out special
        nodes, determines whether this is a start, end or empty node, and
        fires off calls to the respective __start_element and __end_element
        functions.

        Parameters:
            content - The node contents, minus the < and > characters. This
                      will be encoded in the XML document's encoding.
        """
        if content[:1] == b'?' and content[-1:] == b'?':
            # Special node, parse to get the encoding.
            tag, attributes = self.__split_node(content[1:-1])
            if tag == b'xml':
                attributes_dict = self.__parse_attributes(attributes)
                if 'encoding' in attributes_dict:
                    self.encoding = attributes_dict['encoding']
            return

        if content[:1] == b'/':
            # We got an element end
            self.__end_element(content[1:])
        else:
            # We got a start element
            if content[-1:] == b'/':
                # This is an empty element
                empty = True
                content = content[:-1]
            else:
                # This node has subnodes or text
                empty = False

            tag, attributes = self.__split_node(content)
            self.__start_element(tag, self.__parse_attributes(attributes))
            if empty:
                self.__end_element(tag)

    def get_tree(self) -> Node:
        """
        Walk the XML document and parse into nodes.

        Returns:
            A Node object representing the root of the XML document.
        """
        state = 'text'
        text = b''
        node = b''

        while True:
            c = self.stream.read_byte()

            if c is None:
                return self.root
            elif state == 'text':
                if c == b'<':
                    # Flush the text gathered so far, then start a node.
                    self.__text(text)
                    state = 'node'
                    node = b''
                else:
                    text = text + c
            elif state == 'node':
                if c == b'>':
                    self.__handle_node(node)
                    state = 'text'
                    text = b''
                else:
                    node = node + c
class BinaryDecoder:
    """
    A class capable of taking a binary blob and decoding it to a Node tree.
    """

    def __init__(self, data: bytes, encoding: str) -> None:
        """
        Initialize the object.

        Parameters:
            - data - A binary blob of data to be decoded
            - encoding - A string representing the text encoding for string
                         elements. Should be either 'shift-jis', 'euc-jp'
                         or 'utf-8'
        """
        self.stream = InputStream(data)
        self.encoding = encoding
        self.executed = False

    def __read_node_name(self) -> str:
        """
        Given the current position in the stream, read the 6-bit-byte packed
        string name of the node.

        Returns:
            A string representing the name in ascii

        Raises:
            BinaryEncodingException - if the stream ends before the length
                                      or the packed name has been read.
        """
        length = self.stream.read_int()
        if length is None:
            raise BinaryEncodingException(
                "Ran out of data when attempting to read node name length!")

        # Each character takes 6 bits; round up to whole bytes.
        binary_length = ((length * 6) + 7) // 8

        bit_chunks = []
        for _ in range(binary_length):
            next_byte = self.stream.read_int()
            if next_byte is None:
                raise BinaryEncodingException(
                    "Ran out of data when attempting to read node name!")
            # Zero-padded 8-bit binary representation of the byte.
            bit_chunks.append(format(next_byte, '08b'))
        data = ''.join(bit_chunks)

        data_str = [data[i:(i + 6)] for i in range(0, len(data), 6)]
        data_int = [int(val, 2) for val in data_str]
        ret = ''.join([Node.NODE_NAME_CHARS[val] for val in data_int])
        # Drop any characters produced by the trailing padding bits.
        return ret[:length]

    def __read_node(self, node_type: int) -> Node:
        """
        Given an integer node type, read the node's name, possible
        attributes and children. Will return a Node representing this node.
        Note that calling this on the first node should return a tree of
        all nodes.

        Returns:
            Node object

        Raises:
            BinaryEncodingException - if the stream ends before the node is
                                      terminated.
        """
        name = self.__read_node_name()
        node = Node(name=name, type=node_type)

        while True:
            child_type = self.stream.read_int()
            if child_type is None:
                raise BinaryEncodingException(
                    "Ran out of data when attempting to read node type!")

            if child_type == Node.END_OF_NODE:
                return node
            elif child_type == Node.ATTR_TYPE:
                # Only the attribute name is set here; get_tree sets
                # attribute values later while decoding the body.
                key = self.__read_node_name()
                node.set_attribute(key)
            else:
                child = self.__read_node(child_type)
                node.add_child(child)

    def get_tree(self) -> Node:
        """
        Parse the header and body such that we can return a Node tree
        representing the data passed to us.

        Returns:
            Node object

        Raises:
            BinaryEncodingException - if called more than once on the same
                                      instance, or if the data is truncated
                                      or malformed.
        """
        if self.executed:
            raise BinaryEncodingException(
                "Logic error, should only call this once per instance")
        self.executed = True

        # Read the header first
        header_length = self.stream.read_int(4)
        if header_length is None:
            raise BinaryEncodingException(
                "Ran out of data when attempting to read header length!")

        node_type = self.stream.read_int()
        if node_type is None:
            raise BinaryEncodingException(
                "Ran out of data when attempting to read root node type!")
        root = self.__read_node(node_type)

        eod = self.stream.read_int()
        if eod != Node.END_OF_DOCUMENT:
            raise BinaryEncodingException(
                f'Unknown node type {eod} at end of document')

        # Skip by any padding. Stop with an error when the stream runs dry
        # instead of looping while the position never reaches the target.
        while self.stream.pos < header_length + 4:
            if self.stream.read_byte() is None:
                raise BinaryEncodingException(
                    "Ran out of data when attempting to skip header padding!")

        # Read the body next
        body_length = self.stream.read_int(4)
        if body_length is not None and body_length > 0:
            # We have a body
            body = self.stream.read_blob(body_length)
            if body is None:
                raise BinaryEncodingException('Body has insufficient data')

            ordering = PackedOrdering(body_length)
            values = PackedOrdering.node_to_body_ordering(root)

            for value in values:
                node = value['node']

                if value['type'] == 'attribute':
                    # Attributes are decoded as length-prefixed strings.
                    size = None
                    enc = 's'
                    dtype = 'str'
                    array = False
                    composite = False
                else:
                    size = node.data_length
                    enc = node.data_encoding
                    dtype = node.data_type
                    array = node.is_array
                    composite = node.is_composite

                if composite and array:
                    raise Exception(
                        'Logic error, no support for composite arrays!')

                if not array:
                    # Scalar value
                    alignment = value['alignment']

                    if alignment == 1:
                        loc = ordering.get_next_byte()
                    elif alignment == 2:
                        loc = ordering.get_next_short()
                    elif alignment == 4:
                        loc = ordering.get_next_int()
                    else:
                        # Previously fell through with 'loc' unbound.
                        raise Exception(
                            f'Logic error, unsupported alignment {alignment}!')

                    if loc is None:
                        raise BinaryEncodingException(
                            "Ran out of data when attempting to read node data location!"
                        )

                    if size is None:
                        # The size should be read from the first 4 bytes
                        size = struct.unpack('>I', body[loc:(loc + 4)])[0]
                        ordering.mark_used(size + 4, loc, round_to=4)
                        loc = loc + 4

                        decode_data = body[loc:(loc + size)]
                        decode_value = f'>{size}{enc}'
                    else:
                        # The size is built-in
                        ordering.mark_used(size, loc)

                        decode_data = body[loc:(loc + size)]
                        decode_value = f'>{enc}'

                    if composite:
                        val_list = list(
                            struct.unpack(decode_value, decode_data))
                        if value['type'] == 'attribute':
                            raise Exception(
                                'Logic error, shouldn\'t have composite attribute type!'
                            )
                        node.set_value(val_list)
                        continue

                    val = struct.unpack(decode_value, decode_data)[0]

                    if dtype == 'str':
                        # Need to convert this from encoding to standard
                        # string. Also, need to lob off the trailing null.
                        try:
                            val = val[:-1].decode(self.encoding)
                        except UnicodeDecodeError:
                            # Nothing we can do here; keep the raw bytes.
                            pass

                    if value['type'] == 'attribute':
                        node.set_attribute(value['name'], val)
                    else:
                        node.set_value(val)
                else:
                    # Array value
                    loc = ordering.get_next_int()

                    if loc is None:
                        raise BinaryEncodingException(
                            "Ran out of data when attempting to read array length location!"
                        )

                    # The raw size in bytes
                    length = struct.unpack('>I', body[loc:(loc + 4)])[0]
                    elems = length // size

                    ordering.mark_used(length + 4, loc, round_to=4)
                    loc = loc + 4
                    decode_data = body[loc:(loc + length)]
                    decode_value = f'>{enc * elems}'

                    val = struct.unpack(decode_value, decode_data)
                    node.set_value(list(val))

        return root