def test_xml_text_parents(self):
    """ Check that a text token inside <root> lists exactly one parent
        tag, namely b"root".
    """
    print("test_xmlparser.py: test_xml_text_parents start")
    document = io.BytesIO(b"<root>Hello, world!</root>")
    parser = XmlParser(document)

    # The first token is discarded; the one after it is the token under
    # test (the text content between the tags).
    parser.next_token()
    text_token = parser.next_token()

    parent_count = len(text_token.get_parent_tags())
    self.assertEqual(parent_count, 1)
    first_parent = text_token.get_parent_tags()[0]
    self.assertEqual(first_parent, b"root")
    print("test_xmlparser.py: test_xml_text_parents end")
def test_xml_pure_text(self):
    """ Verify that input consisting only of plain text (no tags at all)
        makes the parser raise XmlParserInvalidSyntaxError on the very
        first token request.
    """
    print("test_xmlparser.py: test_xml_pure_text start")
    parser = XmlParser(io.BytesIO(b"hello world"))
    # assertRaises fails the test if the block exits without the expected
    # exception, replacing the manual had_error flag.
    with self.assertRaises(
            XmlParserInvalidSyntaxError,
            msg="verify the parser has aborted " +
                "parsing \"hello world\" as XML"):
        parser.next_token()
    print("test_xmlparser.py: test_xml_pure_text end")
def test_xml_attributes_unfinished(self):
    """ Verify that an attribute with '=' but no value
        (<child myattr=/>) is rejected with XmlParserInvalidSyntaxError.
    """
    print("test_xmlparser.py: test_xml_attributes_unfinished start")
    parser = XmlParser(io.BytesIO(b"<root><child myattr=/></root>"))
    # The root tag itself is well-formed and must still be delivered.
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(parser.next_token().get_tag(), b"root")
    with self.assertRaises(
            XmlParserInvalidSyntaxError,
            msg="verify the parser has aborted " +
                "due to unfinished attribute (= character but no value)"):
        parser.next_token()
    print("test_xmlparser.py: test_xml_attributes_unfinished end")
def test_xml_invalid_xml_tag_position(self):
    """ Verify that an <?xml ...?> declaration appearing inside the root
        element is rejected with XmlParserInvalidSyntaxError.
    """
    print("test_xmlparser.py: test_xml_invalid_xml_tag_position start")
    parser = XmlParser(io.BytesIO(
        b"<root><?xml version=\"1.0\"?></root>"))
    # Consume the first token (for <root>); the error is expected on the
    # following request.
    parser.next_token()
    with self.assertRaises(
            XmlParserInvalidSyntaxError,
            msg="verify the parser has aborted " +
                "after encountering invalid followup data after " +
                "closing '?'"):
        parser.next_token()
    print("test_xmlparser.py: test_xml_invalid_xml_tag_position end")
def test_xml_selfclosing_invalid_followup(self):
    """ Verify that characters between the '/' of a self-closing tag and
        its '>' (<child/test>) are rejected with
        XmlParserInvalidSyntaxError.
    """
    print("test_xmlparser.py: test_xml_selfclosing_invalid_followup " +
        "start")
    parser = XmlParser(io.BytesIO(b"<root><child/test></root>"))
    # Consume the first token (for <root>); the error is expected on the
    # following request.
    parser.next_token()
    with self.assertRaises(
            XmlParserInvalidSyntaxError,
            msg="verify the parser has aborted " +
                "after encountering invalid followup data after " +
                "closing '/'"):
        parser.next_token()
    print("test_xmlparser.py: test_xml_selfclosing_invalid_followup end")
def test_xml_unfinished(self):
    """ Verify that a document ending while <root> is still open raises
        XmlParserInvalidSyntaxError once the available tokens are used up.
    """
    print("test_xmlparser.py: test_xml_unfinished start")
    parser = XmlParser(io.BytesIO(b"<root><child/>"))
    # Three tokens are expected before the error: one tagged root, then
    # two tagged child (the self-closing tag yields two tokens).
    self.assertEqual(parser.next_token().get_tag(), b"root")
    self.assertEqual(parser.next_token().get_tag(), b"child")
    self.assertEqual(parser.next_token().get_tag(), b"child")
    with self.assertRaises(
            XmlParserInvalidSyntaxError,
            msg="verify the parser has aborted " +
                "due to unfinished XML"):
        token = parser.next_token()
        # Only reached if the parser wrongly returned a token; printed to
        # help diagnose the failure.
        print("TOKEN: " + str(token))
    print("test_xmlparser.py: test_xml_unfinished end")
def test_xml_attributes_malformed(self):
    """ Verify that a tag containing a bare quoted string instead of a
        name=value attribute (<child ""/>) is rejected with
        XmlParserInvalidSyntaxError.
    """
    print("test_xmlparser.py: test_xml_attributes_malformed start")
    parser = XmlParser(io.BytesIO(b"<root><child \"\"/></root>"))
    root_begin = parser.next_token()
    self.assertEqual(root_begin.get_type(), "begin_tag")
    self.assertEqual(root_begin.get_tag(), b"root")
    with self.assertRaises(
            XmlParserInvalidSyntaxError,
            msg="verify the parser has aborted " +
                "due to malformed attribute"):
        token = parser.next_token()
        # Only reached if the parser wrongly accepted the tag; printed to
        # help diagnose the failure.
        print("token: " + str(token))
        print("attributes: " + str(token.attributes))
    print("test_xmlparser.py: test_xml_attributes_malformed end")
def test_xml_parents(self):
    """ Walk through <root><child1><child2/></child1></root> and check
        the parent tag list reported by every begin and end token.
    """
    print("test_xmlparser.py: test_xml_parents start")
    parser = XmlParser(io.BytesIO(
        b"<root><child1><child2/></child1></root>"))

    # unittest assertions are used instead of bare assert statements so
    # the checks still run under "python -O" and report both values.
    root_begin = parser.next_token()
    self.assertEqual(len(root_begin.get_parent_tags()), 0)

    child1_begin = parser.next_token()
    self.assertEqual(len(child1_begin.get_parent_tags()), 1)
    self.assertEqual(child1_begin.get_parent_tags()[0], b"root")

    child2_begin = parser.next_token()
    self.assertEqual(len(child2_begin.get_parent_tags()), 2)
    self.assertEqual(child2_begin.get_parent_tags()[0], b"root")
    self.assertEqual(child2_begin.get_parent_tags()[1], b"child1")

    child2_end = parser.next_token()
    self.assertEqual(child2_end.get_tag(), b"child2")
    # Check the end token here; the begin token was already verified.
    self.assertEqual(len(child2_end.get_parent_tags()), 2)
    self.assertEqual(child2_end.get_parent_tags()[0], b"root")
    self.assertEqual(child2_end.get_parent_tags()[1], b"child1")

    child1_end = parser.next_token()
    self.assertEqual(child1_end.get_tag(), b"child1")
    self.assertEqual(len(child1_end.get_parent_tags()), 1)
    self.assertEqual(child1_end.get_parent_tags()[0], b"root")

    root_end = parser.next_token()
    self.assertEqual(len(root_end.get_parent_tags()), 0)
    print("test_xmlparser.py: test_xml_parents end")
def deserialize(self, xml_file, expected_root_tag, expected_namespace,
        expected_max_file_size=None):
    """ Parse and deserialize an XML file.

        The XML file must have one root tag expected to you, which is
        checked for existence.
        All further subvalues must be either dictionaries serialized
        with the given expected namespace (tags of the form
        <namespace:key>), or <int>, <Decimal>, <list>, <str>, <bool> or
        <None> value tags inside. A <list> may only contain value tags;
        a list inside a list is rejected.

        Returns the resulting dictionary with all the nested dictionaries
        and values inside.

        Raises ValueError if expected_root_tag is empty, or if a size
        limit was given but xml_file is not seekable.
        Raises XmlParserParseError if the file is longer than expected,
        or doesn't have the expected root tag or namespace, or contains
        a misplaced, empty or unsupported value tag.
        In addition, errors of the XmlParser (XmlParserParseError,
        XmlParserInvalidSyntaxError) are passed through, and so are
        conversion errors raised by int() and Decimal().

        The parsing state is kept in attributes on self (result_dict,
        parser, token, ...) for the duration of the call.

        Example with expected_root_tag="my_settings_storage",
        expected_namespace="my_game_settings":

        <?xml version="1.0"?>
        <my_settings_storage>
            <my_game_settings:resolution>
                <list>
                    <int>800</int>
                    <int>600</int>
                </list>
            </my_game_settings:resolution>
        </my_settings_storage>
    """
    if len(expected_root_tag) == 0:
        raise ValueError("need expected root tag")
    if expected_max_file_size is not None:
        if not xml_file.seekable():
            raise ValueError("cannot truncate non-seekable file")
        # measure the file by seeking to its end, then rewind:
        xml_file.seek(0, io.SEEK_END)
        if xml_file.tell() > expected_max_file_size:
            raise XmlParserParseError("file larger than expected - " +
                str(expected_max_file_size) + " bytes")
        xml_file.seek(0, io.SEEK_SET)

    # information about the dictionary structure:
    self.result_dict = dict()
    self.current_parent = self.result_dict
    self.current_parent_parents = []
    self.current_parent_name_in_parents = []

    # if we're inside a value tag and expect a value:
    self.expected_value_type = None
    self.value_was_set = False

    # remember what the next closing tag may be for if it has a special
    # purpose
    self.next_end_tag_closes_value = False
    self.next_end_tag_closes_list = False

    # recall whether we are in a list:
    self.in_a_list = False
    self.current_list = None

    # set up the parser
    self.parser = XmlParser(xml_file)
    self.token = self.parser.next_token()

    def apply_value_to_current(value):
        """ Store value in the open list, or else in place of the
            placeholder dict created for the enclosing dictionary tag.
        """
        self.next_end_tag_closes_value = True

        # if in a list, add to that instead:
        if self.in_a_list:
            self.current_list.append(value)
            return

        # overwrite the entry of the enclosing tag in its parent:
        name = self.current_parent_name_in_parents[-1]
        self.current_parent_parents[-1][name] = value
        self.current_parent = value

    while self.token.get_type() != "end_document":
        token_type = self.token.get_type()
        if token_type == "begin_tag":
            if len(self.token.get_parent_tags()) == 0:
                # root tag.
                actual_root_tag = self.token.get_tag().decode("utf-8")
                if actual_root_tag != expected_root_tag:
                    raise XmlParserParseError('wrong root tag. got "' +
                        actual_root_tag + '", expected "' +
                        expected_root_tag + '"')
            else:
                tag = self.token.get_tag().decode("utf-8")
                # see if dict tag or value tag:
                if tag.find(":") >= 0:
                    # dict tag. verify namespace:
                    if not tag.startswith(expected_namespace + ":"):
                        raise XmlParserParseError("invalid namespace " +
                            "found - should always be " +
                            expected_namespace)
                    # truncate namespace:
                    tag = tag.split(":", 1)[1]
                elif tag == "list":
                    if self.in_a_list:
                        raise XmlParserParseError("list inside list " +
                            "is not allowed")
                    if self.next_end_tag_closes_value or \
                            self.next_end_tag_closes_list:
                        raise XmlParserParseError("this tag already " +
                            "has contents, cannot add list")
                    # an empty parent stack means we are directly in root
                    if len(self.current_parent_parents) == 0:
                        raise XmlParserParseError("can only have " +
                            "list inside a dictionary tag, not inside " +
                            "root")
                    self.in_a_list = True
                    self.current_list = list()
                    self.token = self.parser.next_token()
                    continue
                else:
                    # value tag:
                    if self.expected_value_type:
                        raise XmlParserParseError("cannot nest values")
                    if self.next_end_tag_closes_value or \
                            self.next_end_tag_closes_list:
                        raise XmlParserParseError("this tag already " +
                            "has contents, cannot add value")
                    # an empty parent stack means we are directly in root
                    if len(self.current_parent_parents) == 0:
                        raise XmlParserParseError("can only have " +
                            "value inside a dictionary tag, not inside " +
                            "root")
                    self.expected_value_type = tag
                    self.value_was_set = False
                    # don't make a child dict, just continue:
                    self.token = self.parser.next_token()
                    continue

                # go deeper one level, assume it is of type dict for now:
                self.current_parent_parents.append(self.current_parent)
                self.current_parent_name_in_parents.append(tag)
                self.current_parent[tag] = dict()
                self.current_parent = self.current_parent[tag]
                # if it is actually a value or a list, we will replace it
                # later with that instead.
        elif token_type == "end_tag":
            if self.expected_value_type is not None and not \
                    self.value_was_set:
                raise XmlParserParseError("invalid empty value tag found")
            if self.expected_value_type is not None:
                # leave value tag (e.g. </int>):
                assert(self.expected_value_type ==
                    self.token.get_tag().decode("utf-8"))
                assert(self.next_end_tag_closes_value)
                self.expected_value_type = None
                self.next_end_tag_closes_value = False
                self.token = self.parser.next_token()
                continue
            # tag names are bytes, so decode before comparing to "list"
            if self.token.get_tag().decode("utf-8") == "list":
                # leave list tag </list>
                self.in_a_list = False
                apply_value_to_current(self.current_list)
                self.current_list = None
                self.next_end_tag_closes_value = False
                # consumed by the check right below, so that the closing
                # dictionary tag afterwards is handled normally
                self.next_end_tag_closes_list = True

            # get out of current value:
            if self.next_end_tag_closes_value or \
                    self.next_end_tag_closes_list:
                self.next_end_tag_closes_list = False
                self.next_end_tag_closes_value = False
                self.token = self.parser.next_token()
                continue
            if len(self.current_parent_parents) > 0:
                # closing a dictionary tag: go back up one level
                assert(self.token.get_tag().decode("utf-8").
                    startswith(expected_namespace + ":"))
                self.current_parent = self.current_parent_parents[-1]
                del(self.current_parent_parents[-1])
                del(self.current_parent_name_in_parents[-1])
            else:
                # closing the root tag: we are done
                assert(self.token.get_tag().decode("utf-8")
                    == expected_root_tag)
                return self.result_dict
        elif token_type == "text":
            text = self.token.get_text().strip()
            if self.expected_value_type is None:
                if len(text) == 0:
                    # just skip over this
                    self.token = self.parser.next_token()
                    continue
                raise XmlParserParseError("invalid text content in " +
                    "no value tag")

            # now set this value accordingly:
            self.value_was_set = True
            if self.expected_value_type == "str":
                apply_value_to_current(text)
            elif self.expected_value_type == "int":
                apply_value_to_current(int(text))
            elif self.expected_value_type == "Decimal":
                apply_value_to_current(Decimal(text))
            elif self.expected_value_type == "bool":
                # bool("False") would be True, so compare the spelling.
                # The text may be bytes or str depending on the parser.
                if isinstance(text, bytes):
                    bool_text = text.decode("utf-8")
                else:
                    bool_text = str(text)
                bool_text = bool_text.lower()
                if bool_text in ("true", "1"):
                    apply_value_to_current(True)
                elif bool_text in ("false", "0"):
                    apply_value_to_current(False)
                else:
                    raise XmlParserParseError("invalid bool value '" +
                        bool_text + "' encountered")
            elif self.expected_value_type == "None":
                apply_value_to_current(None)
            else:
                raise XmlParserParseError("unsupported value tag " +
                    "'" + self.expected_value_type + "' encountered")
        self.token = self.parser.next_token()
    return self.result_dict
class XmlSimpleDictionarySerializer(object):
    """ Converts nested dictionaries of simple values to a small XML
        dialect (serialize) and back (deserialize).
    """

    def serialize(self, dictionary, root_tag, namespace="serialized"):
        """ Serialize a given dictionary with any sort of values or nested
            dictionaries inside to XML.

            All dictionaries will be serialized to tags with the keys
            being the tag names, and the values being the contents.
            The whole serialization will use one unified namespace, as if
            the whole thing was one object.
            (also see deserialize() for an example)

            Any sort of actual non-dictionary value will be serialized to
            special non-prefixed <int>, <Decimal>, <list>, <str> etc. tags.

            Supported types are: list, int, Decimal (also used for
            float), str, bool, None

            Returns the XML document as a string. Raises ValueError if
            dictionary is not a dict or a dict occurs inside a list, and
            TypeError for unsupported value types.
        """
        if not isinstance(dictionary, dict):
            raise ValueError("not a dictionary")
        txt = '<?xml version="1.0"?>\n' + \
            "<" + root_tag + ">\n" + \
            self._serialize(dictionary, namespace, 1) + \
            "</" + root_tag + ">\n"
        return txt

    def _serialize(self, obj, namespace, indent=0, no_dict=False):
        """ Return the XML text for obj, indented by indent levels of
            four spaces. With no_dict set (used for list items), a
            dictionary raises ValueError.
        """
        current_indent = ' ' * (4 * indent)
        if obj is None:
            return current_indent + "<None>None</None>\n"
        if isinstance(obj, dict):
            if no_dict:
                raise ValueError("dictionary not allowed here - can " +
                    "only list dictionary in dictionaries, not inside " +
                    "other types (e.g. list)")
            parts = []
            for key in obj:
                full_tag = namespace + ":" + key
                parts.append(current_indent + "<" + full_tag + ">\n")
                parts.append(self._serialize(obj[key], namespace,
                    indent + 1))
                parts.append(current_indent + "</" + full_tag + ">\n")
            return "".join(parts)
        if isinstance(obj, list):
            parts = [current_indent + "<list>\n"]
            for item in obj:
                parts.append(self._serialize(item, namespace, indent + 1,
                    no_dict=True))
            parts.append(current_indent + "</list>\n")
            return "".join(parts)
        # bool must be tested before int: bool is a subclass of int, so
        # True would otherwise be written as <int>True</int>.
        if isinstance(obj, bool):
            return current_indent + "<bool>" + str(obj) + "</bool>\n"
        if isinstance(obj, int):
            return current_indent + "<int>" + str(obj) + "</int>\n"
        if isinstance(obj, str):
            # Escape the XML special characters. The ampersand goes first
            # so the entities produced for < and > aren't escaped again.
            # The entities are assembled from pieces on purpose, so this
            # source survives tools that decode entities in text.
            amp = "&"
            escaped = obj.replace(amp, amp + "amp;")
            escaped = escaped.replace(">", amp + "gt;")
            escaped = escaped.replace("<", amp + "lt;")
            return current_indent + "<str>" + escaped + "</str>\n"
        if isinstance(obj, (Decimal, float)):
            return current_indent + "<Decimal>" + str(obj) + "</Decimal>\n"
        raise TypeError("unsupported value type - cannot serialize")

    def deserialize(self, xml_file, expected_root_tag, expected_namespace,
            expected_max_file_size=None):
        """ Parse and deserialize an XML file.

            The XML file must have one root tag expected to you, which is
            checked for existence.
            All further subvalues must be either dictionaries serialized
            with the given expected namespace (tags of the form
            <namespace:key>), or <int>, <Decimal>, <list>, <str>, <bool>
            or <None> value tags inside. A <list> may only contain value
            tags; a list inside a list is rejected.

            Returns the resulting dictionary with all the nested
            dictionaries and values inside.

            Raises ValueError if expected_root_tag is empty, or if a size
            limit was given but xml_file is not seekable.
            Raises XmlParserParseError if the file is longer than
            expected, or doesn't have the expected root tag or namespace,
            or contains a misplaced, empty or unsupported value tag.
            In addition, errors of the XmlParser (XmlParserParseError,
            XmlParserInvalidSyntaxError) are passed through, and so are
            conversion errors raised by int() and Decimal().

            The parsing state is kept in attributes on self (result_dict,
            parser, token, ...) for the duration of the call.

            Example with expected_root_tag="my_settings_storage",
            expected_namespace="my_game_settings":

            <?xml version="1.0"?>
            <my_settings_storage>
                <my_game_settings:resolution>
                    <list>
                        <int>800</int>
                        <int>600</int>
                    </list>
                </my_game_settings:resolution>
            </my_settings_storage>
        """
        if len(expected_root_tag) == 0:
            raise ValueError("need expected root tag")
        if expected_max_file_size is not None:
            if not xml_file.seekable():
                raise ValueError("cannot truncate non-seekable file")
            # measure the file by seeking to its end, then rewind:
            xml_file.seek(0, io.SEEK_END)
            if xml_file.tell() > expected_max_file_size:
                raise XmlParserParseError("file larger than expected - " +
                    str(expected_max_file_size) + " bytes")
            xml_file.seek(0, io.SEEK_SET)

        # information about the dictionary structure:
        self.result_dict = dict()
        self.current_parent = self.result_dict
        self.current_parent_parents = []
        self.current_parent_name_in_parents = []

        # if we're inside a value tag and expect a value:
        self.expected_value_type = None
        self.value_was_set = False

        # remember what the next closing tag may be for if it has a
        # special purpose
        self.next_end_tag_closes_value = False
        self.next_end_tag_closes_list = False

        # recall whether we are in a list:
        self.in_a_list = False
        self.current_list = None

        # set up the parser
        self.parser = XmlParser(xml_file)
        self.token = self.parser.next_token()

        def apply_value_to_current(value):
            """ Store value in the open list, or else in place of the
                placeholder dict created for the enclosing dictionary tag.
            """
            self.next_end_tag_closes_value = True

            # if in a list, add to that instead:
            if self.in_a_list:
                self.current_list.append(value)
                return

            # overwrite the entry of the enclosing tag in its parent:
            name = self.current_parent_name_in_parents[-1]
            self.current_parent_parents[-1][name] = value
            self.current_parent = value

        while self.token.get_type() != "end_document":
            token_type = self.token.get_type()
            if token_type == "begin_tag":
                if len(self.token.get_parent_tags()) == 0:
                    # root tag.
                    actual_root_tag = self.token.get_tag().decode("utf-8")
                    if actual_root_tag != expected_root_tag:
                        raise XmlParserParseError(
                            'wrong root tag. got "' +
                            actual_root_tag + '", expected "' +
                            expected_root_tag + '"')
                else:
                    tag = self.token.get_tag().decode("utf-8")
                    # see if dict tag or value tag:
                    if tag.find(":") >= 0:
                        # dict tag. verify namespace:
                        if not tag.startswith(expected_namespace + ":"):
                            raise XmlParserParseError(
                                "invalid namespace " +
                                "found - should always be " +
                                expected_namespace)
                        # truncate namespace:
                        tag = tag.split(":", 1)[1]
                    elif tag == "list":
                        if self.in_a_list:
                            raise XmlParserParseError(
                                "list inside list " +
                                "is not allowed")
                        if self.next_end_tag_closes_value or \
                                self.next_end_tag_closes_list:
                            raise XmlParserParseError(
                                "this tag already " +
                                "has contents, cannot add list")
                        # an empty parent stack means we are directly in
                        # root
                        if len(self.current_parent_parents) == 0:
                            raise XmlParserParseError(
                                "can only have " +
                                "list inside a dictionary tag, not " +
                                "inside " + "root")
                        self.in_a_list = True
                        self.current_list = list()
                        self.token = self.parser.next_token()
                        continue
                    else:
                        # value tag:
                        if self.expected_value_type:
                            raise XmlParserParseError(
                                "cannot nest values")
                        if self.next_end_tag_closes_value or \
                                self.next_end_tag_closes_list:
                            raise XmlParserParseError(
                                "this tag already " +
                                "has contents, cannot add value")
                        # an empty parent stack means we are directly in
                        # root
                        if len(self.current_parent_parents) == 0:
                            raise XmlParserParseError(
                                "can only have " +
                                "value inside a dictionary tag, not " +
                                "inside " + "root")
                        self.expected_value_type = tag
                        self.value_was_set = False
                        # don't make a child dict, just continue:
                        self.token = self.parser.next_token()
                        continue

                    # go deeper one level, assume it is of type dict for
                    # now:
                    self.current_parent_parents.append(self.current_parent)
                    self.current_parent_name_in_parents.append(tag)
                    self.current_parent[tag] = dict()
                    self.current_parent = self.current_parent[tag]
                    # if it is actually a value or a list, we will replace
                    # it later with that instead.
            elif token_type == "end_tag":
                if self.expected_value_type is not None and not \
                        self.value_was_set:
                    raise XmlParserParseError(
                        "invalid empty value tag found")
                if self.expected_value_type is not None:
                    # leave value tag (e.g. </int>):
                    assert(self.expected_value_type ==
                        self.token.get_tag().decode("utf-8"))
                    assert(self.next_end_tag_closes_value)
                    self.expected_value_type = None
                    self.next_end_tag_closes_value = False
                    self.token = self.parser.next_token()
                    continue
                # tag names are bytes, so decode before comparing to "list"
                if self.token.get_tag().decode("utf-8") == "list":
                    # leave list tag </list>
                    self.in_a_list = False
                    apply_value_to_current(self.current_list)
                    self.current_list = None
                    self.next_end_tag_closes_value = False
                    # consumed by the check right below, so that the
                    # closing dictionary tag afterwards is handled normally
                    self.next_end_tag_closes_list = True

                # get out of current value:
                if self.next_end_tag_closes_value or \
                        self.next_end_tag_closes_list:
                    self.next_end_tag_closes_list = False
                    self.next_end_tag_closes_value = False
                    self.token = self.parser.next_token()
                    continue
                if len(self.current_parent_parents) > 0:
                    # closing a dictionary tag: go back up one level
                    assert(self.token.get_tag().decode("utf-8").
                        startswith(expected_namespace + ":"))
                    self.current_parent = self.current_parent_parents[-1]
                    del(self.current_parent_parents[-1])
                    del(self.current_parent_name_in_parents[-1])
                else:
                    # closing the root tag: we are done
                    assert(self.token.get_tag().decode("utf-8")
                        == expected_root_tag)
                    return self.result_dict
            elif token_type == "text":
                text = self.token.get_text().strip()
                if self.expected_value_type is None:
                    if len(text) == 0:
                        # just skip over this
                        self.token = self.parser.next_token()
                        continue
                    raise XmlParserParseError(
                        "invalid text content in " +
                        "no value tag")

                # now set this value accordingly:
                self.value_was_set = True
                if self.expected_value_type == "str":
                    apply_value_to_current(text)
                elif self.expected_value_type == "int":
                    apply_value_to_current(int(text))
                elif self.expected_value_type == "Decimal":
                    apply_value_to_current(Decimal(text))
                elif self.expected_value_type == "bool":
                    # bool("False") would be True, so compare the
                    # spelling. The text may be bytes or str depending on
                    # the parser.
                    if isinstance(text, bytes):
                        bool_text = text.decode("utf-8")
                    else:
                        bool_text = str(text)
                    bool_text = bool_text.lower()
                    if bool_text in ("true", "1"):
                        apply_value_to_current(True)
                    elif bool_text in ("false", "0"):
                        apply_value_to_current(False)
                    else:
                        raise XmlParserParseError(
                            "invalid bool value '" +
                            bool_text + "' encountered")
                elif self.expected_value_type == "None":
                    apply_value_to_current(None)
                else:
                    raise XmlParserParseError(
                        "unsupported value tag " +
                        "'" + self.expected_value_type + "' encountered")
            self.token = self.parser.next_token()
        return self.result_dict