class KVGXmlDictionaryReader(_XmlBase):
    """Build a CharacterCollection from a KanjiVG-style XML dictionary.

    This reader handles the layout whose root element is ``<kanjis>``:
    each ``<kanji>`` element carries its character in the ``midashi``
    attribute and each ``<stroke>`` element carries SVG path data in the
    ``path`` attribute.
    """

    def __init__(self):
        # NOTE(review): self._first_tag is read in _start_element but is
        # never assigned in this class; presumably _XmlBase sets it before
        # parsing starts - confirm.
        self._charcol = CharacterCollection()

    def get_character_collection(self):
        """Return the CharacterCollection filled in by the handlers."""
        return self._charcol

    def _start_element(self, name, attrs):
        """Handle an opening tag.

        Args:
            name: tag name.
            attrs: mapping of attribute names to values.

        Raises:
            ValueError: if the first tag seen is not ``<kanjis>``.
        """
        self._tag = name

        if self._first_tag:
            self._first_tag = False
            if self._tag != "kanjis":
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("The very first tag should be <kanjis>")

        if self._tag == "kanji":
            self._writing = Writing()
            self._utf8 = attrs["midashi"].encode("UTF-8")

        if self._tag == "stroke":
            self._stroke = Stroke()
            if "path" in attrs:
                self._stroke_svg = attrs["path"].encode("UTF-8")
                try:
                    svg_parser = SVG_Parser(self._stroke_svg)
                    svg_parser.parse()
                    self._stroke.append_points(svg_parser.get_points())
                except Exception:
                    # Best effort: one unparsable path must not abort the
                    # whole dictionary.  Only Exception is caught so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    sys.stderr.write(
                        "Something went wrong in this character: "
                        + self._utf8 + "\n")
            else:
                # Diagnostics go to stderr, like the parse-failure message
                # above, instead of being printed on stdout.
                sys.stderr.write(
                    "Missing path in <stroke> element: " + self._utf8 + "\n")

    def _end_element(self, name):
        """Handle a closing tag.

        ``</kanji>`` stores the finished character in the collection, in a
        set named after the character itself; ``</stroke>`` appends the
        current stroke to the current writing.
        """
        if name == "kanji":
            char = Character()
            char.set_utf8(self._utf8)
            char.set_writing(self._writing)
            self._charcol.add_set(self._utf8)
            self._charcol.append_character(self._utf8, char)
            # Drop per-character parsing state if present.
            for attr_name in ("_tag", "_stroke"):
                self.__dict__.pop(attr_name, None)

        if name == "stroke":
            self._writing.append_stroke(self._stroke)
            self._stroke = None

        self._tag = None

    def _char_data(self, data):
        """Handle text found inside the ``utf8``/``width``/``height`` tags.

        NOTE(review): if these handlers are driven by expat, one text node
        can be delivered in several calls unless ``buffer_text`` is enabled
        on the parser; ``int(data)`` would then see partial digits.
        Confirm how _XmlBase configures the parser.
        """
        if self._tag == "utf8":
            self._utf8 = data.encode("UTF-8")
        elif self._tag == "width":
            self._writing.set_width(int(data))
        elif self._tag == "height":
            self._writing.set_height(int(data))
class KVGXmlDictionaryReader(_XmlBase):
    """Build a CharacterCollection from a KanjiVG XML dictionary.

    This reader handles the layout whose root element is ``<kanjivg>``:
    each ``<kanji>`` element encodes its code point in the ``id``
    attribute and each ``<path>`` element carries SVG path data in the
    ``d`` attribute.
    """

    def __init__(self):
        # NOTE(review): self._first_tag is read in _start_element but is
        # never assigned in this class; presumably _XmlBase sets it before
        # parsing starts - confirm.
        self._charcol = CharacterCollection()

    def get_character_collection(self):
        """Return the CharacterCollection filled in by the handlers."""
        return self._charcol

    def _start_element(self, name, attrs):
        """Handle an opening tag.

        Args:
            name: tag name.
            attrs: mapping of attribute names to values.

        Raises:
            ValueError: if the first tag seen is not ``<kanjivg>``.
        """
        self._tag = name

        if self._first_tag:
            self._first_tag = False
            if self._tag != "kanjivg":
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("The very first tag should be <kanjivg>")

        if self._tag == "kanji":
            self._writing = Writing()
            # The id is split on "_" and its second field is read as a
            # hexadecimal code point.
            # NOTE(review): on a narrow (UCS-2) Python 2 build unichr()
            # rejects code points above 0xFFFF - confirm that the input
            # never contains such characters.
            code_point = int(attrs["id"].split('_')[1], 16)
            self._utf8 = unichr(code_point).encode("UTF-8")

        if self._tag == "path":
            self._stroke = Stroke()
            if "d" in attrs:
                self._stroke_svg = attrs["d"].encode("UTF-8")
                svg_parser = SVG_Parser(self._stroke_svg)
                svg_parser.parse()
                self._stroke.append_points(svg_parser.get_points())
            else:
                sys.stderr.write(
                    "Missing data in <path> element: " + self._utf8 + "\n")

    def _end_element(self, name):
        """Handle a closing tag.

        ``</kanji>`` stores the finished character in the collection, in a
        set named after the character itself; ``</path>`` appends the
        current stroke to the current writing.
        """
        if name == "kanji":
            char = Character()
            char.set_utf8(self._utf8)
            char.set_writing(self._writing)
            self._charcol.add_set(self._utf8)
            self._charcol.append_character(self._utf8, char)
            # Drop per-character parsing state if present.
            for attr_name in ("_tag", "_stroke"):
                self.__dict__.pop(attr_name, None)

        if name == "path":
            self._writing.append_stroke(self._stroke)
            self._stroke = None

        self._tag = None

    def _char_data(self, data):
        """Handle text found inside the ``utf8``/``width``/``height`` tags.

        NOTE(review): if these handlers are driven by expat, one text node
        can be delivered in several calls unless ``buffer_text`` is enabled
        on the parser; ``int(data)`` would then see partial digits.
        Confirm how _XmlBase configures the parser.
        """
        if self._tag == "utf8":
            self._utf8 = data.encode("UTF-8")
        elif self._tag == "width":
            self._writing.set_width(int(data))
        elif self._tag == "height":
            self._writing.set_height(int(data))
class KVGXmlDictionaryReader(_XmlBase):
    """Read a KanjiVG XML dictionary into a CharacterCollection.

    Expected layout, as enforced by the handlers: a ``<kanjivg>`` root,
    ``<kanji id="...">`` elements whose id embeds the hexadecimal code
    point after an underscore, and ``<path d="...">`` elements holding the
    SVG path data of one stroke each.
    """

    def __init__(self):
        # NOTE(review): self._first_tag is used by _start_element but is
        # not initialised here; presumably the _XmlBase machinery sets it
        # before parsing - confirm.
        self._charcol = CharacterCollection()

    def get_character_collection(self):
        """Return the collection accumulated while parsing."""
        return self._charcol

    def _start_element(self, name, attrs):
        """Process an opening tag.

        Args:
            name: tag name.
            attrs: mapping of attribute names to values.

        Raises:
            ValueError: if the first tag seen is not ``<kanjivg>``.
        """
        self._tag = name

        if self._first_tag:
            self._first_tag = False
            if self._tag != "kanjivg":
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("The very first tag should be <kanjivg>")

        if self._tag == "kanji":
            self._writing = Writing()
            # Second "_"-separated field of the id, read as a hexadecimal
            # code point.
            # NOTE(review): unichr() on a narrow (UCS-2) Python 2 build
            # rejects code points above 0xFFFF - confirm the input range.
            hex_code = attrs["id"].split("_")[1]
            self._utf8 = unichr(int(hex_code, 16)).encode("UTF-8")

        if self._tag == "path":
            self._stroke = Stroke()
            if "d" in attrs:
                self._stroke_svg = attrs["d"].encode("UTF-8")
                svg_parser = SVG_Parser(self._stroke_svg)
                svg_parser.parse()
                self._stroke.append_points(svg_parser.get_points())
            else:
                sys.stderr.write(
                    "Missing data in <path> element: " + self._utf8 + "\n")

    def _end_element(self, name):
        """Process a closing tag.

        ``</kanji>`` adds the finished character to the collection under a
        set named after the character; ``</path>`` appends the current
        stroke to the current writing.
        """
        if name == "kanji":
            char = Character()
            char.set_utf8(self._utf8)
            char.set_writing(self._writing)
            self._charcol.add_set(self._utf8)
            self._charcol.append_character(self._utf8, char)
            # Discard per-character parsing state if present.
            for attr_name in ("_tag", "_stroke"):
                self.__dict__.pop(attr_name, None)

        if name == "path":
            self._writing.append_stroke(self._stroke)
            self._stroke = None

        self._tag = None

    def _char_data(self, data):
        """Process text found inside ``utf8``/``width``/``height`` tags.

        NOTE(review): if these handlers are driven by expat, one text node
        can arrive in several calls unless ``buffer_text`` is enabled on
        the parser; ``int(data)`` would then see partial digits.  Confirm
        how _XmlBase configures the parser.
        """
        if self._tag == "utf8":
            self._utf8 = data.encode("UTF-8")
        elif self._tag == "width":
            self._writing.set_width(int(data))
        elif self._tag == "height":
            self._writing.set_height(int(data))
def get_character_collection(self):
    """Build a CharacterCollection from the parsed labels and delineations.

    One Character is created per entry of ``self._labels``; its writing is
    assembled from the delineation ranges in ``self._delineations`` at the
    same index.  Characters sharing a label are stored in a set named
    after that label.

    Returns:
        A new CharacterCollection.
    """
    charcol = CharacterCollection()

    # group characters with the same label into sets
    sets = {}
    for i, utf8 in enumerate(self._labels):
        # Create Character
        writing = Writing()
        if self.height and self.width:
            writing.set_height(self.height)
            writing.set_width(self.width)

        for delin_range in self._delineations[i]:
            first_comp = delin_range.start_comp
            # end_comp is used as an exclusive bound throughout.
            last_comp = delin_range.end_comp - 1

            if first_comp == last_comp:
                # The whole range lies inside a single component.
                stroke_points = self._strokes[first_comp][
                    delin_range.start_point:delin_range.end_point]
                writing.append_stroke(Stroke.from_list(stroke_points))
                continue

            # The range spans several components.  Strokes are appended in
            # component order (first, middle ones, last) so that the
            # writing keeps the original stroke order.

            # NOTE(review): the ":-1" slice leaves out the final point of
            # the first component; kept as is - confirm it is intended.
            start_stroke_points = self._strokes[first_comp][
                delin_range.start_point:-1]
            if len(start_stroke_points) > 0:
                writing.append_stroke(Stroke.from_list(start_stroke_points))

            # NOTE(review): the middle components are appended as stored,
            # without Stroke.from_list(), unlike the two partial strokes;
            # kept as is - confirm the element type of self._strokes.
            for stroke in self._strokes[first_comp + 1:last_comp]:
                writing.append_stroke(stroke)

            end_stroke_points = self._strokes[last_comp][
                0:delin_range.end_point]
            if len(end_stroke_points) > 0:
                writing.append_stroke(Stroke.from_list(end_stroke_points))

        character = Character()
        character.set_writing(writing)
        character.set_utf8(utf8)
        sets.setdefault(utf8, []).append(character)

    charcol.add_sets(sets.keys())

    for set_name, characters in sets.items():
        charcol.append_characters(set_name, characters)

    return charcol
class TomoeXmlDictionaryReader(_XmlBase):
    """Build a CharacterCollection from a Tomoe XML dictionary.

    Expected layout, as enforced by the handlers: a ``<dictionary>`` root
    containing ``<character>`` elements; the character label, width and
    height come from the text of ``<utf8>``, ``<width>`` and ``<height>``
    elements, and each ``<stroke>`` is made of ``<point>`` elements.
    """

    # Point attributes parsed as floats; the remaining ones are truncated
    # to integers.
    _FLOAT_KEYS = ("pressure", "xtilt", "ytilt")
    _POINT_KEYS = ("x", "y", "pressure", "xtilt", "ytilt", "timestamp")

    def __init__(self):
        # NOTE(review): self._first_tag is read in _start_element but is
        # never assigned in this class; presumably _XmlBase sets it before
        # parsing starts - confirm.
        self._charcol = CharacterCollection()

    def get_character_collection(self):
        """Return the CharacterCollection filled in by the handlers."""
        return self._charcol

    def _start_element(self, name, attrs):
        """Handle an opening tag.

        Args:
            name: tag name.
            attrs: mapping of attribute names to values.

        Raises:
            ValueError: if the first tag seen is not ``<dictionary>``.
        """
        self._tag = name

        if self._first_tag:
            self._first_tag = False
            if self._tag != "dictionary":
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError(
                    "The very first tag should be <dictionary>")

        if self._tag == "character":
            self._writing = Writing()

        if self._tag == "stroke":
            self._stroke = Stroke()

        elif self._tag == "point":
            point = Point()

            for key in self._POINT_KEYS:
                if key in attrs:
                    value = attrs[key].encode("UTF-8")
                    if key in self._FLOAT_KEYS:
                        value = float(value)
                    else:
                        # Going through float() accepts values written
                        # with a decimal part, e.g. "12.0".
                        value = int(float(value))
                else:
                    # Absent attributes are explicitly set to None.
                    value = None

                setattr(point, key, value)

            self._stroke.append_point(point)

    def _end_element(self, name):
        """Handle a closing tag.

        ``</character>`` stores the finished character in the collection,
        in a set named after the character itself; ``</stroke>`` appends
        the current stroke to the current writing.
        """
        if name == "character":
            # NOTE(review): self._utf8 is only assigned by <utf8> text and
            # is not reset between characters, so a <character> without
            # <utf8> would reuse the previous label - confirm that the
            # format guarantees the element.
            char = Character()
            char.set_utf8(self._utf8)
            char.set_writing(self._writing)
            self._charcol.add_set(self._utf8)
            self._charcol.append_character(self._utf8, char)
            # Drop per-character parsing state if present.
            for attr_name in ("_tag", "_stroke"):
                self.__dict__.pop(attr_name, None)

        if name == "stroke":
            self._writing.append_stroke(self._stroke)
            self._stroke = None

        self._tag = None

    def _char_data(self, data):
        """Handle text found inside the ``utf8``/``width``/``height`` tags.

        NOTE(review): if these handlers are driven by expat, one text node
        can be delivered in several calls unless ``buffer_text`` is enabled
        on the parser; ``int(data)`` would then see partial digits.
        Confirm how _XmlBase configures the parser.
        """
        if self._tag == "utf8":
            self._utf8 = data.encode("UTF-8")
        elif self._tag == "width":
            self._writing.set_width(int(data))
        elif self._tag == "height":
            self._writing.set_height(int(data))