def _iterOneDirection(self, column1, column2):
	"""Yield one glossary entry per distinct headword for one lookup
	direction (column1 -> column2).

	Rows from self.iterRows(column1, column2) must already be sorted by
	headword, because itertools.groupby only groups *consecutive* equal
	keys. Each group's senses are rendered to an HTML <div> with lxml's
	incremental htmlfile writer, then yielded as a new entry.

	NOTE(review): `html` and `log` are not imported here — assumed to be
	module-level imports of this plugin file.
	"""
	from itertools import groupby
	from lxml import etree as ET
	from io import BytesIO
	from pyglossary.html_utils import unescape_unicode
	glos = self._glos
	for headword, groupsOrig in groupby(
		self.iterRows(column1, column2),
		key=lambda row: row[0],
	):
		# headwords are stored HTML-escaped in the database
		headword = html.unescape(headword)
		# drop the headword column; keep (translation, entry_type) pairs.
		# groupsOrig is a lazy group — it must be materialized before the
		# next iteration of the outer groupby loop invalidates it.
		groups = [(term2, entry_type) for _, term2, entry_type in groupsOrig]
		f = BytesIO()
		with ET.htmlfile(f) as hf:
			with hf.element("div"):
				with glos.titleElement(hf, headword):
					try:
						hf.write(headword)
					except Exception as e:
						# some headwords contain characters the writer
						# rejects; fall back to their repr so the entry
						# is still produced
						log.error(f"error in writing {headword!r}, {e}")
						hf.write(repr(headword))
				if len(groups) == 1:
					hf.write(ET.Element("br"))
				self.makeList(
					hf,
					groups,
					self.writeSense,
				)
		# lxml serializes non-ASCII as numeric character references;
		# unescape them back to literal unicode for the definition text
		defi = unescape_unicode(f.getvalue().decode("utf-8"))
		yield self._glos.newEntry(headword, defi, defiFormat="h")
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
	"""Convert one <entry> XML element into a glossary entry.

	Collects headwords from form/orth children, renders an HTML <div>
	(optional keyword header, grammar group, pronunciations, senses)
	with lxml's incremental htmlfile writer, and returns a new entry
	with defiFormat="h".

	Fix: skip <orth> elements with empty/missing text — previously
	form.text could be None, which both polluted the keywords list and
	made hf.write(keyword) raise inside lxml.
	"""
	from lxml import etree as ET
	keywords = []
	f = BytesIO()
	if self._discover:
		# record any tag we do not explicitly handle, for reporting
		for elem in entry.iter():
			if elem.tag not in self.supportedTags:
				self._discoveredTags[elem.tag] = elem

	def br():
		# a fresh element is required per write; lxml consumes it
		return ET.Element("br")

	with ET.htmlfile(f) as hf:
		with hf.element("div"):
			for form in entry.findall("form/orth", self.ns):
				if not form.text:
					# empty <orth/> would yield a None keyword
					continue
				keywords.append(form.text)
			if self._keywords_header:
				for keyword in keywords:
					with hf.element("b"):
						hf.write(keyword)
					hf.write(br())
			# TODO: "form/usg"
			# <usg type="geo">Brit</usg>
			# <usg type="geo">US</usg>
			# <usg type="hint">...</usg>
			gramGrpList = entry.findall("gramGrp", self.ns)
			if gramGrpList:
				for gramGrp in gramGrpList:
					parts = []
					for child in gramGrp.iterchildren():
						text = self.normalizeGramGrpChild(child)
						if text:
							parts.append(text)
					with hf.element("i"):
						hf.write(", ".join(parts))
					hf.write(br())
			pronList = entry.findall("form/pron", self.ns)
			if pronList:
				for i, pron in enumerate(pronList):
					if i > 0:
						hf.write(", ")
					with hf.element("font", color="green"):
						hf.write(f"/{pron.text}/")
				hf.write(br())
				hf.write("\n")
			self.make_list(
				hf,
				entry.findall("sense", self.ns),
				self.process_sense,
			)
	defi = f.getvalue().decode("utf-8")
	# turn lxml's numeric character references back into literal unicode
	defi = unescape_unicode(defi)
	return self._glos.newEntry(keywords, defi, defiFormat="h")
def setGlosInfo(self, key: str, value: str) -> None:
	"""Store a metadata key on the glossary, normalizing the value.

	The value is passed through unescape_unicode first, so numeric
	character references become literal unicode before being saved.
	"""
	cleanValue = unescape_unicode(value)
	self._glos.setInfo(key, cleanValue)
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
	"""Convert one TEI <entry> element into a glossary entry.

	Keywords come from normal form/orth children plus inflected forms
	(form[@type="infl"]/orth). Grammar groups, pronunciations, and
	senses are rendered to an HTML <div> via lxml's incremental
	htmlfile writer. Returns a new entry with byte progress taken from
	the underlying file position.

	Fix: the normal-form loop now skips <orth> elements with empty
	text, matching the guard the inflected-form loop already had;
	previously a None keyword reached hf.write(keyword) and raised.
	"""
	from lxml import etree as ET
	glos = self._glos
	keywords = []
	f = BytesIO()
	if self._discover:
		# record any tag we do not explicitly handle, for reporting
		for elem in entry.iter():
			if elem.tag not in self.supportedTags:
				self._discoveredTags[elem.tag] = elem

	def br():
		# a fresh element is required per write; lxml consumes it
		return ET.Element("br")

	for form in entry.findall("form/orth", self.ns):
		if form.getparent().get("type"):
			# only use normal form, not inflected one, here
			continue
		if not form.text:
			# empty <orth/> would yield a None keyword (same guard as
			# the inflected-forms loop below)
			continue
		keywords.append(form.text)
	# Add keywords for inflected forms
	for orth in entry.findall('.//form[@type="infl"]/orth', self.ns):
		if not orth.text:
			continue
		keywords.append(orth.text)

	gramList = []  # type: List[str]
	for gramGrp in entry.findall("gramGrp", self.ns):
		parts = []
		for child in gramGrp.iterchildren():
			text = self.normalizeGramGrpChild(child)
			if text:
				parts.append(text)
		if parts:
			gramList.append(", ".join(parts))

	pronList = entry.findall("form/pron", self.ns)
	senseList = entry.findall("sense", self.ns)

	with ET.htmlfile(f) as hf:
		with hf.element("div"):
			if self._keywords_header:
				for keyword in keywords:
					with glos.titleElement(hf, keyword):
						hf.write(keyword)
					hf.write(br())
			# TODO: "form/usg"
			# <usg type="geo">Brit</usg>
			# <usg type="geo">US</usg>
			# <usg type="hint">...</usg>
			for text in gramList:
				with hf.element("i"):
					hf.write(text)
				hf.write(br())
			if pronList:
				for i, pron in enumerate(pronList):
					if i > 0:
						hf.write(", ")
					with hf.element("font", color="green"):
						hf.write(f"/{pron.text}/")
				hf.write(br())
				hf.write("\n")
			self.makeList(
				hf,
				senseList,
				self.writeSense,
			)
	# turn lxml's numeric character references back into literal unicode
	defi = unescape_unicode(f.getvalue().decode("utf-8"))
	return self._glos.newEntry(
		keywords,
		defi,
		defiFormat="h",
		byteProgress=(self._file.tell(), self._fileSize),
	)
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
	"""Convert one JMdict <entry> element into a glossary entry.

	Keywords are the kanji spellings (k_ele/keb), the readings
	(r_ele/reb), and every "keb・reb" combination (for internal links).
	The definition is an HTML <div> listing kanji forms, readings with
	their properties, and the senses, rendered with lxml's incremental
	htmlfile writer.

	NOTE(review): `unescape_unicode` and `BytesIO` are not imported
	here — assumed to be module-level imports of this plugin file.
	"""
	from lxml import etree as ET
	glos = self._glos
	keywords = []
	f = BytesIO()

	def br():
		# a fresh element is required per write; lxml consumes it
		return ET.Element("br")

	with ET.htmlfile(f) as hf:
		kebList = []  # type: List[str]
		# (reading, list of property labels) pairs
		rebList = []  # type: List[Tuple[str, List[str]]]
		with hf.element("div"):
			for k_ele in entry.findall("k_ele"):
				keb = k_ele.find("keb")
				if keb is None:
					continue
				kebList.append(keb.text)
				keywords.append(keb.text)
				# for elem in k_ele.findall("ke_pri"):
				# 	log.info(elem.text)
			for r_ele in entry.findall("r_ele"):
				reb = r_ele.find("reb")
				if reb is None:
					continue
				props = []
				if r_ele.find("re_nokanji") is not None:
					props.append("no kanji")
				inf = r_ele.find("re_inf")
				if inf is not None:
					# map JMdict abbreviation to a readable label
					props.append(
						self.re_inf_mapping.get(inf.text, inf.text)
					)
				rebList.append((reb.text, props))
				keywords.append(reb.text)
				# for elem in r_ele.findall("re_pri"):
				# 	log.info(elem.text)
			# this is for making internal links valid
			# this makes too many alternates!
			# but we don't seem to have a choice
			# except for scanning and indexing all words once
			# and then starting over and fixing/optimizing links
			for keb in kebList:
				for reb, _ in rebList:
					keywords.append(f"{keb}・{reb}")
			if kebList:
				with glos.titleElement(hf, kebList[0]):
					for i, keb in enumerate(kebList):
						if i > 0:
							with hf.element("font", color="red"):
								hf.write(" | ")
						hf.write(keb)
				hf.write(br())
			if rebList:
				for i, (reb, props) in enumerate(rebList):
					if i > 0:
						with hf.element("font", color="red"):
							hf.write(" | ")
					with hf.element("font", color="green"):
						hf.write(reb)
					for prop in props:
						hf.write(" ")
						with hf.element("small"):
							with hf.element("span", style=self.tagStyle):
								hf.write(prop)
				hf.write(br())
			self.makeList(
				hf,
				entry.findall("sense"),
				self.writeSense,
			)
	defi = f.getvalue().decode("utf-8")
	# turn lxml's numeric character references back into literal unicode
	defi = unescape_unicode(defi)
	byteProgress = (self._file.tell(), self._fileSize)
	return self._glos.newEntry(keywords, defi, defiFormat="h", byteProgress=byteProgress)
def case(self, text, expected):
	"""Check that unescaping *text* produces exactly *expected*."""
	self.assertEqual(unescape_unicode(text), expected)