def _iterOneDirection(self, column1, column2):
    """Group rows of one lookup direction by headword and yield one
    HTML-format glossary entry per headword.
    """
    from io import BytesIO
    from itertools import groupby

    from lxml import etree as ET

    from pyglossary.html_utils import unescape_unicode

    glos = self._glos
    byHeadword = groupby(
        self.iterRows(column1, column2),
        key=lambda row: row[0],
    )
    for word, rowGroup in byHeadword:
        word = html.unescape(word)
        # keep only (term, entry_type); first column is the headword itself
        senses = [(term2, entryType) for _, term2, entryType in rowGroup]
        buf = BytesIO()
        with ET.htmlfile(buf) as hf:
            with hf.element("div"):
                with glos.titleElement(hf, word):
                    try:
                        hf.write(word)
                    except Exception as e:
                        log.error(f"error in writing {word!r}, {e}")
                        hf.write(repr(word))
                if len(senses) == 1:
                    hf.write(ET.Element("br"))
                self.makeList(
                    hf,
                    senses,
                    self.writeSense,
                )
        defi = unescape_unicode(buf.getvalue().decode("utf-8"))
        yield glos.newEntry(word, defi, defiFormat="h")
def render_article(trad, simp, pinyin, eng):
    """Render one record (traditional form, simplified form, pinyin,
    English definitions) as a bordered HTML article and return the
    markup as a string.
    """
    from io import BytesIO

    from lxml import etree as ET

    # convert() maps each raw syllable to a (display form, tone) pair
    converted = [convert(syllable) for syllable in pinyin.split()]
    pinyin_list = [nice for nice, _ in converted]
    tones = [tone for _, tone in converted]

    buf = BytesIO()
    with ET.htmlfile(buf, encoding="utf-8") as hf:
        with hf.element("div", style="border: 1px solid; padding: 5px"):
            with hf.element("div"):
                with hf.element("big"):
                    colorize(hf, simp, tones)
                    if trad != simp:
                        # "\xa0" is a non-breaking space (&nbsp;)
                        hf.write("\xa0/\xa0")
                        colorize(hf, trad, tones)
                hf.write(ET.Element("br"))
                with hf.element("big"):
                    colorize(hf, pinyin_list, tones)
            with hf.element("div"):
                with hf.element("ul"):
                    for defn in eng:
                        with hf.element("li"):
                            hf.write(defn)
    return buf.getvalue().decode("utf-8")
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
    """Convert one <entry> XML element into a glossary entry.

    Headwords are collected from form/orth children; grammar groups,
    pronunciations and senses are rendered incrementally into HTML,
    and the result is returned with defiFormat="h".
    """
    from lxml import etree as ET
    keywords = []
    f = BytesIO()
    if self._discover:
        # discovery mode: remember one sample element per tag we do
        # not explicitly support, so they can be reported later
        for elem in entry.iter():
            if elem.tag not in self.supportedTags:
                self._discoveredTags[elem.tag] = elem

    def br():
        # a fresh <br> element for every write; presumably element
        # objects are not meant to be written twice — TODO confirm
        return ET.Element("br")

    with ET.htmlfile(f) as hf:
        with hf.element("div"):
            for form in entry.findall("form/orth", self.ns):
                keywords.append(form.text)
            if self._keywords_header:
                # repeat each headword in bold at the top of the article
                for keyword in keywords:
                    with hf.element("b"):
                        hf.write(keyword)
                    hf.write(br())
            # TODO: "form/usg"
            # <usg type="geo">Brit</usg>
            # <usg type="geo">US</usg>
            # <usg type="hint">...</usg>
            gramGrpList = entry.findall("gramGrp", self.ns)
            if gramGrpList:
                for gramGrp in gramGrpList:
                    # join the normalized children of each grammar
                    # group into one italic, comma-separated line
                    parts = []
                    for child in gramGrp.iterchildren():
                        text = self.normalizeGramGrpChild(child)
                        if text:
                            parts.append(text)
                    with hf.element("i"):
                        hf.write(", ".join(parts))
                    hf.write(br())
            pronList = entry.findall("form/pron", self.ns)
            if pronList:
                # pronunciations as a comma-separated green list: /.../
                for i, pron in enumerate(pronList):
                    if i > 0:
                        hf.write(", ")
                    with hf.element("font", color="green"):
                        hf.write(f"/{pron.text}/")
                hf.write(br())
                hf.write("\n")
            self.make_list(
                hf,
                entry.findall("sense", self.ns),
                self.process_sense,
            )
    defi = f.getvalue().decode("utf-8")
    defi = unescape_unicode(defi)
    return self._glos.newEntry(keywords, defi, defiFormat="h")
def _iterOneDirection(self, column1, column2):
    """Group rows of one lookup direction by headword and yield one
    HTML-format glossary entry per headword; a gender tag parsed out
    of the headword is rendered in italics first.
    """
    from io import BytesIO
    from itertools import groupby

    from lxml import etree as ET

    glos = self._glos
    byHeadword = groupby(
        self.iterRows(column1, column2),
        key=lambda row: row[0],
    )
    for word, rowGroup in byHeadword:
        word = html.unescape(word)
        # keep only (term, entry_type); first column is the headword itself
        senses = [(term2, entryType) for _, term2, entryType in rowGroup]
        buf = BytesIO()
        gender, word = self.parseGender(word)
        with ET.htmlfile(buf, encoding="utf-8") as hf:
            with hf.element("div"):
                if gender:
                    with hf.element("i"):
                        hf.write(gender)
                    hf.write(ET.Element("br"))
                self.makeList(
                    hf,
                    senses,
                    self.writeSense,
                )
        defi = buf.getvalue().decode("utf-8")
        yield glos.newEntry(word, defi, defiFormat="h")
def transform(self, article: "lxml.etree.Element") -> str:
    """Serialize *article* into an HTML string.

    The children of *article* are written (via self.writeChildrenOf)
    inside a <div class="article"> wrapper and the buffer is decoded
    as UTF-8.

    Fix: the original read ``self._encoding`` into a local that was
    never used — the writer and the decode are both hard-coded to
    UTF-8 — so the dead assignment is removed.
    """
    from lxml import etree as ET
    f = BytesIO()
    with ET.htmlfile(f, encoding="utf-8") as hf:
        # "class" is a Python keyword, so it must be passed via **kwargs
        with hf.element("div", **{"class": "article"}):
            self.writeChildrenOf(hf, article)
    text = f.getvalue().decode("utf-8")
    # lxml emits HTML-style "<br>"; normalize to "<br/>" for consumers
    # that expect XML-well-formed output
    text = text.replace("<br>", "<br/>")  # for compatibility
    return text
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
    """Convert one <entry> XML element into a glossary entry.

    NOTE(review): the pronunciation line is written as a raw markup
    string through hf.write, which presumably gets escaped by the
    incremental writer; the final html.unescape call appears to undo
    that escaping. This also unescapes entities that originated in
    the source text — confirm this round-trip is intended.
    """
    from lxml import etree as ET
    keywords = []
    f = BytesIO()
    if self._discover:
        # discovery mode: remember one sample element per tag we do
        # not explicitly support, so they can be reported later
        for elem in entry.iter():
            if elem.tag not in self.supportedTags:
                self._discoveredTags[elem.tag] = elem
    with ET.htmlfile(f) as hf:
        with hf.element("div"):
            for form in entry.findall("form/orth", self.ns):
                keywords.append(form.text)
                # TODO: if there is only one keyword, we should skip this
                with hf.element("b"):
                    hf.write(form.text)
                hf.write(ET.Element("br"))
            # TODO: "form/usg"
            # <usg type="geo">Brit</usg>
            # <usg type="geo">US</usg>
            # <usg type="hint">...</usg>
            gramGrpList = entry.findall("gramGrp", self.ns)
            if gramGrpList:
                for gramGrp in gramGrpList:
                    # join the normalized children of each grammar
                    # group into one italic, comma-separated line
                    parts = []
                    for child in gramGrp.iterchildren():
                        text = self.normalizeGramGrpChild(child)
                        if text:
                            parts.append(text)
                    with hf.element("i"):
                        hf.write(", ".join(parts))
                    hf.write(ET.Element("br"))
            pronList = entry.findall("form/pron", self.ns)
            if pronList:
                # raw markup string — see the NOTE in the docstring
                hf.write(", ".join(f'<font color="green">/{p.text}/</font>' for p in pronList))
                hf.write(ET.Element("br"))
                hf.write("\n")
            self.make_list(
                hf,
                entry.findall("sense", self.ns),
                self.process_sense,
            )
    defi = f.getvalue().decode("utf-8")
    defi = html.unescape(defi)
    # NOTE(review): unlike sibling implementations, no defiFormat="h"
    # is passed here — confirm whether that is intentional
    return self._glos.newEntry(keywords, defi)
def _createEntry(self, yamlBlock: str):
    """Parse one YAML block and return a ``(word, defi)`` pair where
    *defi* is the entry rendered as HTML.

    Returns None (implicitly) when the parsed block has no "word" key.

    NOTE(review): yaml.load with the full Loader executes arbitrary
    object construction — acceptable for trusted files only.
    """
    from lxml import etree as ET
    from yaml import load
    try:
        # prefer the C-accelerated loader when libyaml is available
        from yaml import CLoader as Loader
    except ImportError:
        from yaml import Loader

    record = load(yamlBlock, Loader=Loader)
    word = record.get("word")
    if not word:
        log.error(f"no word in {record}")
        return

    buf = BytesIO()
    with ET.htmlfile(buf) as hf:
        with hf.element("div"):
            self._processEntry(hf, record)
    return word, buf.getvalue().decode("utf-8")
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
    """Convert one <entry> XML element into a glossary entry.

    NOTE(review): the pronunciation line is written as a raw markup
    string through hf.write, which presumably gets escaped by the
    incremental writer; the final html.unescape call appears to undo
    that escaping — confirm this round-trip is intended.
    """
    from lxml import etree as ET
    keywords = []
    f = BytesIO()
    with ET.htmlfile(f) as hf:
        with hf.element("div"):
            for form in entry.findall("form/orth", self.ns):
                keywords.append(form.text)
                # TODO: if there is only one keyword, we should skip this
                with hf.element("b"):
                    hf.write(form.text)
                hf.write(ET.Element("br"))
            # TODO: "gramGrp/gen" is gender: m|masc|f|fem|n|neut|m;f|adj
            posList = entry.findall("gramGrp/pos", self.ns)
            if posList:
                # part-of-speech tags in italics, space-separated
                for pos in posList:
                    with hf.element("i"):
                        hf.write(pos.text)
                    hf.write(" ")
                hf.write(ET.Element("br"))
            pronList = entry.findall("form/pron", self.ns)
            if pronList:
                # raw markup string — see the NOTE in the docstring
                hf.write(", ".join(
                    f'<font color="green">/{p.text}/</font>' for p in pronList
                ))
                hf.write(ET.Element("br"))
                hf.write("\n")
            self.make_list(
                hf,
                entry.findall("sense", self.ns),
                self.process_sense,
            )
    defi = f.getvalue().decode("utf-8")
    defi = html.unescape(defi)
    # NOTE(review): unlike sibling implementations, no defiFormat="h"
    # is passed here — confirm whether that is intentional
    return self._glos.newEntry(keywords, defi)
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
    """Build a glossary entry from an <entry> XML element: bold
    headwords, italic part-of-speech tags, then the senses.
    """
    from lxml import etree as ET

    keywords = []
    buf = BytesIO()
    with ET.htmlfile(buf) as hf:
        with hf.element("div"):
            for orth in entry.findall("form/orth", self.ns):
                keywords.append(orth.text)
                with hf.element("b"):
                    hf.write(orth.text)
                hf.write(" ")
            for pos in entry.findall("gramGrp/pos", self.ns):
                with hf.element("i"):
                    hf.write(pos.text)
            hf.write(ET.Element("br"))
            hf.write("\n")
            self.make_list(
                hf,
                entry.findall("sense", self.ns),
                self.process_sense,
            )
    return self._glos.newEntry(keywords, buf.getvalue().decode("utf-8"))
def docfile(self, *args, **kwargs):
    """Open an incremental HTML writer via lxml's ``etree.htmlfile``,
    forwarding all arguments unchanged.
    """
    writer = etree.htmlfile(*args, **kwargs)
    return writer
def docfile(self, *args, **kwargs):
    """Open an incremental HTML writer via lxml's ``etree.htmlfile``,
    logging the call arguments first.
    """
    logger.debug("Starting file with %r %r", args, kwargs)
    writer = etree.htmlfile(*args, **kwargs)
    return writer
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
    """Convert one <entry> XML element into a glossary entry.

    Keywords come from form/orth children, with inflected forms
    (form type="infl") appended after the normal forms. Pronunciations,
    grammar groups and senses are rendered into HTML and the entry is
    returned with defiFormat="h" plus byte-progress information.
    """
    from lxml import etree as ET
    glos = self._glos
    keywords = []
    f = BytesIO()
    pron_color = self._pron_color
    if self._discover:
        # discovery mode: remember one sample element per tag we do
        # not explicitly support, so they can be reported later
        for elem in entry.iter():
            if elem.tag not in self.supportedTags:
                self._discoveredTags[elem.tag] = elem

    def br():
        # a fresh <br> element for every write
        return ET.Element("br")

    # normal orth forms first, inflected ones appended afterwards so
    # the main headword stays first in the keyword list
    inflectedKeywords = []
    for form in entry.findall("form", self.ns):
        inflected = form.get("type") == "infl"
        for orth in form.findall("orth", self.ns):
            if not orth.text:
                continue
            if inflected:
                inflectedKeywords.append(orth.text)
            else:
                keywords.append(orth.text)
    keywords += inflectedKeywords
    # strip surrounding slashes; they are re-added around the colored text
    pronList = [
        pron.text.strip('/')
        for pron in entry.findall("form/pron", self.ns)
        if pron.text
    ]
    senseList = entry.findall("sense", self.ns)
    with ET.htmlfile(f, encoding="utf-8") as hf:
        with hf.element("div"):
            if self._word_title:
                # repeat each keyword as a title line
                for keyword in keywords:
                    with glos.titleElement(hf, keyword):
                        hf.write(keyword)
                    hf.write(br())
            # TODO: "form/usg"
            # <usg type="geo">Brit</usg>
            # <usg type="geo">US</usg>
            # <usg type="hint">...</usg>
            if pronList:
                # pronunciations as a comma-separated colored list: /.../
                for i, pron in enumerate(pronList):
                    if i > 0:
                        hf.write(", ")
                    hf.write("/")
                    with hf.element("font", color=pron_color):
                        hf.write(f"{pron}")
                    hf.write("/")
                hf.write(br())
                hf.write("\n")
            self.writeGramGroups(hf, entry.findall("gramGrp", self.ns))
            self.writeSenseList(hf, senseList)
    defi = f.getvalue().decode("utf-8")
    # defi = defi.replace("\xa0", " ")  # do we need to do this?
    return self._glos.newEntry(
        keywords,
        defi,
        defiFormat="h",
        byteProgress=(self._file.tell(), self._fileSize),
    )
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
    """Convert one <entry> XML element into a glossary entry.

    Keywords come from non-inflected form/orth children, followed by
    orth children of form elements marked type="infl". Grammar groups,
    pronunciations and senses are rendered into HTML and the entry is
    returned with defiFormat="h" plus byte-progress information.
    """
    from lxml import etree as ET
    glos = self._glos
    keywords = []
    f = BytesIO()
    if self._discover:
        # discovery mode: remember one sample element per tag we do
        # not explicitly support, so they can be reported later
        for elem in entry.iter():
            if elem.tag not in self.supportedTags:
                self._discoveredTags[elem.tag] = elem

    def br():
        # a fresh <br> element for every write
        return ET.Element("br")

    for form in entry.findall("form/orth", self.ns):
        if form.getparent().get("type"):
            # only use normal form, not inflected one, here
            continue
        keywords.append(form.text)
    # Add keywords for inflected forms
    for orth in entry.findall('.//form[@type="infl"]/orth', self.ns):
        if not orth.text:
            continue
        keywords.append(orth.text)
    # one comma-joined string per grammar group, built from the
    # normalized child elements; empty groups are dropped
    gramList = []  # type: List[str]
    for gramGrp in entry.findall("gramGrp", self.ns):
        parts = []
        for child in gramGrp.iterchildren():
            text = self.normalizeGramGrpChild(child)
            if text:
                parts.append(text)
        if parts:
            gramList.append(", ".join(parts))
    pronList = entry.findall("form/pron", self.ns)
    senseList = entry.findall("sense", self.ns)
    with ET.htmlfile(f) as hf:
        with hf.element("div"):
            if self._keywords_header:
                # repeat each keyword as a title line
                for keyword in keywords:
                    with glos.titleElement(hf, keyword):
                        hf.write(keyword)
                    hf.write(br())
            # TODO: "form/usg"
            # <usg type="geo">Brit</usg>
            # <usg type="geo">US</usg>
            # <usg type="hint">...</usg>
            for text in gramList:
                with hf.element("i"):
                    hf.write(text)
                hf.write(br())
            if pronList:
                # pronunciations as a comma-separated green list: /.../
                for i, pron in enumerate(pronList):
                    if i > 0:
                        hf.write(", ")
                    with hf.element("font", color="green"):
                        hf.write(f"/{pron.text}/")
                hf.write(br())
                hf.write("\n")
            self.makeList(
                hf,
                senseList,
                self.writeSense,
            )
    defi = unescape_unicode(f.getvalue().decode("utf-8"))
    return self._glos.newEntry(
        keywords,
        defi,
        defiFormat="h",
        byteProgress=(self._file.tell(), self._fileSize),
    )
def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry":
    """Convert one dictionary <entry> element into a glossary entry.

    Kanji spellings (k_ele/keb) and readings (r_ele/reb) all become
    keywords, plus every "keb・reb" combination so internal links can
    resolve. The rendered HTML shows kanji forms, then readings with
    their property tags, then the senses.
    """
    from lxml import etree as ET
    glos = self._glos
    keywords = []
    f = BytesIO()

    def br():
        # a fresh <br> element for every write
        return ET.Element("br")

    with ET.htmlfile(f) as hf:
        kebList = []  # type: List[str]
        rebList = []  # type: List[str]
        with hf.element("div"):
            # kanji elements: collect each keb text as a keyword
            for k_ele in entry.findall("k_ele"):
                keb = k_ele.find("keb")
                if keb is None:
                    continue
                kebList.append(keb.text)
                keywords.append(keb.text)
                # for elem in k_ele.findall("ke_pri"):
                # 	log.info(elem.text)
            # reading elements: collect (reb text, property tags)
            for r_ele in entry.findall("r_ele"):
                reb = r_ele.find("reb")
                if reb is None:
                    continue
                props = []
                if r_ele.find("re_nokanji") is not None:
                    props.append("no kanji")
                inf = r_ele.find("re_inf")
                if inf is not None:
                    # map abbreviation to readable text; fall back to raw
                    props.append(
                        self.re_inf_mapping.get(inf.text, inf.text)
                    )
                rebList.append((reb.text, props))
                keywords.append(reb.text)
                # for elem in r_ele.findall("re_pri"):
                # 	log.info(elem.text)
            # this is for making internal links valid
            # this makes too many alternates!
            # but we don't seem to have a choice
            # except for scanning and indexing all words once
            # and then starting over and fixing/optimizing links
            for keb in kebList:
                for reb, _ in rebList:
                    keywords.append(f"{keb}・{reb}")
            if kebList:
                # kanji forms, " | "-separated in red, first one titled
                with glos.titleElement(hf, kebList[0]):
                    for i, keb in enumerate(kebList):
                        if i > 0:
                            with hf.element("font", color="red"):
                                hf.write(" | ")
                        hf.write(keb)
                hf.write(br())
            if rebList:
                # readings in green, each followed by its small tags
                for i, (reb, props) in enumerate(rebList):
                    if i > 0:
                        with hf.element("font", color="red"):
                            hf.write(" | ")
                    with hf.element("font", color="green"):
                        hf.write(reb)
                    for prop in props:
                        hf.write(" ")
                        with hf.element("small"):
                            with hf.element("span", style=self.tagStyle):
                                hf.write(prop)
                hf.write(br())
            self.makeList(
                hf,
                entry.findall("sense"),
                self.writeSense,
            )
    defi = f.getvalue().decode("utf-8")
    defi = unescape_unicode(defi)
    byteProgress = (self._file.tell(), self._fileSize)
    return self._glos.newEntry(keywords, defi, defiFormat="h", byteProgress=byteProgress)