Esempio n. 1
0
    def writeGramGroups(
        self,
        hf: "lxml.etree.htmlfile",
        gramGrpList: "List[lxml.etree.htmlfile]",
    ):
        """
        Write every non-empty grammar group as one colored line.

        For each item of ``gramGrpList`` its children are passed through
        ``self.normalizeGramGrpChild`` and falsy results are dropped.
        The remaining parts are joined with a separator, written inside a
        ``<font>`` element colored with ``self._gram_color`` and followed
        by a ``<br>`` element. Items that yield no parts produce no output.

        The separator is ``", "`` unless ``self._auto_rtl`` is set and a
        writing system is detected for the first part, in which case that
        writing system's ``comma`` plus a space is used.

        NOTE(review): items of ``gramGrpList`` only need ``iterchildren()``
        here, so they look like elements rather than ``htmlfile`` objects;
        confirm the annotation against the callers.
        """
        from lxml import etree as ET

        autoRtl = self._auto_rtl
        fontColor = self._gram_color
        for gramGrp in gramGrpList:
            # keep only the children that normalize to something truthy
            parts = list(filter(
                None,
                map(self.normalizeGramGrpChild, gramGrp.iterchildren()),
            ))
            if not parts:
                continue

            # the first part decides which comma is used for the whole group
            script = getWritingSystemFromText(parts[0]) if autoRtl else None
            separator = script.comma + " " if script else ", "

            line = separator.join(parts)
            with hf.element("font", color=fontColor):
                hf.write(line)

            hf.write(ET.Element("br"))
Esempio n. 2
0
    def __iter__(self):
        """
        Yield one glossary entry per row of ``WordsTable``, ordered by id.

        The ``Keys`` table is read first to collect, for every ``wordkey``,
        the list of its ``searchwordkey`` values; these are appended to the
        headwords of the entry whose ``word`` equals that key.

        The definition is the row's ``meaning`` with every ``|`` replaced
        by ``<br>``. If the row has a root, a ``bword://`` link to it is
        appended. If the writing system detected for ``meaning`` has
        direction ``"rtl"``, the whole definition is wrapped in
        ``<div dir="rtl">``. Entries are created through
        ``self._glos.newEntry`` with ``defiFormat="h"``.
        """
        from pyglossary.langs.writing_system import getWritingSystemFromText

        # wordkey -> searchwordkey values, in the order the query returns them
        alternateDict = {}
        self._cur.execute("select wordkey, searchwordkey from Keys")
        for wordKey, searchWordKey in self._cur.fetchall():
            alternateDict.setdefault(wordKey, []).append(searchWordKey)

        self._cur.execute(
            "select word, searchword, root, meaning from WordsTable"
            " order by id")
        # FIXME: iterating over self._cur directly (`for row in self._cur`)
        # stops after one entry and self._cur.fetchone() returns None,
        # so all rows are fetched up front instead
        for word, searchword, root, meaning in self._cur.fetchall():
            definition = meaning.replace("|", "<br>")

            if root:
                # root is inserted into HTML twice (attribute value and link
                # text), so escape it once and use the escaped form in both
                # places; raw link text would break the markup for roots
                # containing "<" or "&"
                rootEscaped = html.escape(root)
                definition += (
                    f'<br>Root: <a href="bword://{rootEscaped}">'
                    f'{rootEscaped}</a>'
                )

            # direction is detected from the original meaning text,
            # not from the assembled HTML
            ws = getWritingSystemFromText(meaning)
            if ws and ws.direction == "rtl":
                definition = f'<div dir="rtl">{definition}</div>'

            words = [word, searchword] + alternateDict.get(word, [])
            yield self._glos.newEntry(
                words,
                definition,
                defiFormat="h",
            )
Esempio n. 3
0
    def write(self) -> "Generator[None, BaseEntry, None]":
        """
        Collect statistics over the received entries and write them as JSON.

        This is a generator-based writer: entries are sent in through
        ``yield`` and a ``None`` marks the end of input. Afterwards the
        glossary info from ``self._glos.iterInfo()`` is extended with the
        keys below and written to ``self._file`` as pretty-printed JSON
        followed by a newline.

        Added keys:
            word_count: number of entries received.
            bword_count: total occurrences of ``bword://`` in definitions.
            data_entry_count: number of entries whose format is ``"b"``.
            defi_format: ``format=count`` pairs, sorted by format
                (``"b"`` excluded).
            defi_tag: count of every tag-like match in ``"h"`` definitions.
            defi_first_tag: count of the first tag-like match in each
                ``"h"`` definition.
            style: count, per tag name, of tags carrying a ``style=``
                attribute.
            source_script: count per detected writing-system name of the
                entry's ``s_word`` (``"None"`` when nothing is detected).
        """
        import re
        from collections import Counter, OrderedDict
        from pyglossary.json_utils import dataToPrettyJson
        from pyglossary.langs.writing_system import getWritingSystemFromText

        def formatCounts(pairs):
            # render (key, count) pairs as "key=count, key=count, ..."
            return ", ".join(f"{key}={count}" for key, count in pairs)

        glos = self._glos

        re_possible_html = re.compile(
            r"<[a-z1-6]+[ />]",
            re.I,
        )
        re_style = re.compile(
            r"<([a-z1-6]+)[^<>]* style=",
            re.I | re.DOTALL,
        )

        wordCount = 0
        bwordCount = 0

        styleByTagCounter = Counter()

        defiFormatCounter = Counter()
        firstTagCounter = Counter()
        allTagsCounter = Counter()
        sourceScriptCounter = Counter()

        while True:
            entry = yield
            if entry is None:
                break
            defi = entry.defi

            wordCount += 1
            bwordCount += defi.count("bword://")

            for m in re_style.finditer(defi):
                styleByTagCounter[m.group(1)] += 1

            entry.detectDefiFormat()
            defiFormat = entry.defiFormat
            defiFormatCounter[defiFormat] += 1
            if defiFormat == "m":
                # only a tag at the very start of the text is reported
                if re_possible_html.match(defi):
                    # Logger.warn is a deprecated alias of Logger.warning
                    log.warning(f"undetected html defi: {defi}")
            elif defiFormat == "h":
                match = re_possible_html.search(defi)
                if match is not None:
                    firstTag = match.group().strip("< />").lower()
                    firstTagCounter[firstTag] += 1
                    # the pattern has no groups, so findall returns the
                    # full matched text, e.g. "<div " or "<br/"
                    for rawTag in re_possible_html.findall(defi):
                        allTagsCounter[rawTag.strip("< />").lower()] += 1

            ws = getWritingSystemFromText(entry.s_word)
            if ws:
                wsName = ws.name
            else:
                log.debug(f"No script detected for word: {entry.s_word}")
                wsName = "None"
            sourceScriptCounter[wsName] += 1

        # entries with format "b" are reported separately and kept out of
        # the defi_format summary; Counter returns 0 and tolerates `del`
        # when the key was never counted
        data_entry_count = defiFormatCounter["b"]
        del defiFormatCounter["b"]

        info = OrderedDict()
        for key, value in glos.iterInfo():
            info[key] = value
        info["word_count"] = wordCount
        info["bword_count"] = bwordCount
        info["data_entry_count"] = data_entry_count
        info["defi_format"] = formatCounts(sorted(defiFormatCounter.items()))
        info["defi_tag"] = formatCounts(allTagsCounter.most_common())
        info["defi_first_tag"] = formatCounts(firstTagCounter.most_common())
        info["style"] = formatCounts(styleByTagCounter.most_common())
        info["source_script"] = formatCounts(
            sourceScriptCounter.most_common(),
        )
        self._file.write(dataToPrettyJson(info) + "\n")
Esempio n. 4
0
 def getTitleTag(self, sample: str) -> str:
     """
     Return the title tag to use for ``sample``.

     This is the ``titleTag`` of the writing system detected for
     ``sample``, or ``"b"`` when no writing system is detected.
     """
     script = getWritingSystemFromText(sample)
     return script.titleTag if script else "b"
Esempio n. 5
0
	def getCommaSep(self, sample: str):
		"""
		Return the list separator to use for ``sample``.

		The default is ``", "``. When ``self._auto_rtl`` is set and a
		writing system is detected for ``sample``, that writing system's
		``comma`` followed by a space is returned instead.
		"""
		default = ", "
		if not self._auto_rtl:
			return default
		script = getWritingSystemFromText(sample)
		if not script:
			return default
		return script.comma + " "