def write(
    self,
    filename: str,
    encoding: str = "utf-8",
    havePrevLink: bool = True,
) -> "Generator[None, BaseEntry, None]":
    """Write the glossary into a new directory as a chain of linked entries.

    This is a generator-based writer: the caller sends entries in one at a
    time and sends ``None`` to mark the end of input. Nothing happens (not
    even the existence check or directory creation) until the generator is
    first advanced.

    Every non-data entry is handed to ``self.saveEntry`` together with its
    own hash and the hashes of the previous and next non-data entries. Data
    entries received after the first entry are saved into the ``res``
    sub-directory and are not counted. Once input ends, ``info.json`` is
    written into the directory.

    Args:
        filename: path of the directory to create; it must not exist yet.
        encoding: text encoding used for ``info.json`` (stored on
            ``self._encoding``).
        havePrevLink: stored on ``self._havePrevLink`` and recorded in
            ``info.json``.

    Raises:
        ValueError: if ``filename`` already exists, or if the first value
            sent to the generator is ``None`` (no entries at all).
    """
    from collections import OrderedDict as odict

    from pyglossary.json_utils import dataToPrettyJson

    if exists(filename):
        raise ValueError(f"directory {filename!r} already exists")

    self._filename = filename
    self._encoding = encoding
    self._havePrevLink = havePrevLink
    self._resDir = join(filename, "res")
    os.makedirs(filename)
    os.mkdir(self._resDir)

    # NOTE(review): the first entry is never checked with isData(); this
    # presumably relies on a non-data entry arriving first -- confirm.
    thisEntry = yield
    if thisEntry is None:
        raise ValueError("glossary is empty")

    count = 1
    rootHash = thisHash = self.getEntryHash(thisEntry)
    prevHash = None

    # An entry can only be saved once the hash of its successor is known,
    # so saving always lags one entry behind the input.
    while True:
        nextEntry = yield
        if nextEntry is None:
            break
        if nextEntry.isData():
            nextEntry.save(self._resDir)
            continue
        nextHash = self.getEntryHash(nextEntry)
        self.saveEntry(thisEntry, thisHash, prevHash, nextHash)
        thisEntry = nextEntry
        prevHash, thisHash = thisHash, nextHash
        count += 1

    # The last entry has no successor.
    self.saveEntry(thisEntry, thisHash, prevHash, None)

    reservedKeys = ("name", "root", "havePrevLink", "wordCount")
    info = odict()
    info["name"] = self._glos.getInfo("name")
    info["root"] = self.hashToPath(rootHash)
    info["havePrevLink"] = self._havePrevLink
    info["wordCount"] = count
    for key, value in self._glos.getExtraInfos(reservedKeys).items():
        info[key] = value

    with open(
        join(self._filename, "info.json"),
        "w",
        encoding=self._encoding,
    ) as toFile:
        toFile.write(dataToPrettyJson(info))
def write(self, filename: str) -> Generator[None, "BaseEntry", None]:
    """Collect statistics about the glossary and write them as JSON.

    This is a generator-based writer: the caller sends entries in one at a
    time and sends ``None`` to mark the end of input. For every entry the
    definition format is detected and counted; for "h" definitions the
    first tag-like match and all tag-like matches are counted as well.

    The resulting JSON holds every pair from ``glos.iterInfo()`` followed by
    ``word_count``, ``data_entry_count`` (the count of format "b"),
    ``defi_format_counter``, ``defi_tag_counter`` and
    ``defi_first_tag_counter``.

    Args:
        filename: path of the JSON file to write (UTF-8).
    """
    import re
    from collections import Counter, OrderedDict

    from pyglossary.json_utils import dataToPrettyJson

    glos = self._glos

    re_possible_html = re.compile(r"<[a-zA-Z]+[ />]")

    def formatCounts(pairs):
        # Render (name, count) pairs as "name=count, name=count, ...".
        return ", ".join(f"{name}={count}" for name, count in pairs)

    defiFormatCounter = Counter()
    firstTagCounter = Counter()
    allTagsCounter = Counter()

    wordCount = 0
    while True:
        entry = yield
        if entry is None:
            break
        entry.detectDefiFormat()
        defiFormat = entry.defiFormat
        wordCount += 1
        defiFormatCounter[defiFormat] += 1
        defi = entry.defi
        if defiFormat == "m":
            if re_possible_html.match(defi):
                log.warning(f"undetected html defi: {defi}")
        elif defiFormat == "h":
            # A definition flagged as "h" may contain nothing this pattern
            # matches (e.g. only "<h1>", since digits are not accepted), so
            # there may be no first tag at all; previously that case raised
            # AttributeError on search(...).group().
            tags = [
                rawTag.strip("< />").lower()
                for rawTag in re_possible_html.findall(defi)
            ]
            if tags:
                firstTagCounter[tags[0]] += 1
                allTagsCounter.update(tags)

    # Format "b" is reported on its own instead of among the formats.
    data_entry_count = defiFormatCounter.pop("b", 0)

    info = OrderedDict()
    for key, value in glos.iterInfo():
        info[key] = value
    info["word_count"] = wordCount
    info["data_entry_count"] = data_entry_count
    info["defi_format_counter"] = formatCounts(
        sorted(defiFormatCounter.items()),
    )
    info["defi_tag_counter"] = formatCounts(allTagsCounter.most_common())
    info["defi_first_tag_counter"] = formatCounts(
        firstTagCounter.most_common(),
    )

    with open(filename, mode="w", encoding="utf-8") as _file:
        _file.write(dataToPrettyJson(info))
def write(self) -> "Generator[None, BaseEntry, None]":
    """Consume entries sent by the caller and store them as a linked chain.

    The caller sends entries one at a time and sends ``None`` when done.
    Non-data entries go to ``self.saveEntry`` with their own hash plus the
    hashes of their neighbours; data entries that arrive after the first
    entry are saved under ``self._resDir`` and are not counted. When input
    ends, ``info.json`` is written inside ``self._filename``.

    Raises:
        ValueError: if the very first value sent is ``None``.
    """
    from collections import OrderedDict as odict

    from pyglossary.json_utils import dataToPrettyJson

    filename = self._filename

    current = yield
    if current is None:
        raise ValueError("glossary is empty")

    count = 1
    currentHash = self.getEntryHash(current)
    rootHash = currentHash
    prevHash = None

    # Saving lags one entry behind the input, because each saved entry
    # needs to know the hash of the entry that follows it.
    while True:
        incoming = yield
        if incoming is None:
            break
        if not incoming.isData():
            incomingHash = self.getEntryHash(incoming)
            self.saveEntry(current, currentHash, prevHash, incomingHash)
            current = incoming
            prevHash = currentHash
            currentHash = incomingHash
            count += 1
        else:
            incoming.save(self._resDir)

    # The final entry has no successor.
    self.saveEntry(current, currentHash, prevHash, None)

    reservedKeys = ("name", "root", "havePrevLink", "wordCount")
    infoPath = join(filename, "info.json")
    with open(infoPath, "w", encoding=self._encoding) as infoFile:
        info = odict([
            ("name", self._glos.getInfo("name")),
            ("root", self.hashToPath(rootHash)),
            ("havePrevLink", self._havePrevLink),
            ("wordCount", count),
        ])
        info.update(self._glos.getExtraInfos(reservedKeys))
        infoFile.write(dataToPrettyJson(info))
def saveConfig(self):
    """Write the valid part of ``self.config`` to ``confJsonFile`` as JSON.

    Keys are taken in the order of ``self.configDefDict``. A key missing
    from ``self.config`` is skipped with a warning; a value rejected by its
    option's ``validate`` is skipped with an error. The JSON text is built
    before the file is opened.
    """
    from pyglossary.json_utils import dataToPrettyJson

    accepted = OrderedDict()
    for key, optionDef in self.configDefDict.items():
        if key in self.config:
            value = self.config[key]
            if optionDef.validate(value):
                accepted[key] = value
            else:
                log.error(f"saveConfig: invalid {key}={value!r}")
        else:
            log.warning(f"saveConfig: missing key {key!r}")

    serialized = dataToPrettyJson(accepted)
    with open(confJsonFile, mode="wt", encoding="utf-8") as outFile:
        outFile.write(serialized)
    log.info(f"saved {confJsonFile!r}")
def write(self) -> "Generator[None, BaseEntry, None]":
    """Write every non-data entry to its own file under ``self._filename``.

    This is a generator-based writer: the caller sends entries in one at a
    time and sends ``None`` to mark the end of input. Data entries are
    skipped. Each other entry is written to the path returned by
    ``self.filePathFromWord(entry.b_word)`` (plus ``.<compression>`` when
    compression is enabled); the file holds the escaped word on the first
    line followed by the definition. Finally ``info.json`` is written with
    the glossary name, the number of entries written and the extra infos.

    Raises:
        ValueError: if ``compressionOpenFunc`` returns a false value for
            ``self._compression``.
    """
    from collections import OrderedDict as odict

    from pyglossary.json_utils import dataToPrettyJson

    filename = self._filename
    wordCount = 0
    compression = self._compression
    c_open = compressionOpenFunc(compression)
    if not c_open:
        # The message used to reference an undefined name ``c``, which
        # raised NameError instead of this ValueError.
        raise ValueError(f"invalid compression {compression!r}")

    while True:
        entry = yield
        if entry is None:
            break
        if entry.isData():
            continue
        fpath = join(filename, self.filePathFromWord(entry.b_word))
        if compression:
            fpath = f"{fpath}.{compression}"
        makedirs(dirname(fpath), exist_ok=True)
        if isfile(fpath):
            log.warning(f"file exists: {fpath}")
            # Disambiguate with a short hash of the definition. Note that
            # the suffix lands after any compression extension.
            fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}"
        with c_open(fpath, "wt", encoding="utf-8") as _file:
            _file.write(
                f"{escapeNTB(entry.s_word)}\n{entry.defi}"
            )
        wordCount += 1

    with open(
        join(filename, "info.json"),
        mode="w",
        encoding="utf-8",
    ) as infoFile:
        info = odict()
        info["name"] = self._glos.getInfo("name")
        info["wordCount"] = wordCount
        for key, value in self._glos.getExtraInfos((
            "name",
            "wordCount",
        )).items():
            info[key] = value
        infoFile.write(dataToPrettyJson(info))
def write(self):
    """Store all non-data entries as a linked chain and write ``info.json``.

    Entries come from ``self._iterNonDataEntries()``. Each one is passed to
    ``self.saveEntry`` with its own hash and the hashes of the previous and
    next entries (``None`` at either end of the chain).

    Raises:
        ValueError: if the iterator yields no entries.
    """
    from collections import OrderedDict as odict

    from pyglossary.json_utils import dataToPrettyJson

    entries = iter(self._iterNonDataEntries())
    try:
        current = next(entries)
    except StopIteration:
        raise ValueError("glossary is empty")

    count = 1
    currentHash = self.getEntryHash(current)
    rootHash = currentHash
    prevHash = None

    # Each entry is saved only once the hash of its successor is known.
    for upcoming in entries:
        upcomingHash = self.getEntryHash(upcoming)
        self.saveEntry(current, currentHash, prevHash, upcomingHash)
        prevHash = currentHash
        current, currentHash = upcoming, upcomingHash
        count += 1
    self.saveEntry(current, currentHash, prevHash, None)

    reservedKeys = ("name", "root", "havePrevLink", "wordCount")
    infoPath = join(self._filename, "info.json")
    with open(infoPath, "w", encoding=self._encoding) as infoFile:
        info = odict([
            ("name", self._glos.getInfo("name")),
            ("root", self.hashToPath(rootHash)),
            ("havePrevLink", self._havePrevLink),
            ("wordCount", count),
        ])
        info.update(self._glos.getExtraInfos(reservedKeys))
        infoFile.write(dataToPrettyJson(info))
def write(self):
    """Store the glossary's entries as a linked chain in a new directory.

    Entries come from iterating ``self._glos``. The directory
    ``self._filename`` is created only after a first entry has been
    obtained. Each entry is passed to ``self.saveEntry`` with its own hash
    and the hashes of the previous and next entries (``None`` at either end
    of the chain); ``info.json`` is written last.

    Raises:
        ValueError: if the glossary yields no entries.
    """
    from collections import OrderedDict as odict

    from pyglossary.json_utils import dataToPrettyJson

    entries = iter(self._glos)
    try:
        current = next(entries)
    except StopIteration:
        raise ValueError("glossary is empty")

    os.makedirs(self._filename)

    count = 1
    currentHash = self.getEntryHash(current)
    rootHash = currentHash
    prevHash = None

    # Each entry is saved only once the hash of its successor is known.
    for upcoming in entries:
        upcomingHash = self.getEntryHash(upcoming)
        self.saveEntry(current, currentHash, prevHash, upcomingHash)
        prevHash = currentHash
        current, currentHash = upcoming, upcomingHash
        count += 1
    self.saveEntry(current, currentHash, prevHash, None)

    reservedKeys = ("name", "root", "havePrevLink", "wordCount")
    infoPath = join(self._filename, "info.json")
    with open(infoPath, "w", encoding=self._encoding) as infoFile:
        info = odict([
            ("name", self._glos.getInfo("name")),
            ("root", self.hashToPath(rootHash)),
            ("havePrevLink", self._havePrevLink),
            ("wordCount", count),
        ])
        info.update(self._glos.getExtraInfos(reservedKeys))
        infoFile.write(dataToPrettyJson(info))
def write(glos: GlossaryType, filename: str) -> bool:
    """Collect statistics about ``glos`` and write them to a JSON file.

    For every entry the definition format is detected and counted; for "h"
    definitions the first tag-like match is counted too. The JSON holds
    every pair from ``glos.iterInfo()`` followed by ``data_entry_count``
    (the count of format "b"), ``defi_format_counter`` and
    ``defi_first_tag_counter``.

    Args:
        glos: the glossary to inspect; iterated once.
        filename: path of the JSON file to write (UTF-8).

    Returns:
        ``True`` once the file has been written. The function used to fall
        off the end and return ``None`` despite the ``bool`` annotation.
    """
    import re
    from collections import Counter, OrderedDict

    from pyglossary.json_utils import dataToPrettyJson

    possible_html_re = re.compile(r"<[a-zA-Z]+[ />]")

    def formatCounts(pairs):
        # Render (name, count) pairs as "name=count, name=count, ...".
        return ", ".join(f"{name}={count}" for name, count in pairs)

    defiFormatCounter = Counter()
    firstTagCounter = Counter()
    for entry in glos:
        entry.detectDefiFormat()
        defiFormat = entry.getDefiFormat()
        defiFormatCounter[defiFormat] += 1
        defi = entry.getDefi()
        if defiFormat == "m":
            if possible_html_re.match(defi):
                log.warning(f"undetected html defi: {defi}")
        elif defiFormat == "h":
            # A definition flagged as "h" may contain nothing this pattern
            # matches (e.g. only "<h1>", since digits are not accepted);
            # previously that case raised AttributeError on .group().
            firstMatch = possible_html_re.search(defi)
            if firstMatch is not None:
                tag = firstMatch.group().strip("< />").lower()
                firstTagCounter[tag] += 1

    # Format "b" is reported on its own instead of among the formats.
    data_entry_count = defiFormatCounter.pop("b", 0)

    info = OrderedDict()
    for key, value in glos.iterInfo():
        info[key] = value
    info["data_entry_count"] = data_entry_count
    info["defi_format_counter"] = formatCounts(
        sorted(defiFormatCounter.items()),
    )
    info["defi_first_tag_counter"] = formatCounts(
        firstTagCounter.most_common(),
    )

    with open(filename, mode="w", encoding="utf-8") as _file:
        _file.write(dataToPrettyJson(info))
    return True
def write(self):
    """Store the glossary's entries as a forward-linked chain.

    Entries come from iterating ``self._glos``. The directory
    ``self._filename`` is created only after a first entry has been
    obtained. Each entry is passed to ``self.saveEntry`` with its own hash
    and the hash of the next entry (``None`` for the last one).
    ``info.json`` is written last: ``name``, ``root`` and ``wordCount``
    first, then every other key of ``self._glos.info``.

    Raises:
        ValueError: if the glossary yields no entries.
    """
    from collections import OrderedDict as odict

    from pyglossary.json_utils import dataToPrettyJson

    entries = iter(self._glos)
    try:
        current = next(entries)
    except StopIteration:
        raise ValueError("glossary is empty")

    os.makedirs(self._filename)

    count = 1
    currentHash = self.getEntryHash(current)
    rootHash = currentHash

    # Each entry is saved only once the hash of its successor is known.
    for upcoming in entries:
        upcomingHash = self.getEntryHash(upcoming)
        self.saveEntry(current, currentHash, upcomingHash)
        current, currentHash = upcoming, upcomingHash
        count += 1
    self.saveEntry(current, currentHash, None)

    infoPath = join(self._filename, "info.json")
    with open(infoPath, "w", encoding=self._encoding) as infoFile:
        info = odict([
            ("name", self._glos.getInfo("name")),
            ("root", self.hashToPath(rootHash)),
            ("wordCount", count),
        ])
        # Work on a copy so the glossary's own info is left untouched, and
        # drop the keys already written above.
        remaining = self._glos.info.copy()
        for reserved in ("name", "root", "wordCount"):
            remaining.pop(reserved, None)
        info.update(remaining)
        infoFile.write(dataToPrettyJson(info))
def write(self) -> "Generator[None, BaseEntry, None]":
    """Gather statistics over the entries sent in and write them as JSON.

    The caller sends entries one at a time and sends ``None`` when done.
    Per entry this counts: the entry itself, occurrences of ``bword://`` in
    the definition, tags carrying a ``style=`` attribute, the detected
    definition format, tag-like matches in "h" definitions (first match and
    all matches), and the name of the writing system detected from the
    word ("None" when nothing is detected).

    The summary (every pair from ``glos.iterInfo()`` followed by the
    counters) is written to ``self._file`` as JSON plus a trailing newline.
    """
    import re
    from collections import Counter, OrderedDict

    from pyglossary.json_utils import dataToPrettyJson
    from pyglossary.langs.writing_system import getWritingSystemFromText

    glos = self._glos

    re_possible_html = re.compile(r"<[a-z1-6]+[ />]", re.IGNORECASE)
    re_style = re.compile(
        r"<([a-z1-6]+)[^<>]* style=",
        re.IGNORECASE | re.DOTALL,
    )

    def joinCounts(pairs):
        # Render (name, count) pairs as "name=count, name=count, ...".
        return ", ".join(
            f"{defiFormat}={count}"
            for defiFormat, count in pairs
        )

    wordCount = 0
    bwordCount = 0

    styleByTagCounter = Counter()
    defiFormatCounter = Counter()
    firstTagCounter = Counter()
    allTagsCounter = Counter()
    sourceScriptCounter = Counter()

    while True:
        entry = yield
        if entry is None:
            break
        # The definition is read before format detection runs.
        defi = entry.defi

        wordCount += 1
        bwordCount += defi.count("bword://")

        styleByTagCounter.update(
            styleMatch.group(1) for styleMatch in re_style.finditer(defi)
        )

        entry.detectDefiFormat()
        defiFormat = entry.defiFormat
        defiFormatCounter[defiFormat] += 1
        if defiFormat == "m":
            if re_possible_html.match(defi):
                log.warn(f"undetected html defi: {defi}")
        elif defiFormat == "h":
            firstMatch = re_possible_html.search(defi)
            if firstMatch is not None:
                firstTag = firstMatch.group().strip("< />").lower()
                firstTagCounter[firstTag] += 1
                allTagsCounter.update(
                    rawTag.strip("< />").lower()
                    for rawTag in re_possible_html.findall(defi)
                )

        ws = getWritingSystemFromText(entry.s_word)
        if not ws:
            log.debug(f"No script detected for word: {entry.s_word}")
            wsName = "None"
        else:
            wsName = ws.name
        sourceScriptCounter[wsName] += 1

    # Format "b" is reported on its own instead of among the formats.
    data_entry_count = defiFormatCounter.pop("b", 0)

    info = OrderedDict(glos.iterInfo())
    info["word_count"] = wordCount
    info["bword_count"] = bwordCount
    info["data_entry_count"] = data_entry_count
    info["defi_format"] = joinCounts(sorted(defiFormatCounter.items()))
    info["defi_tag"] = joinCounts(allTagsCounter.most_common())
    info["defi_first_tag"] = joinCounts(firstTagCounter.most_common())
    info["style"] = joinCounts(styleByTagCounter.most_common())
    info["source_script"] = joinCounts(sourceScriptCounter.most_common())

    self._file.write(dataToPrettyJson(info) + "\n")