def newEntry(self, word, defi) -> "BaseEntry":
    """Build an Entry for (word, defi).

    When the total file size is known, attach a byteProgress tuple of
    (current file offset, total size) so callers can report progress.
    """
    progress = None
    if self._fileSize:
        progress = (self._file.tell(), self._fileSize)
    return Entry(word, defi, byteProgress=progress)
def __next__(self):
    """Return the next Entry from the slob object.

    Raises StopIteration when the reader is closed or exhausted.
    """
    if not self._slobObj:
        log.error("iterating over a reader which is not open")
        raise StopIteration
    self._refIndex += 1
    if self._refIndex >= len(self._slobObj):
        raise StopIteration
    blob = self._slobObj[self._refIndex]
    # blob.key is str, blob.content is bytes
    return Entry(blob.key, toStr(blob.content))
def __next__(self):
    """Advance the reader and return the next Entry.

    Entries queued by loadInfo() are served first. When the pair
    source is exhausted, the total length is recorded before
    re-raising StopIteration. Pairs the parser skipped come back
    as None (not an Entry) — callers are expected to tolerate that.
    """
    self._pos += 1
    if self._pendingEntries:
        return self._pendingEntries.pop(0)
    try:
        pair = self.nextPair()
    except StopIteration:
        # exhausted: we now know the exact entry count
        self._len = self._pos
        raise
    if not pair:
        return None
    word, defi = pair
    return Entry(word, defi)
def loadInfo(self):
    """Read leading info pairs and store them via glos.setInfo().

    Scans pairs from the top of the file; each info word is
    normalized with fixInfoWord() before being stored. The first
    non-info pair ends the header scan and is queued in
    _pendingEntries so __next__ can emit it later.
    _leadingLinesCount tracks how many header pairs were consumed.
    """
    self._pendingEntries = []
    self._leadingLinesCount = 0
    try:
        while True:
            pair = self.nextPair()
            if not pair:
                continue
            word, defi = pair
            if not self.isInfoWord(word):
                # first real entry: keep it for iteration, stop scanning
                self._pendingEntries.append(Entry(word, defi))
                break
            self._leadingLinesCount += 1
            word = self.fixInfoWord(word)
            if word:
                self._glos.setInfo(word, defi)
    except StopIteration:
        pass
def loadInfo(self) -> None:
    """Read leading info pairs and store them via glos.setInfo().

    Like the single-word variant, but the info "word" may be a list,
    in which case every element is normalized with fixInfoWord().
    The first non-info pair ends the header scan and is queued in
    _pendingEntries for __next__ to emit; empty words/definitions
    are skipped rather than stored.
    """
    self._pendingEntries = []
    self._leadingLinesCount = 0
    try:
        while True:
            pair = self.nextPair()
            if not pair:
                continue
            word, defi = pair
            if not self.isInfoWords(word):
                # first real entry: keep it for iteration, stop scanning
                self._pendingEntries.append(Entry(word, defi))
                break
            self._leadingLinesCount += 1
            if isinstance(word, list):
                word = [self.fixInfoWord(w) for w in word]
            else:
                word = self.fixInfoWord(word)
            if word and defi:
                self._glos.setInfo(word, defi)
    except StopIteration:
        pass
def write_groups(self):
    """Generator-coroutine that receives entries (via ``send``) and writes
    gzip-compressed HTML group files, one file per headword prefix.

    Protocol: the driver sends Entry objects; sending None ends the input
    phase. Entries are then sorted by prefix and written out. Data entries
    (embedded files) are counted and skipped, with a warning at the end.
    Side effect: sets self._words to the flat list of all written words.
    """
    import gzip
    from collections import OrderedDict

    from pyglossary.entry import Entry

    glos = self._glos
    words = []
    dataEntryCount = 0

    htmlHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?><html>\n"

    groupCounter = 0
    htmlContents = htmlHeader

    def writeGroup(lastPrefix):
        # flush the accumulated htmlContents into a gzip file named after
        # the (filename-sanitized) prefix, then reset the buffer
        nonlocal htmlContents
        group_fname = fixFilename(lastPrefix)
        htmlContents += "</html>"
        # BUG FIX: second fragment was missing the f prefix, so
        # "{group_fname!r}, count={groupCounter}" was logged literally
        log.debug(
            f"writeGroup: {lastPrefix!r}, "
            f"{group_fname!r}, count={groupCounter}"
        )
        with gzip.open(group_fname + ".html", mode="wb") as gzipFile:
            gzipFile.write(htmlContents.encode("utf-8"))
        htmlContents = htmlHeader

    data = []
    while True:
        entry = yield
        if entry is None:
            break
        if entry.isData():
            dataEntryCount += 1
            continue
        l_word = entry.l_word
        if len(l_word) == 1:
            data.append(entry.getRaw(glos))
            continue
        # multi-word entry: group its words by prefix; if the words span
        # several prefixes, split into one raw entry per prefix so each
        # group file stays self-contained
        wordsByPrefix = OrderedDict()
        for word in l_word:
            prefix = self.get_prefix(word)
            if prefix in wordsByPrefix:
                wordsByPrefix[prefix].append(word)
            else:
                wordsByPrefix[prefix] = [word]
        if len(wordsByPrefix) == 1:
            data.append(entry.getRaw(glos))
            continue
        defi = entry.defi
        for prefix, p_words in wordsByPrefix.items():
            data.append(Entry(p_words, defi).getRaw(glos))
        del entry

    # was an f-string with no placeholders; plain literal is equivalent
    log.info("\nKobo: sorting entries...")
    data.sort(key=Entry.getRawEntrySortKey(glos, self.get_prefix_b))

    lastPrefix = ""
    for rawEntry in data:
        entry = Entry.fromRaw(glos, rawEntry)
        headword, *variants = entry.l_word
        prefix = self.get_prefix(headword)
        if lastPrefix and prefix != lastPrefix:
            # prefix changed: flush the previous group before starting a new one
            writeGroup(lastPrefix)
            groupCounter = 0
        lastPrefix = prefix
        defi = self.fix_defi(entry.defi)
        for w in entry.l_word:
            words.append(w)
        variants = [v.strip().lower() for v in variants]
        variants_html = (
            '<var>'
            + ''.join(f'<variant name="{v}"/>' for v in variants)
            + '</var>'
        )
        htmlContents += f"<w><a name=\"{headword}\" /><div><b>{headword}</b>"\
            f"{variants_html}<br/>{defi}</div></w>\n"
        groupCounter += 1
    if groupCounter > 0:
        # flush the final (possibly only) group
        writeGroup(lastPrefix)

    if dataEntryCount > 0:
        # log.warn is a deprecated alias of log.warning
        log.warning(
            f"ignored {dataEntryCount} files (data entries)"
            " and replaced '<img ...' tags in definitions with placeholders"
        )
    self._words = words