def writeCompactMergeSyns(self, defiFormat):
    """
    Build StarDict dictionary with sametypesequence option specified,
    merging alternate words into the main index.

    Every item definition consists of a single article.
    All articles have the same format, specified in defiFormat parameter.
    Every word of an entry (the headword and its alternates alike) gets
    its own index record pointing at the same definition block, so this
    method collects no synonym list and writes no .syn file.

    This is a generator: entries are received through ``yield`` (the
    caller sends them in); receiving None ends the input, after which
    the index and .ifo files are written.
    Data entries are saved into self._resDir and are not indexed.

    Parameters:
    defiFormat - format of article definition: h - html, m - plain text
    """
    log.debug(f"writeCompactMergeSyns: defiFormat={defiFormat}")

    dictMark = 0  # byte offset of the next definition inside the .dict file
    idxBlockList = []  # list of tuples (b"word", startAndLength)
    indexFileSize = 0

    t0 = now()
    if not isdir(self._resDir):
        os.mkdir(self._resDir)

    # The context manager closes the .dict file even when an entry fails
    # to convert or the generator is closed before the input is exhausted.
    with open(self._filename + ".dict", "wb") as dictFile:
        while True:
            entry = yield
            if entry is None:
                break
            if entry.isData():
                entry.save(self._resDir)
                continue

            defi = self.fixDefi(entry.defi, defiFormat)  # defi is str
            b_dictBlock = defi.encode("utf-8")
            dictFile.write(b_dictBlock)
            blockLen = len(b_dictBlock)

            # offset + length of the definition, shared by all words of
            # this entry
            blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
            for word in entry.l_word:
                b_word = word.encode("utf-8")
                idxBlockList.append((b_word, blockData))
                # NOTE(review): assumes writeIdxFile stores each record as
                # word + b"\x00" + blockData, like the records written by
                # writeGeneral / writeCompact -- confirm against writeIdxFile
                indexFileSize += len(b_word) + 1 + len(blockData)

            dictMark += blockLen

    wordCount = self.writeIdxFile(idxBlockList)

    if not os.listdir(self._resDir):
        os.rmdir(self._resDir)
    log.info(f"Writing dict file took {now()-t0:.2f} seconds")

    # Same positional order as in writeGeneral / writeCompact:
    # (wordCount, indexFileSize, synonym count).
    self.writeIfoFile(
        wordCount,
        indexFileSize,
        0,  # alternates are merged into the index, no separate synonyms
        defiFormat=defiFormat,
    )
def writeSynFile(self, altIndexList: List[Tuple[bytes, int]]) -> None:
    """
    Sort the given synonyms and write them to the .syn file.

    altIndexList is a list of tuples (b"alternate", entryIndex).
    It is sorted in place, using self.sortKey applied to the alternate.
    Each record is written as the alternate bytes, a NUL byte, and
    uint32ToBytes(entryIndex).
    When the list is empty, nothing is logged and no file is created.
    """
    if not altIndexList:
        return

    def altSortKey(item):
        # order records by their alternate word only
        return self.sortKey(item[0])

    log.info(f"Sorting {len(altIndexList)} synonyms...")
    t0 = now()
    altIndexList.sort(key=altSortKey)
    # Timings noted by the original author:
    #   28 seconds with old sort key (converted from custom cmp)
    #   0.63 seconds with the new sort key
    #   0.20 seconds without key function (default sort)
    log.info(
        f"Sorting {len(altIndexList)} synonyms took {now()-t0:.2f} seconds",
    )

    log.info(f"Writing {len(altIndexList)} synonyms...")
    t0 = now()
    with open(self._filename + ".syn", "wb") as synFile:
        records = []
        for b_alt, entryIndex in altIndexList:
            records.append(b_alt + b"\x00" + uint32ToBytes(entryIndex))
        # a single write of the whole file content
        synFile.write(b"".join(records))
    log.info(
        f"Writing {len(altIndexList)} synonyms took {now()-t0:.2f} seconds",
    )
def writeGeneral(self) -> None:
    """
    Build StarDict dictionary in general case.
    Every item definition may consist of an arbitrary number of articles.
    sametypesequence option is not used.

    This is a generator: entries are received through ``yield`` (the
    caller sends them in); receiving None ends the input, after which
    the .syn and .ifo files are written.
    Data entries are saved into self._resDir and are not indexed.

    Each definition is stored in the .dict file as its one-character
    format, followed by the definition text and a terminating NUL byte.
    The first word of an entry goes to the .idx file; the remaining
    words are collected as alternates for the .syn file.
    """
    dictMark = 0  # byte offset of the next definition inside the .dict file
    altIndexList = []  # list of tuples (b"alternate", entryIndex)
    indexFileSize = 0
    wordCount = 0
    defiFormatCounter = Counter()
    entryIndex = -1  # counts non-data entries only

    t0 = now()
    if not isdir(self._resDir):
        os.mkdir(self._resDir)

    # The context managers close both files even when an entry fails to
    # convert or the generator is closed before the input is exhausted.
    with open(self._filename + ".dict", "wb") as dictFile, \
            open(self._filename + ".idx", "wb") as idxFile:
        while True:
            entry = yield
            if entry is None:
                break
            if entry.isData():
                entry.save(self._resDir)
                continue
            entryIndex += 1

            entry.detectDefiFormat()  # call no more than once
            defiFormat = entry.defiFormat
            # counted before validation, so the debug summary below shows
            # the formats as they were detected
            defiFormatCounter[defiFormat] += 1
            if defiFormat not in ("m", "h", "x"):
                log.error(f"invalid defiFormat={defiFormat}, using 'm'")
                defiFormat = "m"

            words = entry.l_word  # list of strs
            word = words[0]  # str
            defi = self.fixDefi(entry.defi, defiFormat)  # defi is str

            for alt in words[1:]:
                altIndexList.append((alt.encode("utf-8"), entryIndex))

            b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00"
            dictFile.write(b_dictBlock)
            blockLen = len(b_dictBlock)

            # index record: word, NUL, offset and length of the definition
            b_idxBlock = word.encode("utf-8") + b"\x00" + \
                uint32ToBytes(dictMark) + \
                uint32ToBytes(blockLen)
            idxFile.write(b_idxBlock)

            dictMark += blockLen
            indexFileSize += len(b_idxBlock)
            wordCount += 1

    if not os.listdir(self._resDir):
        os.rmdir(self._resDir)
    log.info(f"Writing dict file took {now()-t0:.2f} seconds")
    log.debug("defiFormatsCount = " + pformat(defiFormatCounter.most_common()))

    self.writeSynFile(altIndexList)
    self.writeIfoFile(
        wordCount,
        indexFileSize,
        len(altIndexList),
    )
def writeCompact(self, defiFormat):
    """
    Build StarDict dictionary with sametypesequence option specified.
    Every item definition consists of a single article.
    All articles have the same format, specified in defiFormat parameter.

    This is a generator: entries are received through ``yield`` (the
    caller sends them in); receiving None ends the input, after which
    the .syn and .ifo files are written.
    Data entries are saved into self._resDir and are not indexed.

    Each definition is stored in the .dict file as its bare UTF-8 text
    (no per-definition format character and no terminator).
    The first word of an entry goes to the .idx file; the remaining
    words are collected as alternates for the .syn file.

    Parameters:
    defiFormat - format of article definition: h - html, m - plain text
    """
    dictMark = 0  # byte offset of the next definition inside the .dict file
    altIndexList = []  # list of tuples (b"alternate", entryIndex)
    indexFileSize = 0
    wordCount = 0
    entryIndex = -1  # counts non-data entries only

    t0 = now()
    if not isdir(self._resDir):
        os.mkdir(self._resDir)

    # The context managers close both files even when an entry fails to
    # convert or the generator is closed before the input is exhausted.
    with open(self._filename + ".dict", "wb") as dictFile, \
            open(self._filename + ".idx", "wb") as idxFile:
        while True:
            entry = yield
            if entry is None:
                break
            if entry.isData():
                entry.save(self._resDir)
                continue
            entryIndex += 1

            words = entry.l_word  # list of strs
            word = words[0]  # str
            defi = self.fixDefi(entry.defi, defiFormat)  # defi is str

            for alt in words[1:]:
                altIndexList.append((alt.encode("utf-8"), entryIndex))

            b_dictBlock = defi.encode("utf-8")
            dictFile.write(b_dictBlock)
            blockLen = len(b_dictBlock)

            # index record: word, NUL, offset and length of the definition
            b_idxBlock = word.encode("utf-8") + b"\x00" + \
                uint32ToBytes(dictMark) + \
                uint32ToBytes(blockLen)
            idxFile.write(b_idxBlock)

            dictMark += blockLen
            indexFileSize += len(b_idxBlock)
            wordCount += 1

    if not os.listdir(self._resDir):
        os.rmdir(self._resDir)
    log.info(f"Writing dict file took {now()-t0:.2f} seconds")

    self.writeSynFile(altIndexList)
    self.writeIfoFile(
        wordCount,
        indexFileSize,
        len(altIndexList),
        defiFormat=defiFormat,
    )