def writeCompactMergeSyns(self, defiFormat):
        """
        Build StarDict dictionary with sametypesequence option specified.
        Every item definition consists of a single article.
        All articles have the same format, specified in defiFormat parameter.

        Every word of an entry (the headword and all of its alternates) gets
        its own index record pointing at the same definition block, so no
        synonym list is collected here and the synonym count passed to
        writeIfoFile is always 0.

        This is a generator driven by the caller: each entry is received
        through ``yield`` and a ``None`` entry ends the input. Entries for
        which ``entry.isData()`` is true are saved into ``self._resDir``
        instead of being written to the dictionary.

        Parameters:
        defiFormat - format of article definition: h - html, m - plain text
        """
        log.debug(f"writeCompactMergeSyns: defiFormat={defiFormat}")
        dictMark = 0  # offset of the next definition block inside .dict
        idxBlockList = []  # list of tuples (b"word", startAndLength)

        # The context manager guarantees the .dict file is closed even when
        # an entry raises or the generator is closed before all entries
        # were sent.
        with open(self._filename + ".dict", "wb") as dictFile:
            t0 = now()
            if not isdir(self._resDir):
                os.mkdir(self._resDir)

            while True:
                entry = yield
                if entry is None:
                    break
                if entry.isData():
                    entry.save(self._resDir)
                    continue

                words = entry.l_word  # list of strs
                defi = self.fixDefi(entry.defi, defiFormat)
                # defi is str

                b_dictBlock = defi.encode("utf-8")
                dictFile.write(b_dictBlock)
                blockLen = len(b_dictBlock)

                # the same offset+length pair is shared by every word
                # of this entry
                blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
                idxBlockList.extend(
                    (word.encode("utf-8"), blockData)
                    for word in words
                )

                dictMark += blockLen

        wordCount = self.writeIdxFile(idxBlockList)

        # drop the resources directory if no data entry was saved into it
        if not os.listdir(self._resDir):
            os.rmdir(self._resDir)
        log.info(f"Writing dict file took {now()-t0:.2f} seconds")

        self.writeIfoFile(
            wordCount,
            0,  # alternates are merged into the index, none are collected
            defiFormat=defiFormat,
        )
Beispiel #2
0
    def writeSynFile(self, altIndexList: List[Tuple[bytes, int]]) -> None:
        """
        Write the collected alternates into the .syn file.

        altIndexList holds (b"alternate", entryIndex) tuples. The list is
        sorted in place, using self.sortKey applied to the alternate bytes,
        and then every tuple is written as the alternate, a NUL byte and
        uint32ToBytes(entryIndex). Nothing is written for an empty list.
        """
        if not altIndexList:
            return

        def altSortKey(item):
            # order by the alternate word only, ignoring the entry index
            return self.sortKey(item[0])

        log.info(f"Sorting {len(altIndexList)} synonyms...")
        t0 = now()

        # timings noted by the original author:
        #   28 seconds with the old sort key (converted from custom cmp)
        #   0.63 seconds with the new sort key
        #   0.20 seconds without a key function (default sort)
        altIndexList.sort(key=altSortKey)

        log.info(
            f"Sorting {len(altIndexList)} synonyms took {now()-t0:.2f} seconds",
        )

        log.info(f"Writing {len(altIndexList)} synonyms...")
        t0 = now()
        with open(self._filename + ".syn", "wb") as synFile:
            records = (
                b_alt + b"\x00" + uint32ToBytes(entryIndex)
                for b_alt, entryIndex in altIndexList
            )
            # a single write of the whole joined payload
            synFile.write(b"".join(records))
        log.info(
            f"Writing {len(altIndexList)} synonyms took {now()-t0:.2f} seconds",
        )
Beispiel #3
0
    def writeGeneral(self) -> None:
        """
        Build StarDict dictionary in general case.
        Every item definition may consist of an arbitrary number of articles.
        sametypesequence option is not used.

        This is a generator driven by the caller: each entry is received
        through ``yield`` and a ``None`` entry ends the input. Entries for
        which ``entry.isData()`` is true are saved into ``self._resDir``
        instead of being written to the dictionary.

        For every other entry the definition block written to .dict is the
        format character, the definition text and a terminating NUL byte.
        Only the first word of the entry is written to .idx; the remaining
        words are collected as alternates and handed to writeSynFile.
        """
        dictMark = 0  # offset of the next definition block inside .dict
        altIndexList = []  # list of tuples (b"alternate", entryIndex)
        indexFileSize = 0
        wordCount = 0
        defiFormatCounter = Counter()

        # The context manager guarantees both files are closed even when an
        # entry raises or the generator is closed before all entries were
        # sent.
        with open(self._filename + ".dict", "wb") as dictFile, \
                open(self._filename + ".idx", "wb") as idxFile:
            t0 = now()
            if not isdir(self._resDir):
                os.mkdir(self._resDir)

            entryIndex = -1
            while True:
                entry = yield
                if entry is None:
                    break
                if entry.isData():
                    entry.save(self._resDir)
                    continue
                entryIndex += 1

                entry.detectDefiFormat()  # call no more than once
                defiFormat = entry.defiFormat
                # counted before validation, so the debug summary below
                # also shows the invalid formats that were encountered
                defiFormatCounter[defiFormat] += 1
                if defiFormat not in ("m", "h", "x"):
                    log.error(f"invalid defiFormat={defiFormat}, using 'm'")
                    defiFormat = "m"

                words = entry.l_word  # list of strs
                word = words[0]  # str
                defi = self.fixDefi(entry.defi, defiFormat)
                # defi is str

                altIndexList.extend(
                    (alt.encode("utf-8"), entryIndex)
                    for alt in words[1:]
                )

                b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00"
                dictFile.write(b_dictBlock)
                blockLen = len(b_dictBlock)

                # index record: word, NUL, block offset, block length
                b_idxBlock = word.encode("utf-8") + b"\x00" + \
                    uint32ToBytes(dictMark) + \
                    uint32ToBytes(blockLen)
                idxFile.write(b_idxBlock)

                dictMark += blockLen
                indexFileSize += len(b_idxBlock)

                wordCount += 1

        # drop the resources directory if no data entry was saved into it
        if not os.listdir(self._resDir):
            os.rmdir(self._resDir)
        log.info(f"Writing dict file took {now()-t0:.2f} seconds")
        log.debug("defiFormatsCount = " +
                  pformat(defiFormatCounter.most_common()))

        self.writeSynFile(altIndexList)
        self.writeIfoFile(
            wordCount,
            indexFileSize,
            len(altIndexList),
        )
Beispiel #4
0
    def writeCompact(self, defiFormat):
        """
        Build StarDict dictionary with sametypesequence option specified.
        Every item definition consists of a single article.
        All articles have the same format, specified in defiFormat parameter.

        This is a generator driven by the caller: each entry is received
        through ``yield`` and a ``None`` entry ends the input. Entries for
        which ``entry.isData()`` is true are saved into ``self._resDir``
        instead of being written to the dictionary.

        Only the first word of an entry is written to .idx; the remaining
        words are collected as alternates and handed to writeSynFile.

        Parameters:
        defiFormat - format of article definition: h - html, m - plain text
        """
        dictMark = 0  # offset of the next definition block inside .dict
        altIndexList = []  # list of tuples (b"alternate", entryIndex)
        indexFileSize = 0
        wordCount = 0

        # The context manager guarantees both files are closed even when an
        # entry raises or the generator is closed before all entries were
        # sent.
        with open(self._filename + ".dict", "wb") as dictFile, \
                open(self._filename + ".idx", "wb") as idxFile:
            t0 = now()
            if not isdir(self._resDir):
                os.mkdir(self._resDir)

            entryIndex = -1
            while True:
                entry = yield
                if entry is None:
                    break
                if entry.isData():
                    entry.save(self._resDir)
                    continue
                entryIndex += 1

                words = entry.l_word  # list of strs
                word = words[0]  # str
                defi = self.fixDefi(entry.defi, defiFormat)
                # defi is str

                altIndexList.extend(
                    (alt.encode("utf-8"), entryIndex)
                    for alt in words[1:]
                )

                # no format character or terminator: the block is just
                # the encoded definition text
                b_dictBlock = defi.encode("utf-8")
                dictFile.write(b_dictBlock)
                blockLen = len(b_dictBlock)

                # index record: word, NUL, block offset, block length
                b_idxBlock = word.encode("utf-8") + b"\x00" + \
                    uint32ToBytes(dictMark) + \
                    uint32ToBytes(blockLen)
                idxFile.write(b_idxBlock)

                dictMark += blockLen
                indexFileSize += len(b_idxBlock)

                wordCount += 1

        # drop the resources directory if no data entry was saved into it
        if not os.listdir(self._resDir):
            os.rmdir(self._resDir)
        log.info(f"Writing dict file took {now()-t0:.2f} seconds")

        self.writeSynFile(altIndexList)
        self.writeIfoFile(
            wordCount,
            indexFileSize,
            len(altIndexList),
            defiFormat=defiFormat,
        )