Ejemplo n.º 1
0
    def readIdxFile(self) -> List[Tuple[bytes, int, int]]:
        """
        Load the index file and return its records.

        "<filename>.idx.gz" is read through gzip when it exists; otherwise
        the plain "<filename>.idx" is read in binary mode.

        Each record is a NUL-terminated word followed by 8 bytes: two
        4-byte fields, each decoded with uint32FromBytes.

        Return a list of (b_word, offset, size) tuples. If a record is
        truncated, an error is logged and the records parsed so far are
        returned.
        """
        gzPath = self._filename + ".idx.gz"
        if isfile(gzPath):
            with gzip.open(gzPath) as gzFile:
                data = gzFile.read()
        else:
            with open(self._filename + ".idx", "rb") as plainFile:
                data = plainFile.read()

        total = len(data)
        entries = []
        cursor = 0
        while cursor < total:
            # the word runs up to (not including) the next NUL byte
            wordEnd = data.find(b"\x00", cursor)
            if wordEnd < 0:
                log.error("Index file is corrupted")
                break
            offsetStart = wordEnd + 1
            sizeStart = offsetStart + 4
            recordEnd = sizeStart + 4
            # both 4-byte fields must be fully present
            if recordEnd > total:
                log.error("Index file is corrupted")
                break
            entries.append((
                data[cursor:wordEnd],
                uint32FromBytes(data[offsetStart:sizeStart]),
                uint32FromBytes(data[sizeStart:recordEnd]),
            ))
            cursor = recordEnd

        return entries
Ejemplo n.º 2
0
    def parseDefiBlockGeneral(self, b_block: bytes) -> List[Tuple[bytes, int]]:
        """
        Parse a definition block when the sametypesequence option is not
        specified, i.e. every field carries its own leading type byte.

        A lowercase type byte is followed by NUL-terminated data.
        An uppercase type byte is followed by a 4-byte size (decoded with
        uint32FromBytes) and then exactly that many bytes of data.

        Return a list of (b_defi, defiFormatCode) tuples
            where b_defi is a bytes instance
            and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
        Return None if the block is malformed (non-alphabetic type byte,
            missing NUL terminator, or data running past the block end).
        """
        blockLen = len(b_block)
        fields = []
        pos = 0
        while pos < blockLen:
            typeCode = b_block[pos]
            typeByte = bytes([typeCode])
            if not typeByte.isalpha():
                return None
            pos += 1

            if typeByte.islower():
                # NUL-terminated field
                end = b_block.find(b"\x00", pos)
                if end < 0:
                    return None
                fields.append((b_block[pos:end], typeCode))
                pos = end + 1
                continue

            # size-prefixed field
            assert typeByte.isupper()
            sizeEnd = pos + 4
            if sizeEnd > blockLen:
                return None
            size = uint32FromBytes(b_block[pos:sizeEnd])
            dataEnd = sizeEnd + size
            if dataEnd > blockLen:
                return None
            fields.append((b_block[sizeEnd:dataEnd], typeCode))
            pos = dataEnd
        return fields
Ejemplo n.º 3
0
    def parseDefiBlockCompact(
        self,
        b_block: bytes,
        sametypesequence: str,
    ) -> List[Tuple[bytes, int]]:
        """
        Parse a definition block when the sametypesequence option is
        specified, i.e. field types come from sametypesequence rather than
        from type bytes inside the block.

        For every type except the last: a lowercase type means
        NUL-terminated data, an uppercase type means a 4-byte size (decoded
        with uint32FromBytes) followed by that many bytes.
        The last field takes the whole remainder of the block, with no
        terminator and no size prefix.

        Return a list of (b_defi, defiFormatCode) tuples
            where b_defi is a bytes instance
            and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
        Return None if the block is malformed (block ends too early,
            missing NUL terminator, data running past the block end,
            or a NUL byte inside a lowercase-typed last field).
        """
        typeCodes = sametypesequence.encode("utf-8")
        assert len(typeCodes) > 0
        blockLen = len(b_block)
        fields = []
        pos = 0
        for typeCode in typeCodes[:-1]:
            if pos >= blockLen:
                return None

            if bytes([typeCode]).islower():
                # NUL-terminated field
                end = b_block.find(b"\x00", pos)
                if end < 0:
                    return None
                fields.append((b_block[pos:end], typeCode))
                pos = end + 1
                continue

            # size-prefixed field
            assert bytes([typeCode]).isupper()
            sizeEnd = pos + 4
            if sizeEnd > blockLen:
                return None
            size = uint32FromBytes(b_block[pos:sizeEnd])
            dataEnd = sizeEnd + size
            if dataEnd > blockLen:
                return None
            fields.append((b_block[sizeEnd:dataEnd], typeCode))
            pos = dataEnd

        # the last field must not be empty
        if pos >= blockLen:
            return None
        lastCode = typeCodes[-1]
        tail = b_block[pos:]
        if bytes([lastCode]).islower():
            # a NUL byte is not allowed in an unterminated text field
            if 0 in tail:
                return None
        else:
            assert bytes([lastCode]).isupper()
        fields.append((tail, lastCode))

        return fields
Ejemplo n.º 4
0
    def readSynFile(self) -> "Dict[int, List[str]]":
        """
        Load "<filename>.syn" and group alternate words by entry index.

        Each record is a NUL-terminated alternate word followed by a 4-byte
        entry index decoded with uint32FromBytes. Alternates are decoded as
        UTF-8 using self._unicode_errors as the error handler.

        Return synDict, a dict { entryIndex -> altList }.
        Return an empty dict if the file does not exist.
        If a record is truncated, an error is logged and reading stops.
        A record whose entry index is not below self._wordCount is logged
        and skipped.
        """
        synPath = self._filename + ".syn"
        if not isfile(synPath):
            return {}
        unicode_errors = self._unicode_errors

        with open(synPath, "rb") as synFile:
            data = synFile.read()

        total = len(data)
        altsByIndex = {}
        cursor = 0
        while cursor < total:
            # the alternate word runs up to the next NUL byte
            altEnd = data.find(b"\x00", cursor)
            if altEnd < 0:
                log.error("Synonym file is corrupted")
                break
            b_alt = data[cursor:altEnd]  # b_alt is bytes
            indexStart = altEnd + 1
            indexEnd = indexStart + 4
            if indexEnd > total:
                log.error("Synonym file is corrupted")
                break
            entryIndex = uint32FromBytes(data[indexStart:indexEnd])
            cursor = indexEnd
            if entryIndex >= self._wordCount:
                log.error(
                    "Corrupted synonym file. "
                    f"Word {b_alt} references invalid item"
                )
                continue

            # s_alt is str
            s_alt = b_alt.decode("utf-8", errors=unicode_errors)
            altsByIndex.setdefault(entryIndex, []).append(s_alt)

        return altsByIndex