def readIdxFile(self):
    """
    Read the .idx (or gzipped .idx.gz) index file into self.indexData.

    Each item appended is [word, offset, size, [], []] where word is the
    raw headword (bytes), offset/size locate the article in the dict
    file, and the two empty lists are filled in later.
    """
    if isfile(self.fileBasePath + '.idx.gz'):
        import gzip
        with gzip.open(self.fileBasePath + '.idx.gz') as f:
            idxStr = f.read()
    else:
        with open(self.fileBasePath + '.idx', 'rb') as f:
            idxStr = f.read()
    self.indexData = []
    i = 0
    while i < len(idxStr):
        beg = i
        # FIX: idxStr is bytes (binary/gzip read), so the NUL separator
        # must be a bytes pattern -- a str '\x00' raises TypeError.
        i = idxStr.find(b'\x00', beg)
        if i < 0:
            log.error("Index file is corrupted.")
            break
        word = idxStr[beg:i]
        i += 1
        if i + 8 > len(idxStr):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(idxStr[i:i + 4])
        i += 4
        size = binStrToInt(idxStr[i:i + 4])
        i += 4
        self.indexData.append([word, offset, size, [], []])
def readIdxFile(self):
    """
    Read the .idx (or gzipped .idx.gz) index file into self.indexData.

    Each item appended is [word, offset, size, [], []]; the trailing
    empty lists are populated later (definitions / synonyms).
    """
    if isfile(self.fileBasePath + ".idx.gz"):
        import gzip
        with gzip.open(self.fileBasePath + ".idx.gz") as f:
            idxStr = f.read()
    else:
        with open(self.fileBasePath + ".idx", "rb") as f:
            idxStr = f.read()
    self.indexData = []
    i = 0
    while i < len(idxStr):
        beg = i
        # FIX: idxStr is bytes, so search with a bytes separator;
        # the str pattern "\x00" raises TypeError on bytes.
        i = idxStr.find(b"\x00", beg)
        if i < 0:
            log.error("Index file is corrupted.")
            break
        word = idxStr[beg:i]
        i += 1
        if i + 8 > len(idxStr):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(idxStr[i : i + 4])
        i += 4
        size = binStrToInt(idxStr[i : i + 4])
        i += 4
        self.indexData.append([word, offset, size, [], []])
def readIdxFile(self):
    """Parse the .idx (or .idx.gz) file and return the index entries.

    Each entry is a [b_word, offset, size] list; b_word is raw bytes,
    offset/size locate the article in the dict file.
    """
    gzPath = self._filename + ".idx.gz"
    if isfile(gzPath):
        with gzip.open(gzPath) as idxFile:
            raw = idxFile.read()
    else:
        with open(self._filename + ".idx", "rb") as idxFile:
            raw = idxFile.read()
    entries = []
    cursor = 0
    total = len(raw)
    while cursor < total:
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Index file is corrupted")
            break
        b_word = raw[cursor:nul]
        cursor = nul + 1
        if cursor + 8 > total:
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(raw[cursor:cursor + 4])
        size = binStrToInt(raw[cursor + 4:cursor + 8])
        cursor += 8
        entries.append([b_word, offset, size])
    return entries
def readIdxFile(self):
    """Parse the .idx (or gzipped .idx.gz) file.

    Returns a list of [word, offset, size] items, where word is bytes
    and offset/size locate the definition in the dict file.
    """
    gzPath = self.fileBasePath + '.idx.gz'
    if isfile(gzPath):
        import gzip
        with gzip.open(gzPath) as f:
            raw = f.read()
    else:
        with open(self.fileBasePath + '.idx', 'rb') as f:
            raw = f.read()
    result = []
    pos = 0
    while pos < len(raw):
        sep = raw.find(b'\x00', pos)
        if sep < 0:
            log.error("Index file is corrupted.")
            break
        word = raw[pos:sep]
        pos = sep + 1
        if pos + 8 > len(raw):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(raw[pos:pos + 4])
        size = binStrToInt(raw[pos + 4:pos + 8])
        pos += 8
        result.append([word, offset, size])
    return result
def readIdxFile(self) -> List[Tuple[bytes, int, int]]:
    """Parse the .idx (or .idx.gz) file.

    Returns a list of (b_word, offset, size) tuples; b_word is the raw
    headword, offset/size locate the article in the dict file.
    """
    gzPath = self._filename + ".idx.gz"
    if isfile(gzPath):
        with gzip.open(gzPath) as idxFile:
            raw = idxFile.read()
    else:
        with open(self._filename + ".idx", "rb") as idxFile:
            raw = idxFile.read()
    entries = []
    cursor = 0
    while cursor < len(raw):
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Index file is corrupted")
            break
        b_word = raw[cursor:nul]
        cursor = nul + 1
        if cursor + 8 > len(raw):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(raw[cursor:cursor + 4])
        size = binStrToInt(raw[cursor + 4:cursor + 8])
        cursor += 8
        entries.append((b_word, offset, size))
    return entries
def readEntryDefi(self, block, pos, b_word):
    """
    Read the definition part of an entry starting at `pos`.

    Returns (False, None, None, None) on error, otherwise
    (True, newPos, u_defi, b_defi) where u_defi is a str (utf-8)
    and b_defi is a bytes instance.
    """
    failure = (False, None, None, None)
    data = block.data
    if pos + 2 > len(data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading defi size: pos + 2 > len(block.data)"
        )
        return failure
    defiSize = binStrToInt(data[pos:pos + 2])
    pos += 2
    if pos + defiSize > len(data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading defi: pos + Len > len(block.data)"
        )
        return failure
    b_defi = data[pos:pos + defiSize]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    return True, pos + defiSize, u_defi, b_defi
def readSynFile(self, indexCount):
    """
    Read the .syn synonym file.

    Returns synData, a dict { wordIndex -> synWordsList };
    an empty dict when the file does not exist.
    """
    if not isfile(self.fileBasePath + '.syn'):
        return {}
    # FIX: use a context manager -- the old code leaked the file handle.
    with open(self.fileBasePath + '.syn', 'rb') as synFile:
        synStr = synFile.read()
    synStrLen = len(synStr)
    synData = {}
    i = 0
    while i < synStrLen:
        beg = i
        # FIX: synStr is bytes (opened 'rb'); the str pattern '\x00'
        # raises TypeError -- search with b'\x00'.
        i = synStr.find(b'\x00', beg)
        if i < 0:
            log.error("Synonym file is corrupted.")
            break
        word = synStr[beg:i]
        i += 1
        if i + 4 > len(synStr):
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(synStr[i:i + 4])
        i += 4
        if index >= indexCount:
            log.error(
                "Corrupted synonym file. Word \"{0}\" references invalid item."
                .format(word))
            continue
        try:
            synData[index].append(word)
        except KeyError:
            synData[index] = [word]
    return synData
def readSynFile(self):
    """
    Read the .syn file and append each synonym to its entry's
    alternates list (self.indexData[index][4]).
    Does nothing when no .syn file exists.
    """
    if not isfile(self.fileBasePath + '.syn'):
        return
    with open(self.fileBasePath + '.syn', 'rb') as f:
        synStr = f.read()
    i = 0
    while i < len(synStr):
        beg = i
        # FIX: synStr is bytes (opened 'rb'); a str pattern '\x00'
        # raises TypeError -- search with b'\x00'.
        i = synStr.find(b'\x00', beg)
        if i < 0:
            log.error("Synonym file is corrupted.")
            break
        word = synStr[beg:i]
        i += 1
        if i + 4 > len(synStr):
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(synStr[i:i + 4])
        i += 4
        if index >= len(self.indexData):
            log.error(
                "Corrupted synonym file. Word \"{0}\" references invalid item."
                .format(word))
            continue
        self.indexData[index][4].append(word)
def readEntryDefi(self, block, pos, b_word):
    """
    Read the definition part of an entry.

    Returns (False, None, None, None) on error, otherwise
    (True, pos, u_defi, b_defi) where u_defi is a str (utf-8)
    and b_defi is a bytes instance.
    """
    failure = (False, None, None, None)
    data = block.data
    if pos + 2 > len(data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", reading defi size: pos + 2 > len(block.data)"
        )
        return failure
    defiSize = binStrToInt(data[pos:pos + 2])
    pos += 2
    if pos + defiSize > len(data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", block.type=%s" % block.type +
            ", reading defi: pos + Len > len(block.data)"
        )
        return failure
    b_defi = data[pos:pos + defiSize]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    return True, pos + defiSize, u_defi, b_defi
def parseDefiBlockGeneral(self, data, word):
    """Parse definition block when sametypesequence option is not
    specified.

    Returns a list of (defiData, typeCode) pairs, or None when the
    block is malformed.
    """
    dataFileCorruptedError = "Data file is corrupted. Word \"{0}\"".format(
        word)
    res = []
    i = 0
    while i < len(data):
        t = data[i]
        if not isAsciiAlpha(t):
            log.error(dataFileCorruptedError)
            return None
        i += 1
        if isAsciiLower(t):
            beg = i
            # FIX: data is bytes -- searching with the str pattern
            # '\x00' raises TypeError; use b'\x00'.
            i = data.find(b'\x00', beg)
            if i < 0:
                log.error(dataFileCorruptedError)
                return None
            res.append((data[beg:i], t))
            i += 1
        else:
            assert isAsciiUpper(t)
            if i + 4 > len(data):
                log.error(dataFileCorruptedError)
                return None
            size = binStrToInt(data[i:i + 4])
            i += 4
            if i + size > len(data):
                log.error(dataFileCorruptedError)
                return None
            res.append((data[i:i + size], t))
            i += size
    return res
def openGzip(self):
    """Locate the gzip stream embedded in the BGL file and open it
    through BGLGzipFile; returns True on success, False otherwise."""
    with open(self._filename, "rb") as bglFile:
        if not bglFile:
            log.error(f"file pointer empty: {bglFile}")
            return False
        header = bglFile.read(6)
    if len(header) < 6 or header[:4] not in (
        b"\x12\x34\x00\x01",
        b"\x12\x34\x00\x02",
    ):
        log.error(f"invalid header: {header[:6]!r}")
        return False
    offset = binStrToInt(header[4:6])
    self.gzipOffset = offset
    log.debug(f"Position of gz header: {offset}")
    if offset < 6:
        log.error(f"invalid gzip header position: {offset}")
        return False
    self.file = BGLGzipFile(
        fileobj=FileOffS(self._filename, offset),
        closeFileobj=True,
    )
    return True
def utf16InfoDecode(b_value):
    """
    b_value is byte array
    returns str, or None (on errors)

    block type = 3
    block format: <2 byte code1><2 byte code2>
    if code2 == 0: then the block ends
    if code2 == 1: then the block continues as follows:
    <4 byte len1> \x00 \x00 <message in utf-16>
    len1 - length of message in 2-byte chars
    """
    if b_value[0] != 0:
        log.warning('utf16InfoDecode: b_value=%s, null expected at 0'%list(b_value))
        return
    marker = b_value[1]
    if marker == 0:
        # block ends here; anything beyond the two header bytes is odd
        if len(b_value) > 2:
            log.warning('utf16InfoDecode: unexpected b_value size: %s'%len(b_value))
        return
    if marker > 1:
        log.warning('utf16InfoDecode: b_value=%s, unexpected byte at 1'%list(b_value))
        return
    # marker == 1: a utf-16 message follows
    msgSize = 2 * binStrToInt(b_value[2:6])
    if tuple(b_value[6:8]) != (0, 0):
        log.warning('utf16InfoDecode: b_value=%s, null expected at 6:8'%list(b_value))
    if msgSize != len(b_value) - 8:
        log.warning('utf16InfoDecode: b_value=%s, size does not match'%list(b_value))
    return b_value[8:].decode('utf16')  # str
def readSynFile(self, indexCount):
    """
    Read the .syn file.

    Returns synData, a dict { wordIndex -> synWordsList };
    an empty dict when the file is missing.
    """
    synPath = self.fileBasePath + '.syn'
    if not isfile(synPath):
        return {}
    with open(synPath, 'rb') as synFile:
        raw = synFile.read()
    total = len(raw)
    synData = {}
    pos = 0
    while pos < total:
        nul = raw.find(b'\x00', pos)
        if nul < 0:
            log.error("Synonym file is corrupted.")
            break
        word = raw[pos:nul]
        pos = nul + 1
        if pos + 4 > total:
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(raw[pos:pos + 4])
        pos += 4
        if index >= indexCount:
            log.error("Corrupted synonym file. Word \"{0}\" references invalid item.".format(word))
            continue
        synData.setdefault(index, []).append(toStr(word))
    return synData
def parseDefiBlockGeneral(self, data, word):
    """
    Parse definition block when sametypesequence option is not
    specified.  Returns a list of (defiBytes, typeCode) pairs,
    or None on corruption.
    """
    corruptMsg = "Data file is corrupted. Word \"{0}\"".format(word)
    parts = []
    pos = 0
    dataLen = len(data)
    while pos < dataLen:
        code = data[pos]
        if not isAsciiAlpha(code):
            log.error(corruptMsg)
            return None
        pos += 1
        if isAsciiLower(code):
            # lower-case type codes are NUL-terminated
            nul = data.find(b'\x00', pos)
            if nul < 0:
                log.error(corruptMsg)
                return None
            parts.append((data[pos:nul], code))
            pos = nul + 1
        else:
            # upper-case type codes carry a 4-byte length prefix
            assert isAsciiUpper(code)
            if pos + 4 > dataLen:
                log.error(corruptMsg)
                return None
            size = binStrToInt(data[pos:pos + 4])
            pos += 4
            if pos + size > dataLen:
                log.error(corruptMsg)
                return None
            parts.append((data[pos:pos + size], code))
            pos += size
    return parts
def parseDefiBlockGeneral(self, b_block):
    """
    Parse definition block when sametypesequence option is not
    specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    parts = []
    pos = 0
    while pos < len(b_block):
        code = b_block[pos]
        ch = bytes([code])
        if not ch.isalpha():
            return None
        pos += 1
        if ch.islower():
            # lower-case codes: value is NUL-terminated
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            # upper-case codes: 4-byte big length prefix
            assert ch.isupper()
            if pos + 4 > len(b_block):
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > len(b_block):
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    return parts
def parseDefiBlockGeneral(self, b_block: bytes) -> List[Tuple[bytes, int]]:
    """
    Parse definition block when sametypesequence option is not
    specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    parts = []
    pos = 0
    blockLen = len(b_block)
    while pos < blockLen:
        code = b_block[pos]
        ch = bytes([code])
        if not ch.isalpha():
            return None
        pos += 1
        if ch.islower():
            # lower-case codes: NUL-terminated value
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            # upper-case codes: 4-byte length prefix
            assert ch.isupper()
            if pos + 4 > blockLen:
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > blockLen:
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    return parts
def openGzip(self):
    """Find the embedded gzip stream in the BGL file and open it
    via BGLGzipFile; returns True on success, False otherwise."""
    with open(self._filename, "rb") as bglFile:
        if not bglFile:
            log.error("file pointer empty: %s" % bglFile)
            return False
        header = bglFile.read(6)
    if len(header) < 6 or header[:4] not in (
        b"\x12\x34\x00\x01",
        b"\x12\x34\x00\x02",
    ):
        log.error("invalid header: %r" % header[:6])
        return False
    offset = binStrToInt(header[4:6])
    self.gzipOffset = offset
    log.debug("Position of gz header: %s" % offset)
    if offset < 6:
        log.error("invalid gzip header position: %s" % offset)
        return False
    self.file = BGLGzipFile(
        fileobj=FileOffS(self._filename, offset),
        closeFileobj=True,
    )
    return True
def languageInfoDecode(b_value):
    """
    Decode a binary language code.
    returns BabylonLanguage instance, or None for an unknown code.
    """
    code = binStrToInt(b_value)
    try:
        return languageByCode[code]
    except IndexError:
        # NOTE(review): catching IndexError assumes languageByCode is
        # an indexable sequence, not a dict -- confirm.
        log.warning("read_type_3: unknown language code = %s" % code)
        return
def languageInfoDecode(b_value):
    """
    Decode a binary language code.
    returns BabylonLanguage instance, or None (with a warning)
    for an unknown code.
    """
    code = binStrToInt(b_value)
    try:
        return languageByCode[code]
    except IndexError:
        # NOTE(review): IndexError implies languageByCode is an
        # indexable sequence, not a dict -- confirm.
        log.warning("read_type_3: unknown language code = %s", code)
        return
def parseDefiBlockCompact(self, data, sametypesequence, rawWord):
    """
    Parse definition block when sametypesequence option is specified.

    data and rawWord are bytes; returns a list of
    (defiBytes, typeCode) pairs, or None on corruption.
    """
    seq = toBytes(sametypesequence)
    assert len(seq) > 0
    corruptMsg = "Data file is corrupted. Word \"{0}\"".format(
        rawWord)
    parts = []
    pos = 0
    # every code except the last has an explicit terminator or size
    for code in seq[:-1]:
        if pos >= len(data):
            log.error(corruptMsg)
            return None
        if isAsciiLower(code):
            nul = data.find(b'\x00', pos)
            if nul < 0:
                log.error(corruptMsg)
                return None
            parts.append((data[pos:nul], code))
            pos = nul + 1
        else:
            assert isAsciiUpper(code)
            if pos + 4 > len(data):
                log.error(corruptMsg)
                return None
            size = binStrToInt(data[pos:pos + 4])
            pos += 4
            if pos + size > len(data):
                log.error(corruptMsg)
                return None
            parts.append((data[pos:pos + size], code))
            pos += size
    if pos >= len(data):
        log.error(corruptMsg)
        return None
    # the final field runs to the end of the block
    lastCode = seq[-1]
    if isAsciiLower(lastCode):
        # a lower-case final field must not contain an embedded NUL
        if 0 in data[pos:]:
            log.error(corruptMsg)
            return None
        parts.append((data[pos:], lastCode))
    else:
        assert isAsciiUpper(lastCode)
        parts.append((data[pos:], lastCode))
    return parts
def readType3(self, block):
    """
    reads block with type 3, and updates self.info
    returns None
    """
    # first 2 bytes: info-key code; the rest is the raw value
    code, b_value = binStrToInt(block.data[:2]), block.data[2:]
    if not b_value:
        return
    # if not b_value.strip(b"\x00"): return  # FIXME
    try:
        key = infoKeysByCode[code]
    except KeyError:
        # only log when the value carries more than NUL padding
        if b_value.strip(b"\x00"):
            log.debug(
                f"Unknown info type code={code:#02x}, b_value={b_value!r}",
            )
        return
    value = None
    # decode the raw value when a decoder is registered for this key;
    # otherwise keep the raw bytes
    func = infoKeyDecodeMethods.get(key)
    if func is None:
        value = b_value
    else:
        value = func(b_value)
    # `value` can be None, str, bytes or dict
    if not value:
        return
    if isinstance(value, dict):
        # a dict merges directly into the info mapping
        self.info.update(value)
        return
    # these keys are stored as attributes on self, not in self.info
    if key in {
        "sourceLang",
        "targetLang",
        "defaultCharset",
        "sourceCharset",
        "targetCharset",
        "sourceEncoding",
        "targetEncoding",
        "bgl_numEntries",
        "iconData",
    }:
        setattr(self, key, value)
        return
    self.info[key] = value
def parseDefiBlockCompact(self, data, sametypesequence, word):
    """
    Parse definition block when sametypesequence option is specified.

    Returns a list of (defiData, typeCode) pairs, or None when the
    block is malformed.
    """
    sametypesequence = toBytes(sametypesequence)
    assert len(sametypesequence) > 0
    dataFileCorruptedError = "Data file is corrupted. Word \"{0}\"".format(word)
    res = []
    i = 0
    for t in sametypesequence[:-1]:
        if i >= len(data):
            log.error(dataFileCorruptedError)
            return None
        if isAsciiLower(t):
            beg = i
            # FIX: data is bytes -- the str pattern '\x00' raises
            # TypeError; search with b'\x00'.
            i = data.find(b'\x00', beg)
            if i < 0:
                log.error(dataFileCorruptedError)
                return None
            res.append((data[beg:i], t))
            i += 1
        else:
            assert isAsciiUpper(t)
            if i + 4 > len(data):
                log.error(dataFileCorruptedError)
                return None
            size = binStrToInt(data[i:i + 4])
            i += 4
            if i + size > len(data):
                log.error(dataFileCorruptedError)
                return None
            res.append((data[i:i + size], t))
            i += size
    if i >= len(data):
        log.error(dataFileCorruptedError)
        return None
    t = sametypesequence[-1]
    if isAsciiLower(t):
        # the final field must not contain an embedded NUL
        # FIX: bytes pattern here as well
        i2 = data.find(b'\x00', i)
        if i2 >= 0:
            log.error(dataFileCorruptedError)
            return None
        res.append((data[i:], t))
    else:
        assert isAsciiUpper(t)
        res.append((data[i:], t))
    return res
def parseDefiBlockCompact(self, data, sametypesequence, word):
    """Parse definition block when sametypesequence option is
    specified.
    """
    # NOTE(review): this revision searches `data` with a str pattern
    # ('\x00') and asserts sametypesequence is str -- presumably a
    # Python 2 regime where `data` is str; confirm before running
    # under Python 3, where bytes.find would need b'\x00'.
    assert type(sametypesequence) == str
    assert len(sametypesequence) > 0
    dataFileCorruptedError = "Data file is corrupted. Word \"{0}\"".format(
        word)
    res = []
    i = 0
    # every type code except the last carries an explicit
    # terminator (lower-case) or 4-byte size prefix (upper-case)
    for t in sametypesequence[:-1]:
        if i >= len(data):
            printAsError(dataFileCorruptedError)
            return None
        if isAsciiLower(t):
            beg = i
            i = data.find('\x00', beg)
            if i < 0:
                printAsError(dataFileCorruptedError)
                return None
            res.append((data[beg:i], t))
            i += 1
        else:
            assert isAsciiUpper(t)
            if i + 4 > len(data):
                printAsError(dataFileCorruptedError)
                return None
            size = binStrToInt(data[i:i + 4])
            i += 4
            if i + size > len(data):
                printAsError(dataFileCorruptedError)
                return None
            res.append((data[i:i + size], t))
            i += size
    if i >= len(data):
        printAsError(dataFileCorruptedError)
        return None
    # the final field runs to the end of the block
    t = sametypesequence[-1]
    if isAsciiLower(t):
        # a lower-case final field must not contain an embedded NUL
        i2 = data.find('\x00', i)
        if i2 >= 0:
            printAsError(dataFileCorruptedError)
            return None
        res.append((data[i:], t))
    else:
        assert isAsciiUpper(t)
        res.append((data[i:], t))
    return res
def parseDefiBlockCompact(
    self,
    b_block: bytes,
    sametypesequence: str,
) -> List[Tuple[bytes, int]]:
    """
    Parse definition block when sametypesequence option is specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    seq = sametypesequence.encode("utf-8")
    assert len(seq) > 0
    parts = []
    pos = 0
    blockLen = len(b_block)
    # every code except the last carries an explicit terminator/size
    for code in seq[:-1]:
        if pos >= blockLen:
            return None
        if bytes([code]).islower():
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            assert bytes([code]).isupper()
            if pos + 4 > blockLen:
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > blockLen:
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    if pos >= blockLen:
        return None
    # the final field runs to the end of the block
    lastCode = seq[-1]
    if bytes([lastCode]).islower():
        # a lower-case final field must not contain an embedded NUL
        if 0 in b_block[pos:]:
            return None
        parts.append((b_block[pos:], lastCode))
    else:
        assert bytes([lastCode]).isupper()
        parts.append((b_block[pos:], lastCode))
    return parts
def readType0(self, block):
    """Handle a type-0 block; returns True when the block code was
    recognized, False otherwise."""
    code = block.data[0]
    if code == 2:
        # this number is very close to self.bgl_numEntries,
        # but does not always equal the number of entries
        # see self.readType3, code == 12 as well
        num = binStrToInt(block.data[1:])
        return True
    if code == 8:
        self.defaultCharset = charsetInfoDecode(block.data[1:])
        if not self.defaultCharset:
            log.warning("defaultCharset is not valid")
        return True
    self.logUnknownBlock(block)
    return False
def utf16InfoDecode(b_value):
    """
    b_value is byte array
    returns str, or None (on errors)

    block type = 3
    block format: <2 byte code1><2 byte code2>
    if code2 == 0: then the block ends
    if code2 == 1: then the block continues as follows:
    <4 byte len1> \x00 \x00 <message in utf-16>
    len1 - length of message in 2-byte chars
    """
    if b_value[0] != 0:
        log.warning(
            "utf16InfoDecode: b_value=%s, null expected at 0",
            b_value,
        )
        return None
    marker = b_value[1]
    if marker == 0:
        # block ends; extra bytes beyond the header are suspicious
        if len(b_value) > 2:
            log.warning(
                "utf16InfoDecode: unexpected b_value size: %s",
                len(b_value),
            )
        return None
    if marker > 1:
        log.warning(
            "utf16InfoDecode: b_value=%s, unexpected byte at 1",
            list(b_value),
        )
        return None
    # marker == 1: a utf-16 message follows
    msgSize = 2 * binStrToInt(b_value[2:6])
    if tuple(b_value[6:8]) != (0, 0):
        log.warning(
            "utf16InfoDecode: b_value=%s, null expected at 6:8",
            list(b_value),
        )
    if msgSize != len(b_value) - 8:
        log.warning(
            "utf16InfoDecode: b_value=%s, size does not match",
            list(b_value),
        )
    return b_value[8:].decode("utf16")  # str
def readType3(self, block):
    """
    reads block with type 3, and updates self.info
    returns None
    """
    # first 2 bytes: info-key code; the rest is the raw value
    code, b_value = binStrToInt(block.data[:2]), block.data[2:]
    if not b_value:
        return
    # if not b_value.strip(b"\x00"): return  # FIXME
    try:
        key = infoKeysByCode[code]
    except KeyError:
        # only log when the value carries more than NUL padding
        if b_value.strip(b"\x00"):
            log.debug(
                "Unknown info type code=%#.2x" % code +
                ", b_value=%r" % b_value,
            )
        return
    # decode with the registered decoder when one exists,
    # otherwise keep the raw bytes
    try:
        func = infoKeyDecodeMethods[key]
    except KeyError:
        value = b_value
    else:
        value = func(b_value)
    # `value` can be a bytes instance,
    # or str instance, depending on `key` FIXME
    if value:
        if isinstance(value, dict):
            # a dict merges directly into the info mapping
            self.info.update(value)
        elif key in {
            # these keys become attributes on self, not info entries
            "sourceLang",
            "targetLang",
            "defaultCharset",
            "sourceCharset",
            "targetCharset",
            "sourceEncoding",
            "targetEncoding",
            "bgl_numEntries",
            "iconData",
        }:
            setattr(self, key, value)
        else:
            self.info[key] = value
def parseDefiBlockCompact(self, b_block, sametypesequence):
    """
    Parse definition block when sametypesequence option is specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    assert isinstance(b_block, bytes)
    seq = sametypesequence.encode("utf-8")
    assert len(seq) > 0
    parts = []
    pos = 0
    blockLen = len(b_block)
    # all codes but the last carry an explicit terminator/size
    for code in seq[:-1]:
        if pos >= blockLen:
            return None
        if bytes([code]).islower():
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            assert bytes([code]).isupper()
            if pos + 4 > blockLen:
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > blockLen:
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    if pos >= blockLen:
        return None
    # the final field runs to the end of the block
    lastCode = seq[-1]
    if bytes([lastCode]).islower():
        # a lower-case final field must not contain an embedded NUL
        if 0 in b_block[pos:]:
            return None
        parts.append((b_block[pos:], lastCode))
    else:
        assert bytes([lastCode]).isupper()
        parts.append((b_block[pos:], lastCode))
    return parts
def readBytes(self, num):
    """
    Read `num` (1..4) bytes from self.file and decode them as an
    unsigned integer.
    return -1 if error
    """
    if not 1 <= num <= 4:
        log.error(f"invalid argument num={num}")
        return -1
    self.file.flush()
    chunk = self.file.read(num)
    if len(chunk) == 0:
        log.debug("readBytes: end of file: len(buf)==0")
        return -1
    if len(chunk) != num:
        log.error(
            f"readBytes: expected to read {num} bytes"
            f", but found {len(chunk)} bytes"
        )
        return -1
    return binStrToInt(chunk)
def flagsInfoDecode(b_value):
    """
    returns a dict with these keys:
        utf8Encoding
            when this flag is set utf8 encoding is used for all
            articles; when false, the encoding is set according to
            the source and target alphabet
        spellingAlternatives
            determines whether the glossary offers spelling
            alternatives for searched terms
        caseSensitive
            defines if the search for terms in this glossary
            is case sensitive
            see code 0x20 as well
    """
    flags = binStrToInt(b_value)
    utf8Encoding = (flags & 0x8000) != 0
    spellingAlternatives = (flags & 0x10000) == 0
    caseSensitive = (flags & 0x1000) != 0
    return {
        'utf8Encoding': utf8Encoding,
        'spellingAlternatives': spellingAlternatives,
        'caseSensitive': caseSensitive,
    }
def readBytes(self, num):
    """
    Read `num` (1..4) bytes from self.file and decode them as an
    unsigned integer.
    return -1 if error
    """
    if not 1 <= num <= 4:
        log.error("invalid argument num=%s" % num)
        return -1
    self.file.flush()
    chunk = self.file.read(num)
    if len(chunk) == 0:
        log.debug("readBytes: end of file: len(buf)==0")
        return -1
    if len(chunk) != num:
        log.error(
            "readBytes: expected to read %s bytes" % num +
            ", but found %s bytes" % len(chunk)
        )
        return -1
    return binStrToInt(chunk)
def readSynFile(self):
    """
    return synDict, a dict { wordIndex -> altList }
    """
    synPath = self._filename + ".syn"
    if not isfile(synPath):
        return {}
    with open(synPath, "rb") as synFile:
        raw = synFile.read()
    total = len(raw)
    synDict = {}
    cursor = 0
    while cursor < total:
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Synonym file is corrupted")
            break
        b_alt = raw[cursor:nul]  # b_alt is bytes
        cursor = nul + 1
        if cursor + 4 > total:
            log.error("Synonym file is corrupted")
            break
        wordIndex = binStrToInt(raw[cursor:cursor + 4])
        cursor += 4
        if wordIndex >= self._wordCount:
            log.error(
                "Corrupted synonym file. " +
                "Word \"%s\" references invalid item" % b_alt
            )
            continue
        # collect alternates as str, grouped by word index
        synDict.setdefault(wordIndex, []).append(b_alt.decode("utf-8"))
    return synDict
def readSynFile(self) -> Dict[int, List[str]]:
    """
    return synDict, a dict { wordIndex -> altList }
    """
    synPath = self._filename + ".syn"
    if not isfile(synPath):
        return {}
    with open(synPath, "rb") as synFile:
        raw = synFile.read()
    total = len(raw)
    synDict = {}
    cursor = 0
    while cursor < total:
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Synonym file is corrupted")
            break
        b_alt = raw[cursor:nul]  # b_alt is bytes
        cursor = nul + 1
        if cursor + 4 > total:
            log.error("Synonym file is corrupted")
            break
        wordIndex = binStrToInt(raw[cursor:cursor + 4])
        cursor += 4
        if wordIndex >= self._wordCount:
            log.error(
                "Corrupted synonym file. " +
                "Word \"%s\" references invalid item" % b_alt
            )
            continue
        # collect alternates as str, grouped by word index
        synDict.setdefault(wordIndex, []).append(b_alt.decode("utf-8"))
    return synDict
def readSynFile(self):
    """
    Read the .syn file and append each synonym to the matching
    entry's alternates list (self.indexData[index][4]).
    Does nothing when the .syn file is absent.
    """
    if not isfile(self.fileBasePath + '.syn'):
        return
    with open(self.fileBasePath + '.syn', 'rb') as f:
        synStr = f.read()
    i = 0
    while i < len(synStr):
        beg = i
        # FIX: synStr is bytes (opened 'rb'); the str pattern '\x00'
        # raises TypeError -- search with b'\x00'.
        i = synStr.find(b'\x00', beg)
        if i < 0:
            log.error("Synonym file is corrupted.")
            break
        word = synStr[beg:i]
        i += 1
        if i + 4 > len(synStr):
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(synStr[i:i+4])
        i += 4
        if index >= len(self.indexData):
            log.error("Corrupted synonym file. Word \"{0}\" references invalid item.".format(word))
            continue
        self.indexData[index][4].append(word)
def readEntry_Type11(self, block):
    """return (succeed, u_word, u_alts, u_defi)"""
    Err = (False, None, None, None)
    pos = 0
    # reading headword: 5-byte length prefix, then the raw word
    if pos + 5 > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading word size: pos + 5 > len(block.data)"
        )
        return Err
    wordLen = binStrToInt(block.data[pos:pos + 5])
    pos += 5
    if pos + wordLen > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading word: pos + wordLen > len(block.data)"
        )
        return Err
    b_word = block.data[pos:pos + wordLen]
    u_word = self.processKey(b_word)
    pos += wordLen
    # track the longest headword seen (in decoded characters)
    self.wordLenMax = max(self.wordLenMax, len(u_word))
    # reading alts and defi: 4-byte count of alternates
    if pos + 4 > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading defi size: pos + 4 > len(block.data)"
        )
        return Err
    altsCount = binStrToInt(block.data[pos:pos + 4])
    pos += 4
    # reading alts
    # use set instead of list to prevent duplicates
    u_alts = set()
    for altIndex in range(altsCount):
        if pos + 4 > len(block.data):
            log.error(
                f"reading block offset={block.offset:#02x}"
                f", reading alt size: pos + 4 > len(block.data)"
            )
            return Err
        altLen = binStrToInt(block.data[pos:pos + 4])
        pos += 4
        if altLen == 0:
            # a zero-length alt terminates the alternates list early
            if pos + altLen != len(block.data):
                # no evidence
                log.warning(
                    f"reading block offset={block.offset:#02x}"
                    f", reading alt size: pos + altLen != len(block.data)"
                )
            break
        if pos + altLen > len(block.data):
            log.error(
                f"reading block offset={block.offset:#02x}"
                f", block.type={block.type}"
                f", reading alt: pos + altLen > len(block.data)"
            )
            return Err
        b_alt = block.data[pos:pos + altLen]
        u_alt = self.processAlternativeKey(b_alt, b_word)
        # Like entry key, alt is not processed as html by babylon,
        # so do we.
        u_alts.add(u_alt)
        pos += altLen
    # the headword itself must not appear among its own alternates
    if u_word in u_alts:
        u_alts.remove(u_word)
    u_alts = list(sorted(u_alts))
    # reading defi
    # NOTE(review): no bounds check (pos + 4 > len) before reading the
    # defi length here, unlike the reads above -- slicing just yields
    # fewer bytes; confirm whether this is intentional.
    defiLen = binStrToInt(block.data[pos:pos + 4])
    pos += 4
    if pos + defiLen > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading defi: pos + defiLen > len(block.data)"
        )
        return Err
    b_defi = block.data[pos:pos + defiLen]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    pos += defiLen
    return True, u_word, u_alts, u_defi
def decodeBglBinTime(b_value):
    """Decode a BGL binary timestamp (minutes, Julian-day based,
    relative to 1970-01-01) into a 'YYYY/MM/DD, HH:MM' string."""
    epochJd = gregorian.to_jd(1970, 1, 1)
    days, minutesOfDay = divmod(binStrToInt(b_value), 24 * 60)
    year, month, day = gregorian.jd_to(days + epochJd)
    hour, minute = divmod(minutesOfDay, 60)
    return "%.2d/%.2d/%.2d, %.2d:%.2d" % (year, month, day, hour, minute)
def decodeBglBinTime(b_value):
    """Decode a BGL binary timestamp (minutes, Julian-day based,
    relative to 1970-01-01) into a 'YYYY/MM/DD, HH:MM' string."""
    epochJd = gregorian.to_jd(1970, 1, 1)
    totalMinutes = binStrToInt(b_value)
    days, minutesOfDay = divmod(totalMinutes, 24 * 60)
    year, month, day = gregorian.jd_to(days + epochJd)
    hour, minute = divmod(minutesOfDay, 60)
    return f"{year:04d}/{month:02d}/{day:02d}, {hour:02d}:{minute:02d}"
def decodeBglBinTime(b_value):
    """Decode a BGL binary timestamp (minutes, Julian-day based,
    relative to 1970-01-01) into a 'YYYY/MM/DD, HH:MM' string."""
    unixEpochJd = gregorian.to_jd(1970, 1, 1)
    dayCount, remainderMinutes = divmod(binStrToInt(b_value), 24 * 60)
    year, month, day = gregorian.jd_to(dayCount + unixEpochJd)
    hour, minute = divmod(remainderMinutes, 60)
    return "%.2d/%.2d/%.2d, %.2d:%.2d" % (year, month, day, hour, minute)
def readEntry_Type11(self, block):
    """return (succeed, u_word, u_alts, u_defi)"""
    Err = (False, None, None, None)
    pos = 0
    # reading headword: 5-byte length prefix, then the raw word
    if pos + 5 > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", reading word size: pos + 5 > len(block.data)"
        )
        return Err
    wordLen = binStrToInt(block.data[pos:pos+5])
    pos += 5
    if pos + wordLen > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", block.type=%s" % block.type +
            ", reading word: pos + wordLen > len(block.data)"
        )
        return Err
    b_word = block.data[pos:pos+wordLen]
    u_word = self.processKey(b_word)
    pos += wordLen
    # track the longest headword seen (in decoded characters)
    self.wordLenMax = max(self.wordLenMax, len(u_word))
    # reading alts and defi: 4-byte count of alternates
    if pos + 4 > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", reading defi size: pos + 4 > len(block.data)"
        )
        return Err
    altsCount = binStrToInt(block.data[pos:pos+4])
    pos += 4
    # reading alts
    # use set instead of list to prevent duplicates
    u_alts = set()
    for altIndex in range(altsCount):
        if pos + 4 > len(block.data):
            log.error(
                "reading block offset=%#.2x" % block.offset +
                ", reading alt size: pos + 4 > len(block.data)"
            )
            return Err
        altLen = binStrToInt(block.data[pos:pos+4])
        pos += 4
        if altLen == 0:
            # a zero-length alt terminates the alternates list early
            if pos + altLen != len(block.data):
                # no evidence
                log.warning(
                    "reading block offset=%#.2x" % block.offset +
                    ", reading alt size: pos + altLen != len(block.data)"
                )
            break
        if pos + altLen > len(block.data):
            log.error(
                "reading block offset=%#.2x" % block.offset +
                ", block.type=%s" % block.type +
                ", reading alt: pos + altLen > len(block.data)"
            )
            return Err
        b_alt = block.data[pos:pos+altLen]
        u_alt = self.processAlternativeKey(b_alt, b_word)
        # Like entry key, alt is not processed as html by babylon,
        # so do we.
        u_alts.add(u_alt)
        pos += altLen
    # the headword itself must not appear among its own alternates
    if u_word in u_alts:
        u_alts.remove(u_word)
    u_alts = list(sorted(u_alts))
    # reading defi
    # NOTE(review): no bounds check (pos + 4 > len) before reading the
    # defi length here, unlike the reads above -- slicing just yields
    # fewer bytes; confirm whether this is intentional.
    defiLen = binStrToInt(block.data[pos:pos+4])
    pos += 4
    if pos + defiLen > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", block.type=%s" % block.type +
            ", reading defi: pos + defiLen > len(block.data)"
        )
        return Err
    b_defi = block.data[pos:pos+defiLen]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    pos += defiLen
    return True, u_word, u_alts, u_defi
def collectDefiFields(self, b_defi, b_key, fields):
    """
    Parse the trailing metadata fields of a definition into *fields*.

    entry definition structure:
    <main definition>['\x14'[{field_code}{field_data}]*]
    {field_code} is one character
    {field_data} has arbitrary length

    b_defi: raw definition bytes.
    b_key: raw entry key bytes (used here only for log messages and as
    context for self.findDefiFieldsStart's caller).
    fields: mutable accumulator whose attributes (b_defi, partOfSpeech,
    b_title, b_field_06, ...) are assigned in place; nothing is returned.
    On malformed or truncated data, a debug message is logged and the
    method returns early, leaving *fields* partially populated.
    """
    # d0 is index of the '\x14 char in b_defi
    # d0 may be the last char of the string
    d0 = self.findDefiFieldsStart(b_defi)
    if d0 == -1:
        # no trailing-fields marker: the whole payload is the definition
        fields.b_defi = b_defi
        return
    fields.b_defi = b_defi[:d0]
    # i always points at the next field-code byte
    i = d0 + 1
    while i < len(b_defi):
        if self.metadata2:
            # optional stats collector: count occurrences of each field code
            self.metadata2.defiTrailingFields[b_defi[i]] += 1
        if b_defi[i] == 0x02:
            # part of speech
            # "\x02" <one char - part of speech>
            if fields.partOfSpeech:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\n" % b_key +
                    "duplicate part of speech item",
                )
            if i+1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nb_defi ends after \\x02" % b_key
                )
                return
            posCode = b_defi[i+1]
            try:
                fields.partOfSpeech = partOfSpeechByCode[posCode]
            except KeyError:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\n" % b_key +
                    "unknown part of speech code = %#.2x" % posCode
                )
                return
            i += 2
        elif b_defi[i] == 0x06:
            # \x06<one byte>
            if fields.b_field_06:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nduplicate type 6" % b_key
                )
            if i+1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nb_defi ends after \\x06" % b_key
                )
                return
            fields.b_field_06 = b_defi[i+1]
            i += 2
        elif b_defi[i] == 0x07:
            # \x07<two bytes>
            # Found in 4 Hebrew dictionaries. I do not understand.
            if i+3 > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x07" % b_key
                )
                return
            fields.b_field_07 = b_defi[i+1:i+3]
            i += 3
        elif b_defi[i] == 0x13:
            # "\x13"<one byte - length><data>
            # known values:
            # 03 06 0D C7
            # 04 00 00 00 44
            # ...
            # 04 00 00 00 5F
            if i + 1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x13" % b_key
                )
                return
            Len = b_defi[i+1]
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x13" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x13" % b_key
                )
                return
            fields.b_field_13 = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x18:
            # \x18<one byte - title length><entry title>
            if fields.b_title:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nduplicate entry title item" % b_key
                )
            if i+1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nb_defi ends after \\x18" % b_key
                )
                return
            i += 1
            Len = b_defi[i]
            i += 1
            if Len == 0:
                # zero-length title is tolerated silently
                # log.debug(
                #     "collecting definition fields, b_defi = %r\n" % b_defi +
                #     "b_key = %r:\nblank entry title" % b_key
                # )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntitle is too long" % b_key
                )
                return
            fields.b_title = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x1a:
            # "\x1a"<one byte - length><text>
            # found only in Hebrew dictionaries, I do not understand.
            if i + 1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %s:\ntoo few data after \\x1a" % b_key
                )
                return
            Len = b_defi[i+1]
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x1a" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x1a" % b_key
                )
                return
            fields.b_field_1a = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x28:
            # "\x28" <two bytes - length><html text>
            # title with transcription?
            if i + 2 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x28" % b_key
                )
                return
            i += 1
            Len = binStrToInt(b_defi[i:i+2])
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x28" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x28" % b_key
                )
                return
            fields.b_title_trans = b_defi[i:i+Len]
            i += Len
        elif 0x40 <= b_defi[i] <= 0x4f:
            # [\x41-\x4f] <one byte> <text>
            # often contains digits as text:
            # 56
            # ælps - key Alps
            # 48@i
            # has no apparent influence on the article
            code = b_defi[i]
            # payload length is encoded in the field code itself
            Len = b_defi[i] - 0x3f
            if i+2+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x40+" % b_key
                )
                return
            i += 2
            b_text = b_defi[i:i+Len]
            i += Len
            # data is skipped, only logged; field meaning is unknown
            log.debug(
                "\nunknown definition field %#.2x" % code +
                ", b_text=%r" % b_text
            )
        elif b_defi[i] == 0x50:
            # \x50 <one byte> <one byte - length><data>
            if i + 2 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x50" % b_key
                )
                return
            fields.code_transcription_50 = b_defi[i+1]
            Len = b_defi[i+2]
            i += 3
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x50" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x50" % b_key
                )
                return
            fields.b_transcription_50 = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x60:
            # "\x60" <one byte> <two bytes - length> <text>
            if i + 4 > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x60" % b_key
                )
                return
            fields.code_transcription_60 = b_defi[i+1]
            i += 2
            Len = binStrToInt(b_defi[i:i+2])
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x60" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x60" % b_key
                )
                return
            fields.b_transcription_60 = b_defi[i:i+Len]
            i += Len
        else:
            # unknown field code: cannot determine its length, so stop
            log.debug(
                "collecting definition fields, " +
                "b_defi = %r\n" % b_defi +
                "b_key = %r:\n" % b_key +
                "unknown control char. Char code = %#.2x" % b_defi[i]
            )
            return
def collectDefiFields(self, b_defi, b_key, fields):
    """
    Parse the trailing metadata fields of a definition into *fields*.

    entry definition structure:
    <main definition>['\x14'[{field_code}{field_data}]*]
    {field_code} is one character
    {field_data} has arbitrary length

    b_defi: raw definition bytes.
    b_key: raw entry key bytes (used here only for log messages).
    fields: mutable accumulator whose attributes (b_defi, partOfSpeech,
    b_title, b_field_06, ...) are assigned in place; nothing is returned.
    On malformed or truncated data, a debug message is logged and the
    method returns early, leaving *fields* partially populated.
    """
    # d0 is index of the '\x14 char in b_defi
    # d0 may be the last char of the string
    d0 = self.findDefiFieldsStart(b_defi)
    if d0 == -1:
        # no trailing-fields marker: the whole payload is the definition
        fields.b_defi = b_defi
        return
    fields.b_defi = b_defi[:d0]
    # i always points at the next field-code byte
    i = d0 + 1
    while i < len(b_defi):
        if self.metadata2:
            # optional stats collector: count occurrences of each field code
            self.metadata2.defiTrailingFields[b_defi[i]] += 1
        if b_defi[i] == 0x02:
            # part of speech
            # "\x02" <one char - part of speech>
            if fields.partOfSpeech:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}"
                    f":\nduplicate part of speech item",
                )
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nb_defi ends after \\x02"
                )
                return
            posCode = b_defi[i + 1]
            try:
                fields.partOfSpeech = partOfSpeechByCode[posCode]
            except KeyError:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}"
                    f":\nunknown part of speech code = {posCode:#02x}"
                )
                return
            i += 2
        elif b_defi[i] == 0x06:
            # \x06<one byte>
            if fields.b_field_06:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nduplicate type 6"
                )
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nb_defi ends after \\x06"
                )
                return
            fields.b_field_06 = b_defi[i + 1]
            i += 2
        elif b_defi[i] == 0x07:
            # \x07<two bytes>
            # Found in 4 Hebrew dictionaries. I do not understand.
            if i + 3 > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x07"
                )
                return
            fields.b_field_07 = b_defi[i + 1:i + 3]
            i += 3
        elif b_defi[i] == 0x13:
            # "\x13"<one byte - length><data>
            # known values:
            # 03 06 0D C7
            # 04 00 00 00 44
            # ...
            # 04 00 00 00 5F
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x13"
                )
                return
            Len = b_defi[i + 1]
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n"
                    f"b_key = {b_key!r}:\nblank data after \\x13"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n" +
                    f"b_key = {b_key!r}:\ntoo few data after \\x13"
                )
                return
            fields.b_field_13 = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x18:
            # \x18<one byte - title length><entry title>
            if fields.b_title:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"b_key = {b_key!r}:\nduplicate entry title item"
                )
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n"
                    f"b_key = {b_key!r}:\nb_defi ends after \\x18"
                )
                return
            i += 1
            Len = b_defi[i]
            i += 1
            if Len == 0:
                # zero-length title is tolerated silently
                # log.debug(
                #     f"collecting definition fields, b_defi = {b_defi!r}\n"
                #     f"b_key = {b_key!r}:\nblank entry title"
                # )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n"
                    f"b_key = {b_key!r}:\ntitle is too long"
                )
                return
            fields.b_title = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x1a:
            # "\x1a"<one byte - length><text>
            # found only in Hebrew dictionaries, I do not understand.
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key}:\ntoo few data after \\x1a"
                )
                return
            Len = b_defi[i + 1]
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x1a"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x1a"
                )
                return
            fields.b_field_1a = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x28:
            # "\x28" <two bytes - length><html text>
            # title with transcription?
            if i + 2 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x28"
                )
                return
            i += 1
            Len = binStrToInt(b_defi[i:i + 2])
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x28"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x28"
                )
                return
            fields.b_title_trans = b_defi[i:i + Len]
            i += Len
        elif 0x40 <= b_defi[i] <= 0x4f:
            # [\x41-\x4f] <one byte> <text>
            # often contains digits as text:
            # 56
            # ælps - key Alps
            # 48@i
            # has no apparent influence on the article
            code = b_defi[i]
            # payload length is encoded in the field code itself
            Len = b_defi[i] - 0x3f
            if i + 2 + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x40+"
                )
                return
            i += 2
            b_text = b_defi[i:i + Len]
            i += Len
            # data is skipped, only logged; field meaning is unknown
            log.debug(
                f"unknown definition field {code:#02x}, b_text={b_text!r}"
            )
        elif b_defi[i] == 0x50:
            # \x50 <one byte> <one byte - length><data>
            if i + 2 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x50"
                )
                return
            fields.code_transcription_50 = b_defi[i + 1]
            Len = b_defi[i + 2]
            i += 3
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x50"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x50"
                )
                return
            fields.b_transcription_50 = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x60:
            # "\x60" <one byte> <two bytes - length> <text>
            if i + 4 > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x60"
                )
                return
            fields.code_transcription_60 = b_defi[i + 1]
            i += 2
            Len = binStrToInt(b_defi[i:i + 2])
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x60"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}" +
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x60"
                )
                return
            fields.b_transcription_60 = b_defi[i:i + Len]
            i += Len
        else:
            # unknown field code: cannot determine its length, so stop
            log.debug(
                f"collecting definition fields, b_defi = {b_defi!r}"
                f"\nb_key = {b_key!r}"
                f":\nunknown control char. Char code = {b_defi[i]:#02x}"
            )
            return