def readIdxFile(self):
    """
    Read the .idx (or gzipped .idx.gz) index file into self.indexData.

    Each item appended is [word, offset, size, [], []] where word is the
    raw headword (bytes), offset/size locate the article in the dict
    file, and the two empty lists are filled in later.
    """
    if isfile(self.fileBasePath + '.idx.gz'):
        import gzip
        with gzip.open(self.fileBasePath + '.idx.gz') as f:
            idxStr = f.read()
    else:
        with open(self.fileBasePath + '.idx', 'rb') as f:
            idxStr = f.read()
    self.indexData = []
    i = 0
    while i < len(idxStr):
        beg = i
        # FIX: idxStr is bytes (binary/gzip read), so the NUL separator
        # must be a bytes pattern -- a str '\x00' raises TypeError.
        i = idxStr.find(b'\x00', beg)
        if i < 0:
            log.error("Index file is corrupted.")
            break
        word = idxStr[beg:i]
        i += 1
        if i + 8 > len(idxStr):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(idxStr[i:i + 4])
        i += 4
        size = binStrToInt(idxStr[i:i + 4])
        i += 4
        self.indexData.append([word, offset, size, [], []])
def readIdxFile(self):
    """
    Read the .idx (or gzipped .idx.gz) index file into self.indexData.

    Each item appended is [word, offset, size, [], []]; the trailing
    empty lists are populated later (definitions / synonyms).
    """
    if isfile(self.fileBasePath + ".idx.gz"):
        import gzip
        with gzip.open(self.fileBasePath + ".idx.gz") as f:
            idxStr = f.read()
    else:
        with open(self.fileBasePath + ".idx", "rb") as f:
            idxStr = f.read()
    self.indexData = []
    i = 0
    while i < len(idxStr):
        beg = i
        # FIX: idxStr is bytes, so search with a bytes separator;
        # the str pattern "\x00" raises TypeError on bytes.
        i = idxStr.find(b"\x00", beg)
        if i < 0:
            log.error("Index file is corrupted.")
            break
        word = idxStr[beg:i]
        i += 1
        if i + 8 > len(idxStr):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(idxStr[i : i + 4])
        i += 4
        size = binStrToInt(idxStr[i : i + 4])
        i += 4
        self.indexData.append([word, offset, size, [], []])
def readIdxFile(self):
    """Parse the .idx (or .idx.gz) file and return the index entries.

    Each entry is a [b_word, offset, size] list; b_word is raw bytes,
    offset/size locate the article in the dict file.
    """
    gzPath = self._filename + ".idx.gz"
    if isfile(gzPath):
        with gzip.open(gzPath) as idxFile:
            raw = idxFile.read()
    else:
        with open(self._filename + ".idx", "rb") as idxFile:
            raw = idxFile.read()
    entries = []
    cursor = 0
    total = len(raw)
    while cursor < total:
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Index file is corrupted")
            break
        b_word = raw[cursor:nul]
        cursor = nul + 1
        if cursor + 8 > total:
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(raw[cursor:cursor + 4])
        size = binStrToInt(raw[cursor + 4:cursor + 8])
        cursor += 8
        entries.append([b_word, offset, size])
    return entries
def readIdxFile(self):
    """Parse the .idx (or gzipped .idx.gz) file.

    Returns a list of [word, offset, size] items, where word is bytes
    and offset/size locate the definition in the dict file.
    """
    gzPath = self.fileBasePath + '.idx.gz'
    if isfile(gzPath):
        import gzip
        with gzip.open(gzPath) as f:
            raw = f.read()
    else:
        with open(self.fileBasePath + '.idx', 'rb') as f:
            raw = f.read()
    result = []
    pos = 0
    while pos < len(raw):
        sep = raw.find(b'\x00', pos)
        if sep < 0:
            log.error("Index file is corrupted.")
            break
        word = raw[pos:sep]
        pos = sep + 1
        if pos + 8 > len(raw):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(raw[pos:pos + 4])
        size = binStrToInt(raw[pos + 4:pos + 8])
        pos += 8
        result.append([word, offset, size])
    return result
def readIdxFile(self) -> List[Tuple[bytes, int, int]]:
    """Parse the .idx (or .idx.gz) file.

    Returns a list of (b_word, offset, size) tuples; b_word is the raw
    headword, offset/size locate the article in the dict file.
    """
    gzPath = self._filename + ".idx.gz"
    if isfile(gzPath):
        with gzip.open(gzPath) as idxFile:
            raw = idxFile.read()
    else:
        with open(self._filename + ".idx", "rb") as idxFile:
            raw = idxFile.read()
    entries = []
    cursor = 0
    while cursor < len(raw):
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Index file is corrupted")
            break
        b_word = raw[cursor:nul]
        cursor = nul + 1
        if cursor + 8 > len(raw):
            log.error("Index file is corrupted")
            break
        offset = binStrToInt(raw[cursor:cursor + 4])
        size = binStrToInt(raw[cursor + 4:cursor + 8])
        cursor += 8
        entries.append((b_word, offset, size))
    return entries
def readEntryDefi(self, block, pos, b_word):
    """
    Read the definition part of an entry starting at `pos`.

    Returns (False, None, None, None) on error, otherwise
    (True, newPos, u_defi, b_defi) where u_defi is a str (utf-8)
    and b_defi is a bytes instance.
    """
    failure = (False, None, None, None)
    data = block.data
    if pos + 2 > len(data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading defi size: pos + 2 > len(block.data)"
        )
        return failure
    defiSize = binStrToInt(data[pos:pos + 2])
    pos += 2
    if pos + defiSize > len(data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading defi: pos + Len > len(block.data)"
        )
        return failure
    b_defi = data[pos:pos + defiSize]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    return True, pos + defiSize, u_defi, b_defi
def readSynFile(self, indexCount):
    """
    Read the .syn synonym file.

    Returns synData, a dict { wordIndex -> synWordsList };
    an empty dict when the file does not exist.
    """
    if not isfile(self.fileBasePath + '.syn'):
        return {}
    # FIX: use a context manager -- the old code leaked the file handle.
    with open(self.fileBasePath + '.syn', 'rb') as synFile:
        synStr = synFile.read()
    synStrLen = len(synStr)
    synData = {}
    i = 0
    while i < synStrLen:
        beg = i
        # FIX: synStr is bytes (opened 'rb'); the str pattern '\x00'
        # raises TypeError -- search with b'\x00'.
        i = synStr.find(b'\x00', beg)
        if i < 0:
            log.error("Synonym file is corrupted.")
            break
        word = synStr[beg:i]
        i += 1
        if i + 4 > len(synStr):
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(synStr[i:i + 4])
        i += 4
        if index >= indexCount:
            log.error(
                "Corrupted synonym file. Word \"{0}\" references invalid item."
                .format(word))
            continue
        try:
            synData[index].append(word)
        except KeyError:
            synData[index] = [word]
    return synData
def readSynFile(self):
    """
    Read the .syn file and append each synonym to its entry's
    alternates list (self.indexData[index][4]).
    Does nothing when no .syn file exists.
    """
    if not isfile(self.fileBasePath + '.syn'):
        return
    with open(self.fileBasePath + '.syn', 'rb') as f:
        synStr = f.read()
    i = 0
    while i < len(synStr):
        beg = i
        # FIX: synStr is bytes (opened 'rb'); a str pattern '\x00'
        # raises TypeError -- search with b'\x00'.
        i = synStr.find(b'\x00', beg)
        if i < 0:
            log.error("Synonym file is corrupted.")
            break
        word = synStr[beg:i]
        i += 1
        if i + 4 > len(synStr):
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(synStr[i:i + 4])
        i += 4
        if index >= len(self.indexData):
            log.error(
                "Corrupted synonym file. Word \"{0}\" references invalid item."
                .format(word))
            continue
        self.indexData[index][4].append(word)
def readEntryDefi(self, block, pos, b_word):
    """
    Read the definition part of an entry.

    Returns (False, None, None, None) on error, otherwise
    (True, pos, u_defi, b_defi) where u_defi is a str (utf-8)
    and b_defi is a bytes instance.
    """
    failure = (False, None, None, None)
    data = block.data
    if pos + 2 > len(data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", reading defi size: pos + 2 > len(block.data)"
        )
        return failure
    defiSize = binStrToInt(data[pos:pos + 2])
    pos += 2
    if pos + defiSize > len(data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", block.type=%s" % block.type +
            ", reading defi: pos + Len > len(block.data)"
        )
        return failure
    b_defi = data[pos:pos + defiSize]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    return True, pos + defiSize, u_defi, b_defi
def parseDefiBlockGeneral(self, data, word):
    """Parse definition block when sametypesequence option is not
    specified.

    Returns a list of (defiData, typeCode) pairs, or None when the
    block is malformed.
    """
    dataFileCorruptedError = "Data file is corrupted. Word \"{0}\"".format(
        word)
    res = []
    i = 0
    while i < len(data):
        t = data[i]
        if not isAsciiAlpha(t):
            log.error(dataFileCorruptedError)
            return None
        i += 1
        if isAsciiLower(t):
            beg = i
            # FIX: data is bytes -- searching with the str pattern
            # '\x00' raises TypeError; use b'\x00'.
            i = data.find(b'\x00', beg)
            if i < 0:
                log.error(dataFileCorruptedError)
                return None
            res.append((data[beg:i], t))
            i += 1
        else:
            assert isAsciiUpper(t)
            if i + 4 > len(data):
                log.error(dataFileCorruptedError)
                return None
            size = binStrToInt(data[i:i + 4])
            i += 4
            if i + size > len(data):
                log.error(dataFileCorruptedError)
                return None
            res.append((data[i:i + size], t))
            i += size
    return res
def openGzip(self):
    """Locate the gzip stream embedded in the BGL file and open it
    through BGLGzipFile; returns True on success, False otherwise."""
    with open(self._filename, "rb") as bglFile:
        if not bglFile:
            log.error(f"file pointer empty: {bglFile}")
            return False
        header = bglFile.read(6)
    if len(header) < 6 or header[:4] not in (
        b"\x12\x34\x00\x01",
        b"\x12\x34\x00\x02",
    ):
        log.error(f"invalid header: {header[:6]!r}")
        return False
    offset = binStrToInt(header[4:6])
    self.gzipOffset = offset
    log.debug(f"Position of gz header: {offset}")
    if offset < 6:
        log.error(f"invalid gzip header position: {offset}")
        return False
    self.file = BGLGzipFile(
        fileobj=FileOffS(self._filename, offset),
        closeFileobj=True,
    )
    return True
def utf16InfoDecode(b_value):
    """
    b_value is byte array
    returns str, or None (on errors)

    block type = 3
    block format: <2 byte code1><2 byte code2>
    if code2 == 0: then the block ends
    if code2 == 1: then the block continues as follows:
    <4 byte len1> \x00 \x00 <message in utf-16>
    len1 - length of message in 2-byte chars
    """
    if b_value[0] != 0:
        log.warning('utf16InfoDecode: b_value=%s, null expected at 0'%list(b_value))
        return
    marker = b_value[1]
    if marker == 0:
        # block ends here; anything beyond the two header bytes is odd
        if len(b_value) > 2:
            log.warning('utf16InfoDecode: unexpected b_value size: %s'%len(b_value))
        return
    if marker > 1:
        log.warning('utf16InfoDecode: b_value=%s, unexpected byte at 1'%list(b_value))
        return
    # marker == 1: a utf-16 message follows
    msgSize = 2 * binStrToInt(b_value[2:6])
    if tuple(b_value[6:8]) != (0, 0):
        log.warning('utf16InfoDecode: b_value=%s, null expected at 6:8'%list(b_value))
    if msgSize != len(b_value) - 8:
        log.warning('utf16InfoDecode: b_value=%s, size does not match'%list(b_value))
    return b_value[8:].decode('utf16')  # str
def readSynFile(self, indexCount):
    """
    Read the .syn file.

    Returns synData, a dict { wordIndex -> synWordsList };
    an empty dict when the file is missing.
    """
    synPath = self.fileBasePath + '.syn'
    if not isfile(synPath):
        return {}
    with open(synPath, 'rb') as synFile:
        raw = synFile.read()
    total = len(raw)
    synData = {}
    pos = 0
    while pos < total:
        nul = raw.find(b'\x00', pos)
        if nul < 0:
            log.error("Synonym file is corrupted.")
            break
        word = raw[pos:nul]
        pos = nul + 1
        if pos + 4 > total:
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(raw[pos:pos + 4])
        pos += 4
        if index >= indexCount:
            log.error("Corrupted synonym file. Word \"{0}\" references invalid item.".format(word))
            continue
        synData.setdefault(index, []).append(toStr(word))
    return synData
def parseDefiBlockGeneral(self, data, word):
    """
    Parse definition block when sametypesequence option is not
    specified.  Returns a list of (defiBytes, typeCode) pairs,
    or None on corruption.
    """
    corruptMsg = "Data file is corrupted. Word \"{0}\"".format(word)
    parts = []
    pos = 0
    dataLen = len(data)
    while pos < dataLen:
        code = data[pos]
        if not isAsciiAlpha(code):
            log.error(corruptMsg)
            return None
        pos += 1
        if isAsciiLower(code):
            # lower-case type codes are NUL-terminated
            nul = data.find(b'\x00', pos)
            if nul < 0:
                log.error(corruptMsg)
                return None
            parts.append((data[pos:nul], code))
            pos = nul + 1
        else:
            # upper-case type codes carry a 4-byte length prefix
            assert isAsciiUpper(code)
            if pos + 4 > dataLen:
                log.error(corruptMsg)
                return None
            size = binStrToInt(data[pos:pos + 4])
            pos += 4
            if pos + size > dataLen:
                log.error(corruptMsg)
                return None
            parts.append((data[pos:pos + size], code))
            pos += size
    return parts
def parseDefiBlockGeneral(self, b_block):
    """
    Parse definition block when sametypesequence option is not
    specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    parts = []
    pos = 0
    while pos < len(b_block):
        code = b_block[pos]
        ch = bytes([code])
        if not ch.isalpha():
            return None
        pos += 1
        if ch.islower():
            # lower-case codes: value is NUL-terminated
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            # upper-case codes: 4-byte big length prefix
            assert ch.isupper()
            if pos + 4 > len(b_block):
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > len(b_block):
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    return parts
def parseDefiBlockGeneral(self, b_block: bytes) -> List[Tuple[bytes, int]]:
    """
    Parse definition block when sametypesequence option is not
    specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    parts = []
    pos = 0
    blockLen = len(b_block)
    while pos < blockLen:
        code = b_block[pos]
        ch = bytes([code])
        if not ch.isalpha():
            return None
        pos += 1
        if ch.islower():
            # lower-case codes: NUL-terminated value
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            # upper-case codes: 4-byte length prefix
            assert ch.isupper()
            if pos + 4 > blockLen:
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > blockLen:
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    return parts
def openGzip(self):
    """Find the embedded gzip stream in the BGL file and open it
    via BGLGzipFile; returns True on success, False otherwise."""
    with open(self._filename, "rb") as bglFile:
        if not bglFile:
            log.error("file pointer empty: %s" % bglFile)
            return False
        header = bglFile.read(6)
    if len(header) < 6 or header[:4] not in (
        b"\x12\x34\x00\x01",
        b"\x12\x34\x00\x02",
    ):
        log.error("invalid header: %r" % header[:6])
        return False
    offset = binStrToInt(header[4:6])
    self.gzipOffset = offset
    log.debug("Position of gz header: %s" % offset)
    if offset < 6:
        log.error("invalid gzip header position: %s" % offset)
        return False
    self.file = BGLGzipFile(
        fileobj=FileOffS(self._filename, offset),
        closeFileobj=True,
    )
    return True
def languageInfoDecode(b_value):
    """
    Decode a binary language code.
    returns BabylonLanguage instance, or None for an unknown code.
    """
    code = binStrToInt(b_value)
    try:
        return languageByCode[code]
    except IndexError:
        # NOTE(review): catching IndexError assumes languageByCode is
        # an indexable sequence, not a dict -- confirm.
        log.warning("read_type_3: unknown language code = %s" % code)
        return
def languageInfoDecode(b_value):
    """
    Decode a binary language code.
    returns BabylonLanguage instance, or None (with a warning)
    for an unknown code.
    """
    code = binStrToInt(b_value)
    try:
        return languageByCode[code]
    except IndexError:
        # NOTE(review): IndexError implies languageByCode is an
        # indexable sequence, not a dict -- confirm.
        log.warning("read_type_3: unknown language code = %s", code)
        return
def parseDefiBlockCompact(self, data, sametypesequence, rawWord):
    """
    Parse definition block when sametypesequence option is specified.

    data and rawWord are bytes; returns a list of
    (defiBytes, typeCode) pairs, or None on corruption.
    """
    seq = toBytes(sametypesequence)
    assert len(seq) > 0
    corruptMsg = "Data file is corrupted. Word \"{0}\"".format(
        rawWord)
    parts = []
    pos = 0
    # every code except the last has an explicit terminator or size
    for code in seq[:-1]:
        if pos >= len(data):
            log.error(corruptMsg)
            return None
        if isAsciiLower(code):
            nul = data.find(b'\x00', pos)
            if nul < 0:
                log.error(corruptMsg)
                return None
            parts.append((data[pos:nul], code))
            pos = nul + 1
        else:
            assert isAsciiUpper(code)
            if pos + 4 > len(data):
                log.error(corruptMsg)
                return None
            size = binStrToInt(data[pos:pos + 4])
            pos += 4
            if pos + size > len(data):
                log.error(corruptMsg)
                return None
            parts.append((data[pos:pos + size], code))
            pos += size
    if pos >= len(data):
        log.error(corruptMsg)
        return None
    # the final field runs to the end of the block
    lastCode = seq[-1]
    if isAsciiLower(lastCode):
        # a lower-case final field must not contain an embedded NUL
        if 0 in data[pos:]:
            log.error(corruptMsg)
            return None
        parts.append((data[pos:], lastCode))
    else:
        assert isAsciiUpper(lastCode)
        parts.append((data[pos:], lastCode))
    return parts
def readType3(self, block):
    """
    reads block with type 3, and updates self.info
    returns None
    """
    # first 2 bytes: info-key code; the rest is the raw value
    code, b_value = binStrToInt(block.data[:2]), block.data[2:]
    if not b_value:
        return
    # if not b_value.strip(b"\x00"): return  # FIXME
    try:
        key = infoKeysByCode[code]
    except KeyError:
        # only log when the value carries more than NUL padding
        if b_value.strip(b"\x00"):
            log.debug(
                f"Unknown info type code={code:#02x}, b_value={b_value!r}",
            )
        return
    value = None
    # decode the raw value when a decoder is registered for this key;
    # otherwise keep the raw bytes
    func = infoKeyDecodeMethods.get(key)
    if func is None:
        value = b_value
    else:
        value = func(b_value)
    # `value` can be None, str, bytes or dict
    if not value:
        return
    if isinstance(value, dict):
        # a dict merges directly into the info mapping
        self.info.update(value)
        return
    # these keys are stored as attributes on self, not in self.info
    if key in {
        "sourceLang",
        "targetLang",
        "defaultCharset",
        "sourceCharset",
        "targetCharset",
        "sourceEncoding",
        "targetEncoding",
        "bgl_numEntries",
        "iconData",
    }:
        setattr(self, key, value)
        return
    self.info[key] = value
def parseDefiBlockCompact(self, data, sametypesequence, word):
    """
    Parse definition block when sametypesequence option is specified.

    Returns a list of (defiData, typeCode) pairs, or None when the
    block is malformed.
    """
    sametypesequence = toBytes(sametypesequence)
    assert len(sametypesequence) > 0
    dataFileCorruptedError = "Data file is corrupted. Word \"{0}\"".format(word)
    res = []
    i = 0
    for t in sametypesequence[:-1]:
        if i >= len(data):
            log.error(dataFileCorruptedError)
            return None
        if isAsciiLower(t):
            beg = i
            # FIX: data is bytes -- the str pattern '\x00' raises
            # TypeError; search with b'\x00'.
            i = data.find(b'\x00', beg)
            if i < 0:
                log.error(dataFileCorruptedError)
                return None
            res.append((data[beg:i], t))
            i += 1
        else:
            assert isAsciiUpper(t)
            if i + 4 > len(data):
                log.error(dataFileCorruptedError)
                return None
            size = binStrToInt(data[i:i + 4])
            i += 4
            if i + size > len(data):
                log.error(dataFileCorruptedError)
                return None
            res.append((data[i:i + size], t))
            i += size
    if i >= len(data):
        log.error(dataFileCorruptedError)
        return None
    t = sametypesequence[-1]
    if isAsciiLower(t):
        # the final field must not contain an embedded NUL
        # FIX: bytes pattern here as well
        i2 = data.find(b'\x00', i)
        if i2 >= 0:
            log.error(dataFileCorruptedError)
            return None
        res.append((data[i:], t))
    else:
        assert isAsciiUpper(t)
        res.append((data[i:], t))
    return res
def parseDefiBlockCompact(self, data, sametypesequence, word):
    """Parse definition block when sametypesequence option is
    specified.
    """
    # NOTE(review): this revision searches `data` with a str pattern
    # ('\x00') and asserts sametypesequence is str -- presumably a
    # Python 2 regime where `data` is str; confirm before running
    # under Python 3, where bytes.find would need b'\x00'.
    assert type(sametypesequence) == str
    assert len(sametypesequence) > 0
    dataFileCorruptedError = "Data file is corrupted. Word \"{0}\"".format(
        word)
    res = []
    i = 0
    # every type code except the last carries an explicit
    # terminator (lower-case) or 4-byte size prefix (upper-case)
    for t in sametypesequence[:-1]:
        if i >= len(data):
            printAsError(dataFileCorruptedError)
            return None
        if isAsciiLower(t):
            beg = i
            i = data.find('\x00', beg)
            if i < 0:
                printAsError(dataFileCorruptedError)
                return None
            res.append((data[beg:i], t))
            i += 1
        else:
            assert isAsciiUpper(t)
            if i + 4 > len(data):
                printAsError(dataFileCorruptedError)
                return None
            size = binStrToInt(data[i:i + 4])
            i += 4
            if i + size > len(data):
                printAsError(dataFileCorruptedError)
                return None
            res.append((data[i:i + size], t))
            i += size
    if i >= len(data):
        printAsError(dataFileCorruptedError)
        return None
    # the final field runs to the end of the block
    t = sametypesequence[-1]
    if isAsciiLower(t):
        # a lower-case final field must not contain an embedded NUL
        i2 = data.find('\x00', i)
        if i2 >= 0:
            printAsError(dataFileCorruptedError)
            return None
        res.append((data[i:], t))
    else:
        assert isAsciiUpper(t)
        res.append((data[i:], t))
    return res
def parseDefiBlockCompact(
    self,
    b_block: bytes,
    sametypesequence: str,
) -> List[Tuple[bytes, int]]:
    """
    Parse definition block when sametypesequence option is specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    seq = sametypesequence.encode("utf-8")
    assert len(seq) > 0
    parts = []
    pos = 0
    blockLen = len(b_block)
    # every code except the last carries an explicit terminator/size
    for code in seq[:-1]:
        if pos >= blockLen:
            return None
        if bytes([code]).islower():
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            assert bytes([code]).isupper()
            if pos + 4 > blockLen:
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > blockLen:
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    if pos >= blockLen:
        return None
    # the final field runs to the end of the block
    lastCode = seq[-1]
    if bytes([lastCode]).islower():
        # a lower-case final field must not contain an embedded NUL
        if 0 in b_block[pos:]:
            return None
        parts.append((b_block[pos:], lastCode))
    else:
        assert bytes([lastCode]).isupper()
        parts.append((b_block[pos:], lastCode))
    return parts
def readType0(self, block):
    """Handle a type-0 block; returns True when the block code was
    recognized, False otherwise."""
    code = block.data[0]
    if code == 2:
        # this number is very close to self.bgl_numEntries,
        # but does not always equal the number of entries
        # see self.readType3, code == 12 as well
        num = binStrToInt(block.data[1:])
        return True
    if code == 8:
        self.defaultCharset = charsetInfoDecode(block.data[1:])
        if not self.defaultCharset:
            log.warning("defaultCharset is not valid")
        return True
    self.logUnknownBlock(block)
    return False
def utf16InfoDecode(b_value):
    """
    b_value is byte array
    returns str, or None (on errors)

    block type = 3
    block format: <2 byte code1><2 byte code2>
    if code2 == 0: then the block ends
    if code2 == 1: then the block continues as follows:
    <4 byte len1> \x00 \x00 <message in utf-16>
    len1 - length of message in 2-byte chars
    """
    if b_value[0] != 0:
        log.warning(
            "utf16InfoDecode: b_value=%s, null expected at 0",
            b_value,
        )
        return None
    marker = b_value[1]
    if marker == 0:
        # block ends; extra bytes beyond the header are suspicious
        if len(b_value) > 2:
            log.warning(
                "utf16InfoDecode: unexpected b_value size: %s",
                len(b_value),
            )
        return None
    if marker > 1:
        log.warning(
            "utf16InfoDecode: b_value=%s, unexpected byte at 1",
            list(b_value),
        )
        return None
    # marker == 1: a utf-16 message follows
    msgSize = 2 * binStrToInt(b_value[2:6])
    if tuple(b_value[6:8]) != (0, 0):
        log.warning(
            "utf16InfoDecode: b_value=%s, null expected at 6:8",
            list(b_value),
        )
    if msgSize != len(b_value) - 8:
        log.warning(
            "utf16InfoDecode: b_value=%s, size does not match",
            list(b_value),
        )
    return b_value[8:].decode("utf16")  # str
def readType3(self, block):
    """
    reads block with type 3, and updates self.info
    returns None
    """
    # first 2 bytes: info-key code; the rest is the raw value
    code, b_value = binStrToInt(block.data[:2]), block.data[2:]
    if not b_value:
        return
    # if not b_value.strip(b"\x00"): return  # FIXME
    try:
        key = infoKeysByCode[code]
    except KeyError:
        # only log when the value carries more than NUL padding
        if b_value.strip(b"\x00"):
            log.debug(
                "Unknown info type code=%#.2x" % code +
                ", b_value=%r" % b_value,
            )
        return
    # decode with the registered decoder when one exists,
    # otherwise keep the raw bytes
    try:
        func = infoKeyDecodeMethods[key]
    except KeyError:
        value = b_value
    else:
        value = func(b_value)
    # `value` can be a bytes instance,
    # or str instance, depending on `key` FIXME
    if value:
        if isinstance(value, dict):
            # a dict merges directly into the info mapping
            self.info.update(value)
        elif key in {
            # these keys become attributes on self, not info entries
            "sourceLang",
            "targetLang",
            "defaultCharset",
            "sourceCharset",
            "targetCharset",
            "sourceEncoding",
            "targetEncoding",
            "bgl_numEntries",
            "iconData",
        }:
            setattr(self, key, value)
        else:
            self.info[key] = value
def parseDefiBlockCompact(self, b_block, sametypesequence):
    """
    Parse definition block when sametypesequence option is specified.

    Return a list of (b_defi, defiFormatCode) tuples where
    b_defi is a bytes instance
    and defiFormatCode is int, so: defiFormat = chr(defiFormatCode)
    """
    assert isinstance(b_block, bytes)
    seq = sametypesequence.encode("utf-8")
    assert len(seq) > 0
    parts = []
    pos = 0
    blockLen = len(b_block)
    # all codes but the last carry an explicit terminator/size
    for code in seq[:-1]:
        if pos >= blockLen:
            return None
        if bytes([code]).islower():
            nul = b_block.find(b"\x00", pos)
            if nul < 0:
                return None
            parts.append((b_block[pos:nul], code))
            pos = nul + 1
        else:
            assert bytes([code]).isupper()
            if pos + 4 > blockLen:
                return None
            size = binStrToInt(b_block[pos:pos + 4])
            pos += 4
            if pos + size > blockLen:
                return None
            parts.append((b_block[pos:pos + size], code))
            pos += size
    if pos >= blockLen:
        return None
    # the final field runs to the end of the block
    lastCode = seq[-1]
    if bytes([lastCode]).islower():
        # a lower-case final field must not contain an embedded NUL
        if 0 in b_block[pos:]:
            return None
        parts.append((b_block[pos:], lastCode))
    else:
        assert bytes([lastCode]).isupper()
        parts.append((b_block[pos:], lastCode))
    return parts
def readBytes(self, num):
    """
    Read `num` (1..4) bytes from self.file and decode them as an
    unsigned integer.
    return -1 if error
    """
    if not 1 <= num <= 4:
        log.error(f"invalid argument num={num}")
        return -1
    self.file.flush()
    chunk = self.file.read(num)
    if len(chunk) == 0:
        log.debug("readBytes: end of file: len(buf)==0")
        return -1
    if len(chunk) != num:
        log.error(
            f"readBytes: expected to read {num} bytes"
            f", but found {len(chunk)} bytes"
        )
        return -1
    return binStrToInt(chunk)
def flagsInfoDecode(b_value):
    """
    returns a dict with these keys:
        utf8Encoding
            when this flag is set utf8 encoding is used for all
            articles; when false, the encoding is set according to
            the source and target alphabet
        spellingAlternatives
            determines whether the glossary offers spelling
            alternatives for searched terms
        caseSensitive
            defines if the search for terms in this glossary
            is case sensitive
            see code 0x20 as well
    """
    flags = binStrToInt(b_value)
    utf8Encoding = (flags & 0x8000) != 0
    spellingAlternatives = (flags & 0x10000) == 0
    caseSensitive = (flags & 0x1000) != 0
    return {
        'utf8Encoding': utf8Encoding,
        'spellingAlternatives': spellingAlternatives,
        'caseSensitive': caseSensitive,
    }
def readBytes(self, num):
    """
    Read `num` (1..4) bytes from self.file and decode them as an
    unsigned integer.
    return -1 if error
    """
    if not 1 <= num <= 4:
        log.error("invalid argument num=%s" % num)
        return -1
    self.file.flush()
    chunk = self.file.read(num)
    if len(chunk) == 0:
        log.debug("readBytes: end of file: len(buf)==0")
        return -1
    if len(chunk) != num:
        log.error(
            "readBytes: expected to read %s bytes" % num +
            ", but found %s bytes" % len(chunk)
        )
        return -1
    return binStrToInt(chunk)
def readSynFile(self):
    """
    return synDict, a dict { wordIndex -> altList }
    """
    synPath = self._filename + ".syn"
    if not isfile(synPath):
        return {}
    with open(synPath, "rb") as synFile:
        raw = synFile.read()
    total = len(raw)
    synDict = {}
    cursor = 0
    while cursor < total:
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Synonym file is corrupted")
            break
        b_alt = raw[cursor:nul]  # b_alt is bytes
        cursor = nul + 1
        if cursor + 4 > total:
            log.error("Synonym file is corrupted")
            break
        wordIndex = binStrToInt(raw[cursor:cursor + 4])
        cursor += 4
        if wordIndex >= self._wordCount:
            log.error(
                "Corrupted synonym file. " +
                "Word \"%s\" references invalid item" % b_alt
            )
            continue
        # collect alternates as str, grouped by word index
        synDict.setdefault(wordIndex, []).append(b_alt.decode("utf-8"))
    return synDict
def readSynFile(self) -> Dict[int, List[str]]:
    """
    return synDict, a dict { wordIndex -> altList }
    """
    synPath = self._filename + ".syn"
    if not isfile(synPath):
        return {}
    with open(synPath, "rb") as synFile:
        raw = synFile.read()
    total = len(raw)
    synDict = {}
    cursor = 0
    while cursor < total:
        nul = raw.find(b"\x00", cursor)
        if nul < 0:
            log.error("Synonym file is corrupted")
            break
        b_alt = raw[cursor:nul]  # b_alt is bytes
        cursor = nul + 1
        if cursor + 4 > total:
            log.error("Synonym file is corrupted")
            break
        wordIndex = binStrToInt(raw[cursor:cursor + 4])
        cursor += 4
        if wordIndex >= self._wordCount:
            log.error(
                "Corrupted synonym file. " +
                "Word \"%s\" references invalid item" % b_alt
            )
            continue
        # collect alternates as str, grouped by word index
        synDict.setdefault(wordIndex, []).append(b_alt.decode("utf-8"))
    return synDict
def readSynFile(self):
    """
    Read the .syn file and append each synonym to the matching
    entry's alternates list (self.indexData[index][4]).
    Does nothing when the .syn file is absent.
    """
    if not isfile(self.fileBasePath + '.syn'):
        return
    with open(self.fileBasePath + '.syn', 'rb') as f:
        synStr = f.read()
    i = 0
    while i < len(synStr):
        beg = i
        # FIX: synStr is bytes (opened 'rb'); the str pattern '\x00'
        # raises TypeError -- search with b'\x00'.
        i = synStr.find(b'\x00', beg)
        if i < 0:
            log.error("Synonym file is corrupted.")
            break
        word = synStr[beg:i]
        i += 1
        if i + 4 > len(synStr):
            log.error("Synonym file is corrupted.")
            break
        index = binStrToInt(synStr[i:i+4])
        i += 4
        if index >= len(self.indexData):
            log.error("Corrupted synonym file. Word \"{0}\" references invalid item.".format(word))
            continue
        self.indexData[index][4].append(word)
def readEntry_Type11(self, block):
    """return (succeed, u_word, u_alts, u_defi)"""
    Err = (False, None, None, None)
    pos = 0
    # reading headword: 5-byte length prefix, then the raw word
    if pos + 5 > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading word size: pos + 5 > len(block.data)"
        )
        return Err
    wordLen = binStrToInt(block.data[pos:pos + 5])
    pos += 5
    if pos + wordLen > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading word: pos + wordLen > len(block.data)"
        )
        return Err
    b_word = block.data[pos:pos + wordLen]
    u_word = self.processKey(b_word)
    pos += wordLen
    # track the longest headword seen (in decoded characters)
    self.wordLenMax = max(self.wordLenMax, len(u_word))
    # reading alts and defi: 4-byte count of alternates
    if pos + 4 > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", reading defi size: pos + 4 > len(block.data)"
        )
        return Err
    altsCount = binStrToInt(block.data[pos:pos + 4])
    pos += 4
    # reading alts
    # use set instead of list to prevent duplicates
    u_alts = set()
    for altIndex in range(altsCount):
        if pos + 4 > len(block.data):
            log.error(
                f"reading block offset={block.offset:#02x}"
                f", reading alt size: pos + 4 > len(block.data)"
            )
            return Err
        altLen = binStrToInt(block.data[pos:pos + 4])
        pos += 4
        if altLen == 0:
            # a zero-length alt terminates the alternates list early
            if pos + altLen != len(block.data):
                # no evidence
                log.warning(
                    f"reading block offset={block.offset:#02x}"
                    f", reading alt size: pos + altLen != len(block.data)"
                )
            break
        if pos + altLen > len(block.data):
            log.error(
                f"reading block offset={block.offset:#02x}"
                f", block.type={block.type}"
                f", reading alt: pos + altLen > len(block.data)"
            )
            return Err
        b_alt = block.data[pos:pos + altLen]
        u_alt = self.processAlternativeKey(b_alt, b_word)
        # Like entry key, alt is not processed as html by babylon,
        # so do we.
        u_alts.add(u_alt)
        pos += altLen
    # the headword itself must not appear among its own alternates
    if u_word in u_alts:
        u_alts.remove(u_word)
    u_alts = list(sorted(u_alts))
    # reading defi
    # NOTE(review): no bounds check (pos + 4 > len) before reading the
    # defi length here, unlike the reads above -- slicing just yields
    # fewer bytes; confirm whether this is intentional.
    defiLen = binStrToInt(block.data[pos:pos + 4])
    pos += 4
    if pos + defiLen > len(block.data):
        log.error(
            f"reading block offset={block.offset:#02x}"
            f", block.type={block.type}"
            f", reading defi: pos + defiLen > len(block.data)"
        )
        return Err
    b_defi = block.data[pos:pos + defiLen]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    pos += defiLen
    return True, u_word, u_alts, u_defi
def decodeBglBinTime(b_value):
    """Decode a BGL binary timestamp (minutes, Julian-day based,
    relative to 1970-01-01) into a 'YYYY/MM/DD, HH:MM' string."""
    epochJd = gregorian.to_jd(1970, 1, 1)
    days, minutesOfDay = divmod(binStrToInt(b_value), 24 * 60)
    year, month, day = gregorian.jd_to(days + epochJd)
    hour, minute = divmod(minutesOfDay, 60)
    return "%.2d/%.2d/%.2d, %.2d:%.2d" % (year, month, day, hour, minute)
def decodeBglBinTime(b_value):
    """Decode a BGL binary timestamp (minutes, Julian-day based,
    relative to 1970-01-01) into a 'YYYY/MM/DD, HH:MM' string."""
    epochJd = gregorian.to_jd(1970, 1, 1)
    totalMinutes = binStrToInt(b_value)
    days, minutesOfDay = divmod(totalMinutes, 24 * 60)
    year, month, day = gregorian.jd_to(days + epochJd)
    hour, minute = divmod(minutesOfDay, 60)
    return f"{year:04d}/{month:02d}/{day:02d}, {hour:02d}:{minute:02d}"
def decodeBglBinTime(b_value):
    """Decode a BGL binary timestamp (minutes, Julian-day based,
    relative to 1970-01-01) into a 'YYYY/MM/DD, HH:MM' string."""
    unixEpochJd = gregorian.to_jd(1970, 1, 1)
    dayCount, remainderMinutes = divmod(binStrToInt(b_value), 24 * 60)
    year, month, day = gregorian.jd_to(dayCount + unixEpochJd)
    hour, minute = divmod(remainderMinutes, 60)
    return "%.2d/%.2d/%.2d, %.2d:%.2d" % (year, month, day, hour, minute)
def readEntry_Type11(self, block):
    """return (succeed, u_word, u_alts, u_defi)"""
    Err = (False, None, None, None)
    pos = 0
    # reading headword: 5-byte length prefix, then the raw word
    if pos + 5 > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", reading word size: pos + 5 > len(block.data)"
        )
        return Err
    wordLen = binStrToInt(block.data[pos:pos+5])
    pos += 5
    if pos + wordLen > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", block.type=%s" % block.type +
            ", reading word: pos + wordLen > len(block.data)"
        )
        return Err
    b_word = block.data[pos:pos+wordLen]
    u_word = self.processKey(b_word)
    pos += wordLen
    # track the longest headword seen (in decoded characters)
    self.wordLenMax = max(self.wordLenMax, len(u_word))
    # reading alts and defi: 4-byte count of alternates
    if pos + 4 > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", reading defi size: pos + 4 > len(block.data)"
        )
        return Err
    altsCount = binStrToInt(block.data[pos:pos+4])
    pos += 4
    # reading alts
    # use set instead of list to prevent duplicates
    u_alts = set()
    for altIndex in range(altsCount):
        if pos + 4 > len(block.data):
            log.error(
                "reading block offset=%#.2x" % block.offset +
                ", reading alt size: pos + 4 > len(block.data)"
            )
            return Err
        altLen = binStrToInt(block.data[pos:pos+4])
        pos += 4
        if altLen == 0:
            # a zero-length alt terminates the alternates list early
            if pos + altLen != len(block.data):
                # no evidence
                log.warning(
                    "reading block offset=%#.2x" % block.offset +
                    ", reading alt size: pos + altLen != len(block.data)"
                )
            break
        if pos + altLen > len(block.data):
            log.error(
                "reading block offset=%#.2x" % block.offset +
                ", block.type=%s" % block.type +
                ", reading alt: pos + altLen > len(block.data)"
            )
            return Err
        b_alt = block.data[pos:pos+altLen]
        u_alt = self.processAlternativeKey(b_alt, b_word)
        # Like entry key, alt is not processed as html by babylon,
        # so do we.
        u_alts.add(u_alt)
        pos += altLen
    # the headword itself must not appear among its own alternates
    if u_word in u_alts:
        u_alts.remove(u_word)
    u_alts = list(sorted(u_alts))
    # reading defi
    # NOTE(review): no bounds check (pos + 4 > len) before reading the
    # defi length here, unlike the reads above -- slicing just yields
    # fewer bytes; confirm whether this is intentional.
    defiLen = binStrToInt(block.data[pos:pos+4])
    pos += 4
    if pos + defiLen > len(block.data):
        log.error(
            "reading block offset=%#.2x" % block.offset +
            ", block.type=%s" % block.type +
            ", reading defi: pos + defiLen > len(block.data)"
        )
        return Err
    b_defi = block.data[pos:pos+defiLen]
    u_defi = self.processDefi(b_defi, b_word)
    self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))
    pos += defiLen
    return True, u_word, u_alts, u_defi
def collectDefiFields(self, b_defi, b_key, fields):
    """
    Parse the trailing metadata fields of a definition into *fields*.

    entry definition structure:
    <main definition>['\x14'[{field_code}{field_data}]*]
    {field_code} is one character
    {field_data} has arbitrary length

    b_defi: raw definition bytes.
    b_key: raw entry key bytes (used here only for log messages and as
    context for self.findDefiFieldsStart's caller).
    fields: mutable accumulator whose attributes (b_defi, partOfSpeech,
    b_title, b_field_06, ...) are assigned in place; nothing is returned.
    On malformed or truncated data, a debug message is logged and the
    method returns early, leaving *fields* partially populated.
    """
    # d0 is index of the '\x14 char in b_defi
    # d0 may be the last char of the string
    d0 = self.findDefiFieldsStart(b_defi)
    if d0 == -1:
        # no trailing-fields marker: the whole payload is the definition
        fields.b_defi = b_defi
        return
    fields.b_defi = b_defi[:d0]
    # i always points at the next field-code byte
    i = d0 + 1
    while i < len(b_defi):
        if self.metadata2:
            # optional stats collector: count occurrences of each field code
            self.metadata2.defiTrailingFields[b_defi[i]] += 1
        if b_defi[i] == 0x02:
            # part of speech
            # "\x02" <one char - part of speech>
            if fields.partOfSpeech:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\n" % b_key +
                    "duplicate part of speech item",
                )
            if i+1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nb_defi ends after \\x02" % b_key
                )
                return
            posCode = b_defi[i+1]
            try:
                fields.partOfSpeech = partOfSpeechByCode[posCode]
            except KeyError:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\n" % b_key +
                    "unknown part of speech code = %#.2x" % posCode
                )
                return
            i += 2
        elif b_defi[i] == 0x06:
            # \x06<one byte>
            if fields.b_field_06:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nduplicate type 6" % b_key
                )
            if i+1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nb_defi ends after \\x06" % b_key
                )
                return
            fields.b_field_06 = b_defi[i+1]
            i += 2
        elif b_defi[i] == 0x07:
            # \x07<two bytes>
            # Found in 4 Hebrew dictionaries. I do not understand.
            if i+3 > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x07" % b_key
                )
                return
            fields.b_field_07 = b_defi[i+1:i+3]
            i += 3
        elif b_defi[i] == 0x13:
            # "\x13"<one byte - length><data>
            # known values:
            # 03 06 0D C7
            # 04 00 00 00 44
            # ...
            # 04 00 00 00 5F
            if i + 1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x13" % b_key
                )
                return
            Len = b_defi[i+1]
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x13" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x13" % b_key
                )
                return
            fields.b_field_13 = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x18:
            # \x18<one byte - title length><entry title>
            if fields.b_title:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nduplicate entry title item" % b_key
                )
            if i+1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nb_defi ends after \\x18" % b_key
                )
                return
            i += 1
            Len = b_defi[i]
            i += 1
            if Len == 0:
                # zero-length title is tolerated silently
                # log.debug(
                #     "collecting definition fields, b_defi = %r\n" % b_defi +
                #     "b_key = %r:\nblank entry title" % b_key
                # )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntitle is too long" % b_key
                )
                return
            fields.b_title = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x1a:
            # "\x1a"<one byte - length><text>
            # found only in Hebrew dictionaries, I do not understand.
            if i + 1 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %s:\ntoo few data after \\x1a" % b_key
                )
                return
            Len = b_defi[i+1]
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x1a" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x1a" % b_key
                )
                return
            fields.b_field_1a = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x28:
            # "\x28" <two bytes - length><html text>
            # title with transcription?
            if i + 2 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x28" % b_key
                )
                return
            i += 1
            Len = binStrToInt(b_defi[i:i+2])
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x28" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x28" % b_key
                )
                return
            fields.b_title_trans = b_defi[i:i+Len]
            i += Len
        elif 0x40 <= b_defi[i] <= 0x4f:
            # [\x41-\x4f] <one byte> <text>
            # often contains digits as text:
            # 56
            # ælps - key Alps
            # 48@i
            # has no apparent influence on the article
            code = b_defi[i]
            # payload length is encoded in the field code itself
            Len = b_defi[i] - 0x3f
            if i+2+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x40+" % b_key
                )
                return
            i += 2
            b_text = b_defi[i:i+Len]
            i += Len
            # data is skipped, only logged; field meaning is unknown
            log.debug(
                "\nunknown definition field %#.2x" % code +
                ", b_text=%r" % b_text
            )
        elif b_defi[i] == 0x50:
            # \x50 <one byte> <one byte - length><data>
            if i + 2 >= len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x50" % b_key
                )
                return
            fields.code_transcription_50 = b_defi[i+1]
            Len = b_defi[i+2]
            i += 3
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x50" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x50" % b_key
                )
                return
            fields.b_transcription_50 = b_defi[i:i+Len]
            i += Len
        elif b_defi[i] == 0x60:
            # "\x60" <one byte> <two bytes - length> <text>
            if i + 4 > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x60" % b_key
                )
                return
            fields.code_transcription_60 = b_defi[i+1]
            i += 2
            Len = binStrToInt(b_defi[i:i+2])
            i += 2
            if Len == 0:
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\nblank data after \\x60" % b_key
                )
                continue
            if i+Len > len(b_defi):
                log.debug(
                    "collecting definition fields, " +
                    "b_defi = %r\n" % b_defi +
                    "b_key = %r:\ntoo few data after \\x60" % b_key
                )
                return
            fields.b_transcription_60 = b_defi[i:i+Len]
            i += Len
        else:
            # unknown field code: cannot determine its length, so stop
            log.debug(
                "collecting definition fields, " +
                "b_defi = %r\n" % b_defi +
                "b_key = %r:\n" % b_key +
                "unknown control char. Char code = %#.2x" % b_defi[i]
            )
            return
def collectDefiFields(self, b_defi, b_key, fields):
    """
    Parse the trailing metadata fields of a definition into *fields*.

    entry definition structure:
    <main definition>['\x14'[{field_code}{field_data}]*]
    {field_code} is one character
    {field_data} has arbitrary length

    b_defi: raw definition bytes.
    b_key: raw entry key bytes (used here only for log messages).
    fields: mutable accumulator whose attributes (b_defi, partOfSpeech,
    b_title, b_field_06, ...) are assigned in place; nothing is returned.
    On malformed or truncated data, a debug message is logged and the
    method returns early, leaving *fields* partially populated.
    """
    # d0 is index of the '\x14 char in b_defi
    # d0 may be the last char of the string
    d0 = self.findDefiFieldsStart(b_defi)
    if d0 == -1:
        # no trailing-fields marker: the whole payload is the definition
        fields.b_defi = b_defi
        return
    fields.b_defi = b_defi[:d0]
    # i always points at the next field-code byte
    i = d0 + 1
    while i < len(b_defi):
        if self.metadata2:
            # optional stats collector: count occurrences of each field code
            self.metadata2.defiTrailingFields[b_defi[i]] += 1
        if b_defi[i] == 0x02:
            # part of speech
            # "\x02" <one char - part of speech>
            if fields.partOfSpeech:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}"
                    f":\nduplicate part of speech item",
                )
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nb_defi ends after \\x02"
                )
                return
            posCode = b_defi[i + 1]
            try:
                fields.partOfSpeech = partOfSpeechByCode[posCode]
            except KeyError:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}"
                    f":\nunknown part of speech code = {posCode:#02x}"
                )
                return
            i += 2
        elif b_defi[i] == 0x06:
            # \x06<one byte>
            if fields.b_field_06:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nduplicate type 6"
                )
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nb_defi ends after \\x06"
                )
                return
            fields.b_field_06 = b_defi[i + 1]
            i += 2
        elif b_defi[i] == 0x07:
            # \x07<two bytes>
            # Found in 4 Hebrew dictionaries. I do not understand.
            if i + 3 > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x07"
                )
                return
            fields.b_field_07 = b_defi[i + 1:i + 3]
            i += 3
        elif b_defi[i] == 0x13:
            # "\x13"<one byte - length><data>
            # known values:
            # 03 06 0D C7
            # 04 00 00 00 44
            # ...
            # 04 00 00 00 5F
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x13"
                )
                return
            Len = b_defi[i + 1]
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n"
                    f"b_key = {b_key!r}:\nblank data after \\x13"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n" +
                    f"b_key = {b_key!r}:\ntoo few data after \\x13"
                )
                return
            fields.b_field_13 = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x18:
            # \x18<one byte - title length><entry title>
            if fields.b_title:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"b_key = {b_key!r}:\nduplicate entry title item"
                )
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n"
                    f"b_key = {b_key!r}:\nb_defi ends after \\x18"
                )
                return
            i += 1
            Len = b_defi[i]
            i += 1
            if Len == 0:
                # zero-length title is tolerated silently
                # log.debug(
                #     f"collecting definition fields, b_defi = {b_defi!r}\n"
                #     f"b_key = {b_key!r}:\nblank entry title"
                # )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}\n"
                    f"b_key = {b_key!r}:\ntitle is too long"
                )
                return
            fields.b_title = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x1a:
            # "\x1a"<one byte - length><text>
            # found only in Hebrew dictionaries, I do not understand.
            if i + 1 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key}:\ntoo few data after \\x1a"
                )
                return
            Len = b_defi[i + 1]
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x1a"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x1a"
                )
                return
            fields.b_field_1a = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x28:
            # "\x28" <two bytes - length><html text>
            # title with transcription?
            if i + 2 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x28"
                )
                return
            i += 1
            Len = binStrToInt(b_defi[i:i + 2])
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x28"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x28"
                )
                return
            fields.b_title_trans = b_defi[i:i + Len]
            i += Len
        elif 0x40 <= b_defi[i] <= 0x4f:
            # [\x41-\x4f] <one byte> <text>
            # often contains digits as text:
            # 56
            # ælps - key Alps
            # 48@i
            # has no apparent influence on the article
            code = b_defi[i]
            # payload length is encoded in the field code itself
            Len = b_defi[i] - 0x3f
            if i + 2 + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x40+"
                )
                return
            i += 2
            b_text = b_defi[i:i + Len]
            i += Len
            # data is skipped, only logged; field meaning is unknown
            log.debug(
                f"unknown definition field {code:#02x}, b_text={b_text!r}"
            )
        elif b_defi[i] == 0x50:
            # \x50 <one byte> <one byte - length><data>
            if i + 2 >= len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x50"
                )
                return
            fields.code_transcription_50 = b_defi[i + 1]
            Len = b_defi[i + 2]
            i += 3
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x50"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x50"
                )
                return
            fields.b_transcription_50 = b_defi[i:i + Len]
            i += Len
        elif b_defi[i] == 0x60:
            # "\x60" <one byte> <two bytes - length> <text>
            if i + 4 > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x60"
                )
                return
            fields.code_transcription_60 = b_defi[i + 1]
            i += 2
            Len = binStrToInt(b_defi[i:i + 2])
            i += 2
            if Len == 0:
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}"
                    f"\nb_key = {b_key!r}:\nblank data after \\x60"
                )
                continue
            if i + Len > len(b_defi):
                log.debug(
                    f"collecting definition fields, b_defi = {b_defi!r}" +
                    f"\nb_key = {b_key!r}:\ntoo few data after \\x60"
                )
                return
            fields.b_transcription_60 = b_defi[i:i + Len]
            i += Len
        else:
            # unknown field code: cannot determine its length, so stop
            log.debug(
                f"collecting definition fields, b_defi = {b_defi!r}"
                f"\nb_key = {b_key!r}"
                f":\nunknown control char. Char code = {b_defi[i]:#02x}"
            )
            return