Esempio n. 1
0
	def readEntryDefi(self, block, pos, b_word):
		"""
		Parse the length-prefixed definition part of an entry.

		Starting at `pos` inside `block.data`, the definition is stored as
		a 2-byte size followed by that many bytes of data.

		Side effect: raises self.defiMaxBytes to the size of this
		definition when it is larger.

		Returns a 4-tuple:
		(False, None, None, None) if the block is too short
		(True, pos, u_defi, b_defi) if OK
			pos is the offset just past the definition
			u_defi is the result of self.processDefi(b_defi, b_word)
			b_defi is the raw bytes slice of block.data
		"""
		failure = (False, None, None, None)
		blockSize = len(block.data)

		# the 2-byte size field
		sizeEnd = pos + 2
		if sizeEnd > blockSize:
			log.error(
				f"reading block offset={block.offset:#02x}"
				f", reading defi size: pos + 2 > len(block.data)"
			)
			return failure
		defiSize = uintFromBytes(block.data[pos:sizeEnd])

		# the definition itself
		defiEnd = sizeEnd + defiSize
		if defiEnd > blockSize:
			log.error(
				f"reading block offset={block.offset:#02x}"
				f", block.type={block.type}"
				f", reading defi: pos + Len > len(block.data)"
			)
			return failure
		b_defi = block.data[sizeEnd:defiEnd]
		u_defi = self.processDefi(b_defi, b_word)
		self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))

		return True, defiEnd, u_defi, b_defi
Esempio n. 2
0
	def openGzip(self):
		"""
		Validate the file header and open the embedded gzip stream.

		Reads the first 6 bytes of self._filename: a 4-byte signature
		followed by 2 bytes that uintFromBytes decodes into the position
		of the gzip header. On success sets self.gzipOffset and self.file.

		Returns True on success. Returns False, after logging an error,
		when the header is shorter than 6 bytes, the signature is not one
		of the two accepted values, or the gzip position is below 6 (that
		is, inside the header itself).

		Exceptions raised by open() (e.g. a missing file) are not caught.
		"""
		# open() either raises or returns a file object, and file objects
		# are always truthy, so there is no "empty file pointer" to test.
		with open(self._filename, "rb") as bglFile:
			b_head = bglFile.read(6)

		if len(b_head) < 6 or b_head[:4] not in (
			b"\x12\x34\x00\x01",
			b"\x12\x34\x00\x02",
		):
			log.error(f"invalid header: {b_head[:6]!r}")
			return False

		self.gzipOffset = gzipOffset = uintFromBytes(b_head[4:6])
		log.debug(f"Position of gz header: {gzipOffset}")

		if gzipOffset < 6:
			log.error(f"invalid gzip header position: {gzipOffset}")
			return False

		# the file is re-opened through FileOffS at the gzip position
		self.file = BGLGzipFile(
			fileobj=FileOffS(self._filename, gzipOffset),
			closeFileobj=True,
		)

		return True
Esempio n. 3
0
def languageInfoDecode(b_value):
    """
    Look up the language whose code is stored in b_value.

    b_value is converted to an integer with uintFromBytes and used as
    the index/key into languageByCode.

    Returns the matching languageByCode entry (a BabylonLanguage
    instance according to the original documentation), or None after
    logging a warning when the code is unknown.
    """
    intValue = uintFromBytes(b_value)
    try:
        return languageByCode[intValue]
    except LookupError:
        # LookupError is the common base of IndexError (sequences) and
        # KeyError (mappings), so an unknown code is handled the same way
        # whichever container type languageByCode is.
        log.warning(f"read_type_3: unknown language code = {intValue}")
        return None
Esempio n. 4
0
	def readType0(self, block):
		"""
		Handle a block of type 0.

		The first byte of block.data selects the meaning of the rest:
			2: a number related to the entry count; recognized, not used
			8: the default charset, stored in self.defaultCharset
		Any other code is reported through self.logUnknownBlock.

		Returns True if the code was recognized, False otherwise.
		"""
		code = block.data[0]
		if code == 2:
			# block.data[1:] holds a number that is very close to
			# self.bgl_numEntries, but does not always equal the number
			# of entries (see self.readType3, code == 12, as well).
			# Nothing consumes it, so it is deliberately not decoded.
			return True
		if code == 8:
			self.defaultCharset = charsetInfoDecode(block.data[1:])
			if not self.defaultCharset:
				log.warning("defaultCharset is not valid")
			return True
		self.logUnknownBlock(block)
		return False
Esempio n. 5
0
	def readType3(self, block):
		"""
		Process a block of type 3 and record the information it carries.

		The first two bytes of block.data are the info code, the rest is
		the raw value. The code is looked up in infoType3ByCode and,
		depending on the item found, the value ends up in one of:
			self.iconDataList  (item name ends with ".ico"; raw bytes)
			self.info          (decoded dict is merged; other values are
			                    stored under the item name)
			an attribute of self named after the item (when item.attr)

		Empty raw values, unknown codes and falsy decoded values are
		skipped. Always returns None.
		"""
		code = uintFromBytes(block.data[:2])
		b_value = block.data[2:]
		if not b_value:
			return
		# FIXME: should values consisting only of NUL bytes be skipped too?

		try:
			item = infoType3ByCode[code]
		except KeyError:
			# only worth reporting when there is some non-NUL content
			if b_value.strip(b"\x00"):
				log.debug(
					f"Unknown info type code={code:#02x}, b_value={b_value!r}",
				)
			return

		name = item.name
		decoder = item.decode

		if name.endswith(".ico"):
			# icons are kept as raw bytes, never decoded
			self.iconDataList.append((name, b_value))
			return

		# the result can be None, str, bytes or dict
		value = b_value if decoder is None else decoder(b_value)
		if not value:
			return

		if isinstance(value, dict):
			self.info.update(value)
		elif item.attr:
			setattr(self, name, value)
		else:
			self.info[name] = value
Esempio n. 6
0
	def readBytes(self, num):
		"""
		Read `num` bytes from self.file and decode them with uintFromBytes.

		`num` must be in the range 1..4 (inclusive); self.file is flushed
		before reading.

		Returns the decoded integer, or -1 on any failure: `num` out of
		range, end of file, or fewer bytes available than requested.
		"""
		if num > 4 or num < 1:
			log.error(f"invalid argument num={num}")
			return -1

		self.file.flush()
		buf = self.file.read(num)
		received = len(buf)
		if received == num:
			return uintFromBytes(buf)

		# short read: nothing at all means end of file (not an error)
		if received == 0:
			log.debug("readBytes: end of file: len(buf)==0")
		else:
			log.error(
				f"readBytes: expected to read {num} bytes"
				f", but found {len(buf)} bytes"
			)
		return -1
Esempio n. 7
0
def flagsInfoDecode(b_value):
    """
    Decode the glossary flags bit field stored in b_value.

    Returns a dict with these boolean keys:
        utf8Encoding (bit 0x8000 set)
            when this flag is set, utf8 encoding is used for all
            articles; when false, the encoding is set according to the
            source and target alphabet
        bgl_spellingAlternatives (bit 0x10000 NOT set)
            determines whether the glossary offers spelling
            alternatives for searched terms
        bgl_caseSensitive (bit 0x1000 set)
            defines if the search for terms in this glossary is case
            sensitive (see code 0x20 as well)
    """
    flags = uintFromBytes(b_value)
    utf8Encoding = bool(flags & 0x8000)
    # note the inverted sense: the feature is on when the bit is clear
    spellingAlternatives = not (flags & 0x10000)
    caseSensitive = bool(flags & 0x1000)
    return {
        "utf8Encoding": utf8Encoding,
        "bgl_spellingAlternatives": spellingAlternatives,
        "bgl_caseSensitive": caseSensitive,
    }
Esempio n. 8
0
def utf16InfoDecode(b_value):
    """
    Decode a UTF-16 message carried by a block of type 3.

    b_value is a byte array laid out as follows:
        byte 0     must be 0
        byte 1     0: no message follows, 1: a message follows
        bytes 2:6  message length in 2-byte chars
        bytes 6:8  expected to be zero
        bytes 8:   the message in utf-16

    Returns the decoded str, or None (after logging a warning where
    appropriate) when b_value is shorter than 2 bytes, byte 0 is not
    zero, byte 1 is neither 0 nor 1, or no message follows.

    Non-zero bytes at 6:8 and a length mismatch are only warned about;
    the message is decoded anyway. Errors raised by the utf-16 decoding
    itself are not caught here.
    """
    if len(b_value) < 2:
        # without this guard the indexing below raises IndexError
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, too short",
        )
        return None

    if b_value[0] != 0:
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, null expected at 0",
        )
        return None

    if b_value[1] == 0:
        if len(b_value) > 2:
            log.warning(
                f"utf16InfoDecode: unexpected b_value size: {len(b_value)}",
            )
        return None

    if b_value[1] > 1:
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, unexpected byte at 1",
        )
        return None

    # now b_value[1] == 1
    size = 2 * uintFromBytes(b_value[2:6])
    if tuple(b_value[6:8]) != (0, 0):
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, null expected at 6:8",
        )
    if size != len(b_value) - 8:
        log.warning(
            f"utf16InfoDecode: b_value={b_value!r}, size does not match",
        )

    return b_value[8:].decode("utf16")  # str
Esempio n. 9
0
def decodeBglBinTime(b_value):
    """
    Format a binary timestamp as "YYYY/MM/DD, HH:MM".

    The integer decoded from b_value is interpreted as a number of
    minutes counted from 1970/01/01 00:00.
    """
    epochJd = gregorian.to_jd(1970, 1, 1)
    totalMinutes = uintFromBytes(b_value)
    # split into whole days since the epoch and the minute of that day
    dayOffset, minuteOfDay = divmod(totalMinutes, 24 * 60)
    year, month, day = gregorian.jd_to(dayOffset + epochJd)
    hour, minute = divmod(minuteOfDay, 60)
    datePart = f"{year:04d}/{month:02d}/{day:02d}"
    timePart = f"{hour:02d}:{minute:02d}"
    return f"{datePart}, {timePart}"
Esempio n. 10
0
	def readEntry_Type11(self, block):
		"""
		Parse an entry block of type 11.

		Layout of block.data, as read here:
			<5 bytes: headword size><headword>
			<4 bytes: number of alternates>
			for each alternate: <4 bytes: size><alternate>
				(a zero size ends the list early)
			<4 bytes: definition size><definition>

		Side effects: raises self.wordLenMax and self.defiMaxBytes when
		this entry exceeds them.

		Returns (succeed, u_word, u_alts, u_defi):
			(False, None, None, None) if the block is truncated
			(True, u_word, u_alts, u_defi) otherwise; u_alts is a sorted
				list of unique alternates that never contains u_word
		"""
		Err = (False, None, None, None)
		pos = 0

		# reading headword
		if pos + 5 > len(block.data):
			log.error(
				f"reading block offset={block.offset:#02x}"
				f", reading word size: pos + 5 > len(block.data)"
			)
			return Err
		wordLen = uintFromBytes(block.data[pos:pos + 5])
		pos += 5
		if pos + wordLen > len(block.data):
			log.error(
				f"reading block offset={block.offset:#02x}"
				f", block.type={block.type}"
				f", reading word: pos + wordLen > len(block.data)"
			)
			return Err
		b_word = block.data[pos:pos + wordLen]
		u_word = self.processKey(b_word)
		pos += wordLen
		self.wordLenMax = max(self.wordLenMax, len(u_word))

		# reading number of alts
		if pos + 4 > len(block.data):
			log.error(
				f"reading block offset={block.offset:#02x}"
				f", reading alts count: pos + 4 > len(block.data)"
			)
			return Err
		altsCount = uintFromBytes(block.data[pos:pos + 4])
		pos += 4

		# reading alts
		# use set instead of list to prevent duplicates
		u_alts = set()
		for _ in range(altsCount):
			if pos + 4 > len(block.data):
				log.error(
					f"reading block offset={block.offset:#02x}"
					f", reading alt size: pos + 4 > len(block.data)"
				)
				return Err
			altLen = uintFromBytes(block.data[pos:pos + 4])
			pos += 4
			if altLen == 0:
				# a zero size terminates the list; it is only expected
				# at the very end of the block
				if pos + altLen != len(block.data):
					# no evidence
					log.warning(
						f"reading block offset={block.offset:#02x}"
						f", reading alt size: pos + altLen != len(block.data)"
					)
				break
			if pos + altLen > len(block.data):
				log.error(
					f"reading block offset={block.offset:#02x}"
					f", block.type={block.type}"
					f", reading alt: pos + altLen > len(block.data)"
				)
				return Err
			b_alt = block.data[pos:pos + altLen]
			u_alt = self.processAlternativeKey(b_alt, b_word)
			# Like entry key, alt is not processed as html by babylon,
			# so do we.
			u_alts.add(u_alt)
			pos += altLen
		# the headword itself is not an alternate
		u_alts.discard(u_word)
		u_alts = sorted(u_alts)

		# reading defi
		# NOTE(review): unlike the other size fields, this read has no
		# bounds check. After a zero-size alt at the end of the block the
		# slice is empty, and the code presumably relies on that decoding
		# to a zero size - confirm before adding a check here.
		defiLen = uintFromBytes(block.data[pos:pos + 4])
		pos += 4
		if pos + defiLen > len(block.data):
			log.error(
				f"reading block offset={block.offset:#02x}"
				f", block.type={block.type}"
				f", reading defi: pos + defiLen > len(block.data)"
			)
			return Err
		b_defi = block.data[pos:pos + defiLen]
		u_defi = self.processDefi(b_defi, b_word)
		self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi))

		return True, u_word, u_alts, u_defi
Esempio n. 11
0
	def collectDefiFields(self, b_defi, b_key, fields):
		"""
		Split a raw definition into its main text and trailing fields.

		entry definition structure:
		<main definition>[0x14 [{field_code}{field_data}]*]
		{field_code} is one byte
		{field_data} has a length that depends on the field code

		b_defi is the raw definition (bytes).
		b_key is the entry key (bytes); it is only used in log messages.
		fields is the object that receives the results: fields.b_defi is
		always set, the other attributes only when the matching field is
		present and well-formed.

		Parsing stops (with a debug message) at the first truncated or
		unknown field; whatever was collected up to that point is kept.
		A repeated field is reported and then overwrites the earlier one.

		Returns None.
		"""
		# d0 is index of the 0x14 byte in b_defi
		# d0 may be the last byte of the string
		d0 = self.findDefiFieldsStart(b_defi)
		if d0 == -1:
			# no trailing fields at all
			fields.b_defi = b_defi
			return

		fields.b_defi = b_defi[:d0]

		# i always points at the field code of the next field
		i = d0 + 1
		while i < len(b_defi):
			if self.metadata2:
				# statistics: count how often each field code occurs
				self.metadata2.defiTrailingFields[b_defi[i]] += 1

			if b_defi[i] == 0x02:
				# part of speech # "\x02" <one char - part of speech>
				if fields.partOfSpeech:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}"
						f":\nduplicate part of speech item",
					)
				if i + 1 >= len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nb_defi ends after \\x02"
					)
					return

				posCode = b_defi[i + 1]

				try:
					fields.partOfSpeech = partOfSpeechByCode[posCode]
				except KeyError:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}"
						f":\nunknown part of speech code = {posCode:#02x}"
					)
					return
				i += 2
			elif b_defi[i] == 0x06:  # \x06<one byte>
				if fields.b_field_06:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nduplicate type 6"
					)
				if i + 1 >= len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nb_defi ends after \\x06"
					)
					return
				fields.b_field_06 = b_defi[i + 1]
				i += 2
			elif b_defi[i] == 0x07:  # \x07<two bytes>
				# Found in 4 Hebrew dictionaries. I do not understand.
				if i + 3 > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x07"
					)
					return
				fields.b_field_07 = b_defi[i + 1:i + 3]
				i += 3
			elif b_defi[i] == 0x13:  # "\x13"<one byte - length><data>
				# known values:
				# 03 06 0D C7
				# 04 00 00 00 44
				# ...
				# 04 00 00 00 5F
				if i + 1 >= len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x13"
					)
					return
				Len = b_defi[i + 1]
				i += 2
				if Len == 0:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}\n"
						f"b_key = {b_key!r}:\nblank data after \\x13"
					)
					continue
				if i + Len > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}\n"
						f"b_key = {b_key!r}:\ntoo few data after \\x13"
					)
					return
				fields.b_field_13 = b_defi[i:i + Len]
				i += Len
			elif b_defi[i] == 0x18:
				# \x18<one byte - title length><entry title>
				if fields.b_title:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nduplicate entry title item"
					)
				if i + 1 >= len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}\n"
						f"b_key = {b_key!r}:\nb_defi ends after \\x18"
					)
					return
				i += 1
				Len = b_defi[i]
				i += 1
				if Len == 0:
					# a blank entry title is skipped without logging
					continue
				if i + Len > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}\n"
						f"b_key = {b_key!r}:\ntitle is too long"
					)
					return
				fields.b_title = b_defi[i:i + Len]
				i += Len
			elif b_defi[i] == 0x1a:  # "\x1a"<one byte - length><text>
				# found only in Hebrew dictionaries, I do not understand.
				if i + 1 >= len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x1a"
					)
					return
				Len = b_defi[i + 1]
				i += 2
				if Len == 0:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nblank data after \\x1a"
					)
					continue
				if i + Len > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x1a"
					)
					return
				fields.b_field_1a = b_defi[i:i + Len]
				i += Len
			elif b_defi[i] == 0x28:  # "\x28" <two bytes - length><html text>
				# title with transcription?
				if i + 2 >= len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x28"
					)
					return
				i += 1
				Len = uintFromBytes(b_defi[i:i + 2])
				i += 2
				if Len == 0:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nblank data after \\x28"
					)
					continue
				if i + Len > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x28"
					)
					return
				fields.b_title_trans = b_defi[i:i + Len]
				i += Len
			elif 0x40 <= b_defi[i] <= 0x4f:  # [\x40-\x4f] <one byte> <text>
				# often contains digits as text:
				# 56
				# &#0230;lps - key Alps
				# 48@i
				# has no apparent influence on the article
				code = b_defi[i]
				# the text length is encoded in the field code itself:
				# 0x40 -> 1 byte, ..., 0x4f -> 16 bytes
				Len = b_defi[i] - 0x3f
				if i + 2 + Len > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x40+"
					)
					return
				# skip the field code and the byte that follows it
				i += 2
				b_text = b_defi[i:i + Len]
				i += Len
				# the text is only logged, it is not stored in fields
				log.debug(
					f"unknown definition field {code:#02x}, b_text={b_text!r}"
				)
			elif b_defi[i] == 0x50:
				# \x50 <one byte> <one byte - length><data>
				if i + 2 >= len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x50"
					)
					return
				fields.code_transcription_50 = b_defi[i + 1]
				Len = b_defi[i + 2]
				i += 3
				if Len == 0:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nblank data after \\x50"
					)
					continue
				if i + Len > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x50"
					)
					return
				fields.b_transcription_50 = b_defi[i:i + Len]
				i += Len
			elif b_defi[i] == 0x60:
				# "\x60" <one byte> <two bytes - length> <text>
				if i + 4 > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x60"
					)
					return
				fields.code_transcription_60 = b_defi[i + 1]
				i += 2
				Len = uintFromBytes(b_defi[i:i + 2])
				i += 2
				if Len == 0:
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\nblank data after \\x60"
					)
					continue
				if i + Len > len(b_defi):
					log.debug(
						f"collecting definition fields, b_defi = {b_defi!r}"
						f"\nb_key = {b_key!r}:\ntoo few data after \\x60"
					)
					return
				fields.b_transcription_60 = b_defi[i:i + Len]
				i += Len
			else:
				# the length of an unknown field cannot be determined,
				# so the remaining bytes cannot be parsed
				log.debug(
					f"collecting definition fields, b_defi = {b_defi!r}"
					f"\nb_key = {b_key!r}"
					f":\nunknown control char. Char code = {b_defi[i]:#02x}"
				)
				return