def __init__(self, dict_file, mode="r"):
    """Load an LSD dictionary and select the decoder for its version.

    The whole file is read into a ``BitStream``, the header is parsed and
    the stream is positioned at ``header.dictionary_encoder_offset``.

    ``mode`` must be "r" or "w". It is accepted for backward compatibility
    only: the file is always opened read-only, because the constructor
    reads it immediately (opening it with 'wb' would truncate the
    dictionary and then fail on ``read()``).

    Raises:
        RuntimeError: if ``mode`` is neither "r" nor "w".
        LsdError: if the header magic is not 'LingVo' or the header
            version is not one of the supported values.
    """
    if mode not in ("r", "w"):
        raise RuntimeError('LsdFile() requires mode "r" or "w"')
    self.filename = dict_file
    self.parsed = False
    self.verbose = False
    # Always read-only: see the docstring for why 'wb' is never used.
    with open(dict_file, 'rb') as fp:
        self.bstr = BitStream(fp.read())
    self.header = Header(self.bstr)
    # only lsd
    if self.header.magic != 'LingVo':
        raise LsdError('Allow only Lsd "LingVo" ident: %s'
                       % repr(self.header.magic))
    self.bstr.seek(self.header.dictionary_encoder_offset)
    version = self.header.version
    if version == 0x142001:
        # user dictionaries
        self.decoder = decoder.UserDictionaryDecoder(self.bstr)
    elif version == 0x141004:
        # system dictionaries
        self.decoder = decoder.SystemDictionaryDecoder(self.bstr)
    elif version == 0x145001:
        # abbreviation dictionaries
        self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)
    else:
        # Show the header before failing so the unknown version is visible.
        self.header.dump(True)
        raise LsdError("Not supported dict version %s"
                       % hex(self.header.version))
    self.overlay = None   # set by parse()
    self.headings = []    # filled by parse()
    self.dict = []        # (heading text, article) pairs, filled by parse()
class LsdFile():
    """Decode an LSD dictionary file and export its content as DSL text.

    The file is loaded into a ``BitStream`` on construction. Call
    ``parse()`` to decode headings and articles, then ``write()`` to save
    the icon, the annotation and the ``.dsl`` file.
    """

    def __init__(self, dict_file, mode="r"):
        """Load ``dict_file`` and select the decoder for its version.

        ``mode`` must be "r" or "w". It is accepted for backward
        compatibility only: the file is always opened read-only, because
        the constructor reads it immediately (opening it with 'wb' would
        truncate the dictionary and then fail on ``read()``).

        Raises:
            RuntimeError: if ``mode`` is neither "r" nor "w".
            LsdError: if the header magic is not 'LingVo' or the header
                version is not one of the supported values.
        """
        if mode not in ("r", "w"):
            raise RuntimeError('LsdFile() requires mode "r" or "w"')
        self.filename = dict_file
        self.parsed = False
        self.verbose = False
        # Always read-only: see the docstring for why 'wb' is never used.
        with open(dict_file, 'rb') as fp:
            self.bstr = BitStream(fp.read())
        self.header = Header(self.bstr)
        # only lsd
        if self.header.magic != 'LingVo':
            raise LsdError('Allow only Lsd "LingVo" ident: %s'
                           % repr(self.header.magic))
        self.bstr.seek(self.header.dictionary_encoder_offset)
        version = self.header.version
        if version == 0x142001:
            # user dictionaries
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif version == 0x141004:
            # system dictionaries
            self.decoder = decoder.SystemDictionaryDecoder(self.bstr)
        elif version == 0x145001:
            # abbreviation dictionaries
            self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)
        else:
            # Show the header before failing so the version is visible.
            self.header.dump(True)
            raise LsdError("Not supported dict version %s"
                           % hex(self.header.version))
        self.overlay = None   # set by parse()
        self.headings = []    # filled by parse()
        self.dict = []        # (heading text, article) pairs

    @property
    def pages_count(self):
        """Number of 512-byte pages between pages_offset and pages_end."""
        return (self.header.pages_end - self.header.pages_offset) // 512

    def get_page_offset(self, page_number):
        """Return the stream offset of page ``page_number``."""
        return self.header.pages_offset + 512 * page_number

    def collect_headings(self):
        """Return the headings of all pages, in page order."""
        res = []
        for i in range(self.pages_count):
            res.extend(self.collect_heading_from_page(i))
        return res

    def collect_heading_from_page(self, page_number):
        """Return the headings stored on one page (empty for non-leaf pages)."""
        res = []
        self.bstr.seek(self.get_page_offset(page_number))
        page = CachePage(self.bstr)
        if page.is_leaf:
            # Each heading is constructed with the previous heading's text.
            prefix = ""
            for _ in range(page.headings_count):
                heading = ArticleHeading(self.decoder, self.bstr, prefix,
                                         self.header.version)
                prefix = heading.text
                res.append(heading)
        return res

    def decode_article(self, reference):
        """Decode the article at ``articles_offset + reference``."""
        self.bstr.seek(self.header.articles_offset + reference)
        size = self.bstr.read_bits(16)
        if size == 0xFFFF:
            # 0xFFFF marks that the real size follows as a 32-bit value.
            size = self.bstr.read_bits(32)
        res = self.decoder.decode_article(size)
        return res

    def parse(self):
        """Decode the overlay, all headings and all articles.

        Raises:
            Exception: if the number of decoded headings differs from
                ``header.entries_count``.
        """
        print("decoding dictionary..")
        self.overlay = OverlayReader(self.bstr, self.header.overlay_data)
        print("decoding headings: %d" % self.header.entries_count)
        self.headings = self.collect_headings()
        if len(self.headings) != self.header.entries_count:
            raise Exception("Decoded not all entries %d != %d"
                            % (len(self.headings), self.header.entries_count))
        print("decoding articles..")
        for h in self.headings:
            self.dict.append((h.ext_text, self.decode_article(h.reference)))
        self.parsed = True
        print("OK")

    @property
    def annotation(self):
        """Decode and return the annotation ("" when the seek fails)."""
        res = ""
        if self.bstr.seek(self.header.annotation_offset):
            size = self.bstr.read_bits(16)
            res = self.decoder.decode_article(size)
        return res

    def dump(self):
        """Dump the header and decoder; in verbose mode also the overlay."""
        self.header.dump(self.verbose)
        self.decoder.dump(self.verbose)
        # The overlay exists only after parse(); skip it until then.
        if self.verbose and self.overlay is not None:
            self.overlay.dump()

    def make_filename(self, path, ext):
        """Build an output name from the source file name and ``ext``.

        With an empty ``path`` the file lands next to the source file,
        otherwise inside ``path``.
        """
        base, orig_ext = os.path.splitext(self.filename)
        if path != "":
            base = os.path.join(path, os.path.basename(base))
        return base + '.' + ext

    def write_icon(self, path=""):
        """Write the dictionary icon to a ``.bmp`` file, if there is one."""
        if self.header.icon_size == 0:
            return
        ico_file = self.make_filename(path, "bmp")
        # Binary mode: the icon is raw data, not text.
        with open(ico_file, 'wb') as ico:
            ico.write(self.header.icon)
        print('Write icon: %s' % ico_file)

    def write_annotation(self, path=""):
        """Write the annotation to a UTF-16 ``.ann`` file, if not empty."""
        # Decode once: every access to the property decodes again.
        annotation = self.annotation
        if annotation == "":
            return
        ann_file = self.make_filename(path, "ann")
        with codecs.open(ann_file, 'w', encoding='utf-16') as ann:
            ann.write(annotation)
        print('Write annotation: %s' % ann_file)

    def write_prefix(self, path=""):
        """Write the decoder prefix to a UTF-8 ``.pref`` file, if not empty."""
        prefix = self.decoder.prefix
        if prefix == "":
            return
        pref_file = self.make_filename(path, "pref")
        with codecs.open(pref_file, 'w', encoding='utf-8') as pref:
            pref.write(prefix)
        print('Write prefix: %s' % pref_file)

    def write_overlay(self, path=""):
        """Placeholder: overlay export is not implemented."""
        pass

    @staticmethod
    def normalize_article(article):
        """Indent continuation lines of ``article`` with a tab."""
        res = article.replace(u'\n', u'\n\t')
        return res

    def write_dsl(self, path=""):
        """Write the parsed dictionary to a UTF-16 ``.dsl`` file."""
        if not self.dict:
            print("Nothing writing to dsl!")
            return
        dsl_file = self.make_filename(path, "dsl")
        with codecs.open(dsl_file, 'w', encoding='utf-16') as dsl:
            dsl.write(u"#NAME\t\"" + self.header.name + u"\"\n")
            dsl.write(u"#INDEX_LANGUAGE\t\""
                      + tools.lang_map[self.header.source_language]
                      + u"\"\n")
            dsl.write(u"#CONTENTS_LANGUAGE\t\""
                      + tools.lang_map[self.header.target_language]
                      + u"\"\n")
            if self.header.icon_size > 0:
                base, orig_ext = os.path.splitext(
                    os.path.basename(self.filename))
                dsl.write(u"#ICON_FILE\t\"" + base + '.' + "bmp" + u"\"\n")
            dsl.write(u"\n")
            for h, r in self.dict:
                dsl.write(h)
                dsl.write(u"\n\t")
                dsl.write(self.normalize_article(r))
                dsl.write(u"\n")
        print('Write dsl: %s' % dsl_file)

    def write(self, path=""):
        """Save all outputs of a parsed dictionary into ``path``.

        Raises:
            LsdError: if ``parse()`` has not been called yet.
        """
        if not self.parsed:
            raise LsdError("Must parsed first!")
        self.write_icon(path)
        self.write_annotation(path)
        self.write_overlay(path)
        self.write_dsl(path)
        if self.verbose:
            self.write_prefix(path)
def __init__(self, dict_file, verbose=False):
    """Load ``dict_file``, pick a decoder and read the extended header.

    The whole file is read into a ``BitStream`` over a ``bytearray``.
    After the decoder is chosen from the header version, the name,
    first/last heading, capitals, icon and the version-dependent trailing
    fields are read from the stream, which is finally positioned at
    ``header.dictionary_encoder_offset`` for decoding.

    Raises:
        LsdError: if the header magic is not 'LingVo'.
        SystemExit: with code 1 if the dictionary version is not
            supported (the header is dumped and a message printed first).
    """
    self.filename = dict_file
    self._readed = False
    self._parsed = False
    self.verbose = verbose
    with open(dict_file, 'rb') as fp:
        # bytearray: the x6 xor decoding rewrites the buffer in place.
        self.bstr = BitStream(bytearray(fp.read()))
    self.overlay = None
    self.headings = ArticleHeadingList()
    self.dict = []
    self.header = Header(self.bstr)
    # check magic
    if self.header.magic != u'LingVo':
        raise LsdError('Allow only Lsd "LingVo" ident: %s'
                       % repr(self.header.magic))
    # initialize decoder
    self.decoder = None
    hi_version = self.header.hi_version
    version = self.header.version
    if hi_version == 0x11:
        # lingvo 11 dictionary: 0x11001
        self.decoder = decoder.UserDictionaryDecoder(self.bstr)
    elif hi_version == 0x12:
        # lingvo 12 dictionary: 0x12001
        self.decoder = decoder.UserDictionaryDecoder(self.bstr)
    elif hi_version == 0x13:
        # x3 dictionary: 0x131001 and 0x132001 if pages count > 1000
        self.decoder = decoder.SystemDictionaryDecoder13(self.bstr)
    elif hi_version == 0x14:
        # x5 dictionary
        if version == 0x142001:
            # user dictionaries
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif version == 0x141004:
            # system dictionaries
            self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
        elif version == 0x145001:
            # abbreviation dictionaries
            self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)
    elif hi_version == 0x15:
        # x6 dictionary
        if version == 0x152001:
            # user dictionaries
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif version == 0x151005:
            # system dictionaries: the decoder data is xor-encoded and
            # must be decoded before the decoder reads it
            self.xor_block_x6(self.header.dictionary_encoder_offset,
                              self.header.articles_offset)
            self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
        elif version == 0x155001:
            # abbreviation dictionaries
            self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)
    if self.decoder is None:
        self.dump()
        print("Not supported dictionary version: %s"
              % hex(self.header.version))
        # Same exit status as the former exit(1), without relying on the
        # helper that the site module installs for interactive use.
        raise SystemExit(1)

    name_len = self.bstr.read_some(1)
    self.name = self.bstr.read_unicode(name_len, False)
    self.first_heading = self.bstr.read_unicode(self.bstr.read_byte(), False)
    self.last_heading = self.bstr.read_unicode(self.bstr.read_byte(), False)
    capitals_len = reverse32(self.bstr.read_int())
    self.capitals = self.bstr.read_unicode(capitals_len, False)
    # The fields below are read in stream order; do not regroup the
    # version checks.
    # icon v12+
    if self.header.version > 0x120000:
        self.icon_size = reverse16(self.bstr.read_word())
        self.icon = self.bstr.read(self.icon_size)
    else:
        self.icon_size = 0
        self.icon = None
    if self.header.version > 0x140000:
        self.header_checksum = reverse32(self.bstr.read_int())
    else:
        self.header_checksum = 0
    if self.header.version > 0x120000:
        self.pages_end = reverse32(self.bstr.read_int())
        self.overlay_data = reverse32(self.bstr.read_int())
    else:
        self.pages_end = self.bstr.length
        self.overlay_data = self.bstr.length  # no overlay
    if self.header.version > 0x140000:
        self.dummy1 = reverse32(self.bstr.read_int())
        self.dummy2 = reverse32(self.bstr.read_int())
    else:
        self.dummy1 = 0
        self.dummy2 = 0
    # set bstr pos for decoding
    self.bstr.seek(self.header.dictionary_encoder_offset)
class LsdFile:
    """Decode an LSD dictionary file and export its content as DSL text.

    Typical use: construct, then call ``write(path)``, which parses the
    dictionary on demand and saves the icon, annotation and ``.dsl`` file.
    """

    def __init__(self, dict_file, verbose=False):
        """Load ``dict_file``, pick a decoder and read the extended header.

        Raises:
            LsdError: if the header magic is not 'LingVo'.
            SystemExit: with code 1 if the dictionary version is not
                supported (the header is dumped and a message printed
                first).
        """
        self.filename = dict_file
        self._readed = False
        self._parsed = False
        # Decoded annotation, cached by read_annotation().
        self._annotation = None
        self.verbose = verbose
        with open(dict_file, 'rb') as fp:
            # bytearray: the x6 xor decoding rewrites the buffer in place.
            self.bstr = BitStream(bytearray(fp.read()))
        self.overlay = None
        self.headings = ArticleHeadingList()
        self.dict = []
        self.header = Header(self.bstr)
        # check magic
        if self.header.magic != u'LingVo':
            raise LsdError('Allow only Lsd "LingVo" ident: %s'
                           % repr(self.header.magic))
        # initialize decoder
        self.decoder = None
        hi_version = self.header.hi_version
        version = self.header.version
        if hi_version == 0x11:
            # lingvo 11 dictionary: 0x11001
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif hi_version == 0x12:
            # lingvo 12 dictionary: 0x12001
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif hi_version == 0x13:
            # x3 dictionary: 0x131001 and 0x132001 if pages count > 1000
            self.decoder = decoder.SystemDictionaryDecoder13(self.bstr)
        elif hi_version == 0x14:
            # x5 dictionary
            if version == 0x142001:
                # user dictionaries
                self.decoder = decoder.UserDictionaryDecoder(self.bstr)
            elif version == 0x141004:
                # system dictionaries
                self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
            elif version == 0x145001:
                # abbreviation dictionaries
                self.decoder = decoder.AbbreviationDictionaryDecoder(
                    self.bstr)
        elif hi_version == 0x15:
            # x6 dictionary
            if version == 0x152001:
                # user dictionaries
                self.decoder = decoder.UserDictionaryDecoder(self.bstr)
            elif version == 0x151005:
                # system dictionaries: the decoder data is xor-encoded
                # and must be decoded before the decoder reads it
                self.xor_block_x6(self.header.dictionary_encoder_offset,
                                  self.header.articles_offset)
                self.decoder = decoder.SystemDictionaryDecoder14(self.bstr)
            elif version == 0x155001:
                # abbreviation dictionaries
                self.decoder = decoder.AbbreviationDictionaryDecoder(
                    self.bstr)
        if self.decoder is None:
            self.dump()
            print("Not supported dictionary version: %s"
                  % hex(self.header.version))
            # Same exit status as the former exit(1), without relying on
            # the helper the site module installs for interactive use.
            raise SystemExit(1)

        name_len = self.bstr.read_some(1)
        self.name = self.bstr.read_unicode(name_len, False)
        self.first_heading = self.bstr.read_unicode(
            self.bstr.read_byte(), False)
        self.last_heading = self.bstr.read_unicode(
            self.bstr.read_byte(), False)
        capitals_len = reverse32(self.bstr.read_int())
        self.capitals = self.bstr.read_unicode(capitals_len, False)
        # The fields below are read in stream order; do not regroup the
        # version checks.
        # icon v12+
        if self.header.version > 0x120000:
            self.icon_size = reverse16(self.bstr.read_word())
            self.icon = self.bstr.read(self.icon_size)
        else:
            self.icon_size = 0
            self.icon = None
        if self.header.version > 0x140000:
            self.header_checksum = reverse32(self.bstr.read_int())
        else:
            self.header_checksum = 0
        if self.header.version > 0x120000:
            self.pages_end = reverse32(self.bstr.read_int())
            self.overlay_data = reverse32(self.bstr.read_int())
        else:
            self.pages_end = self.bstr.length
            self.overlay_data = self.bstr.length  # no overlay
        if self.header.version > 0x140000:
            self.dummy1 = reverse32(self.bstr.read_int())
            self.dummy2 = reverse32(self.bstr.read_int())
        else:
            self.dummy1 = 0
            self.dummy2 = 0
        # set bstr pos for decoding
        self.bstr.seek(self.header.dictionary_encoder_offset)

    # x6 system dictionary: table-based xor decoding.
    # Each block is xored starting with key=0x7f:
    # 1. dictionary_encoder_offset -> articles_offset
    #    must be decoded before decoder.read()
    # 2. annotation_offset -> dictionary_encoder_offset
    #    the annotation is decoded in read_annotation()
    # 3. each article is encoded individually:
    #    articles_offset + heading.reference ->
    #    articles_offset + heading.next_reference
    #    the article is decoded in read_article()
    def xor_block_x6(self, start, end, key=0x7f):
        """Xor-decode ``bstr.record[start:end]`` in place.

        The key for the next byte is looked up in ``xor_pad`` by the
        current encoded byte, so running the pass twice over the same
        range does NOT restore or keep the decoded data.

        Returns the key that would apply to the byte at ``end``.
        """
        for i in range(start, end):
            byte = self.bstr.record[i]
            self.bstr.record[i] = byte ^ key
            key = xor_pad[byte]
        return key

    @property
    def pages_count(self):
        """Number of 512-byte pages between pages_offset and pages_end."""
        return (self.pages_end - self.header.pages_offset) // 512

    def get_page_offset(self, page_number):
        """Return the stream offset of page ``page_number``."""
        return self.header.pages_offset + 512 * page_number

    def read_headings(self):
        """Read the headings of every page into ``self.headings``."""
        for i in range(self.pages_count):
            self.read_heading_from_page(i)
        # set last next_reference; nothing to patch when no heading was
        # read (indexing [-1] on an empty list would fail)
        if len(self.headings):
            self.headings[-1].next_reference = (
                self.header.pages_offset - self.header.articles_offset)

    def merge_headings(self):
        """Return headings with multi-title articles merged.

        Consecutive headings sharing one ``reference`` are merged into the
        first of them, and ``next_reference`` of each kept heading is set
        to the ``reference`` of the following kept heading. An empty
        heading list yields an empty result.
        """
        res = []
        if not len(self.headings):
            return res
        # fill next_reference in the headings
        prev = self.headings[0]
        res.append(prev)
        for i in range(1, len(self.headings)):
            h = self.headings[i]
            if prev.reference == h.reference:
                # multititle article
                prev.merge(h)
            else:
                res[-1].next_reference = h.reference
                res.append(h)
                prev = h
        # set next_reference for last item to the pages_offset
        res[-1].next_reference = (
            self.header.pages_offset - self.header.articles_offset)
        return res

    def read_heading_from_page(self, page_number):
        """Append the headings of one page (leaf pages only)."""
        self.bstr.seek(self.get_page_offset(page_number))
        page = CachePage(self.bstr)
        if page.is_leaf:
            # Each read() gets the prefix returned by the previous one.
            prefix = ""
            for _ in range(page.headings_count):
                heading = ArticleHeading()
                prefix = heading.read(self.decoder, self.bstr, prefix)
                self.headings.append(heading)

    def read_article(self, heading):
        """Decode and return the article ``heading`` refers to.

        For version 0x151005 the article bytes are xor-decoded in place
        first, so this must be called only once per heading.
        """
        self.bstr.seek(self.header.articles_offset + heading.reference)
        if self.header.version == 0x151005:
            # xor article
            self.xor_block_x6(
                self.header.articles_offset + heading.reference,
                self.header.articles_offset + heading.next_reference)
        size = self.bstr.read_bits(16)
        if size == 0xFFFF:
            # 0xFFFF marks that the real size follows as a 32-bit value.
            size = self.bstr.read_bits(32)
        res = self.decoder.decode_article(size)
        return res

    def read_annotation(self):
        """Decode and return the annotation ("" when the seek fails).

        The result is cached: for version 0x151005 the annotation is
        xor-decoded in place, and repeating that pass would corrupt it.
        """
        cached = getattr(self, '_annotation', None)
        if cached is not None:
            return cached
        if self.header.version == 0x151005:
            # xor annotation
            self.xor_block_x6(self.header.annotation_offset,
                              self.header.dictionary_encoder_offset)
        res = ""
        if self.bstr.seek(self.header.annotation_offset):
            size = self.bstr.read_bits(16)
            res = self.decoder.decode_article(size)
        self._annotation = res
        return res

    @property
    def readed(self):
        """True once ``read()`` has completed."""
        return self._readed

    def read(self):
        """Let the decoder read its data from the stream."""
        if self.verbose:
            print("reading dictionary..")
        self.decoder.read()
        self._readed = True

    @property
    def parsed(self):
        """True once ``parse()`` has completed."""
        return self._parsed

    def parse(self):
        """Decode the overlay, all headings and all articles.

        Calls ``read()`` first if that has not happened yet.

        Raises:
            LsdError: if the number of appended headings differs from
                ``header.entries_count``.
        """
        if not self.readed:
            self.read()
        if self.verbose:
            print("decoding overlay..")
        self.overlay = OverlayReader(self.bstr, self.overlay_data)

        if self.verbose:
            print("decoding headings: %d" % self.header.entries_count)
        self.read_headings()
        if self.headings.appended != self.header.entries_count:
            raise LsdError("Decoded not all entries %d != %d"
                           % (self.headings.appended,
                              self.header.entries_count))
        # merge multititle headings
        # self.headings = self.merge_headings()

        if self.verbose:
            print("decoding articles: %d" % len(self.headings))
        for h in self.headings:
            # h.dump()
            self.dict.append((h, self.read_article(h)))
        self._parsed = True
        if self.verbose:
            print("OK")

    def write(self, path=""):
        """ save decoded dictionary """
        if not self.parsed:
            self.parse()
        self.write_icon(path)
        self.write_annotation(path)
        self.write_overlay(path)
        self.write_dsl(path)
        if self.verbose:
            self.write_prefix(path)

    def make_filename(self, path, ext):
        """Build an output name from the source file name and ``ext``.

        With an empty ``path`` the file lands next to the source file,
        otherwise inside ``path``.
        """
        base, orig_ext = os.path.splitext(self.filename)
        if path != "":
            base = os.path.join(path, os.path.basename(base))
        return base + '.' + ext

    def write_icon(self, path=""):
        """Write the dictionary icon to a ``.bmp`` file, if there is one."""
        if self.icon_size == 0:
            return
        ico_file = self.make_filename(path, "bmp")
        with open(ico_file, 'wb') as ico:
            ico.write(self.icon)
        if self.verbose:
            print('Write icon: %s' % ico_file)

    def write_annotation(self, path=""):
        """Write the annotation to a UTF-16 ``.ann`` file, if not empty."""
        annotation = self.read_annotation()
        if annotation == "":
            return
        ann_file = self.make_filename(path, "ann")
        with codecs.open(ann_file, 'w', encoding='utf-16') as ann:
            ann.write(annotation)
        if self.verbose:
            print('Write annotation: %s' % ann_file)

    def write_prefix(self, path=""):
        """Write the decoder prefix to a UTF-8 ``.pref`` file, if not empty."""
        if self.decoder.prefix == "":
            return
        pref_file = self.make_filename(path, "pref")
        with codecs.open(pref_file, 'w', encoding='utf-8') as pref:
            pref.write(self.decoder.prefix)
        if self.verbose:
            print('Write prefix: %s' % pref_file)

    def write_overlay(self, path=""):
        """Placeholder: overlay export is not implemented."""
        pass

    @staticmethod
    def normalize_article(article):
        """Indent continuation lines of ``article`` with a tab."""
        res = article.replace(u'\n', u'\n\t')
        return res

    def write_dsl(self, path=""):
        """Write the parsed dictionary to a UTF-16 ``.dsl`` file."""
        if len(self.dict) == 0:
            print("Nothing writing to dsl!")
            return
        dsl_file = self.make_filename(path, "dsl")
        with codecs.open(dsl_file, 'w', encoding='utf-16') as dsl:
            dsl.write(u"#NAME\t\"" + self.name + u"\"\n")
            dsl.write(u"#INDEX_LANGUAGE\t\""
                      + tools.lang_map[self.header.source_language]
                      + u"\"\n")
            dsl.write(u"#CONTENTS_LANGUAGE\t\""
                      + tools.lang_map[self.header.target_language]
                      + u"\"\n")
            if self.icon_size > 0:
                base, orig_ext = os.path.splitext(
                    os.path.basename(self.filename))
                dsl.write(u"#ICON_FILE\t\"" + base + '.' + "bmp" + u"\"\n")
            dsl.write(u"\n")
            for h, r in self.dict:
                if h.simple:
                    dsl.write(h.get_first_ext_text())
                    dsl.write(u"\n\t")
                else:
                    # one line per title, then the shared article body
                    for item in h.headings:
                        dsl.write(item.ext_text)
                        dsl.write(u"\n")
                    dsl.write(u"\t")
                dsl.write(self.normalize_article(r))
                dsl.write(u"\n")
        if self.verbose:
            print('Write dsl: %s' % dsl_file)

    def dump(self):
        """Print the header and, when available, the decoded details."""
        self.header.dump()
        # dump header for not supported versions
        if self.decoder is not None:
            print("Name:               %s" % self.name)
            print("First heading:      %s" % self.first_heading)
            print("Last heading:       %s" % self.last_heading)
            print("Capitals:           %s" % self.capitals)
            print("Pages end:          %s" % hex(self.pages_end))
            print("Overlay data:       %s" % hex(self.overlay_data))
            print("Pages count:        %d"
                  % ((self.pages_end - self.header.pages_offset) // 512))
            if self.header.version > 0x140000:
                print("dummy1:             %s" % hex(self.dummy1))
                print("dummy2:             %s" % hex(self.dummy2))
            print("Icon enable:        %s" % (self.icon_size > 0))
            if self.readed:
                self.decoder.dump()
                # The overlay exists only after parse(), not after read().
                if self.overlay is not None:
                    self.overlay.dump()
class LsdFile():
    """Decode an LSD dictionary file and export its content as DSL text.

    The file is loaded into a ``BitStream`` on construction. Call
    ``parse()`` to decode headings and articles, then ``write()`` to save
    the icon, the annotation and the ``.dsl`` file.
    """

    def __init__(self, dict_file, mode="r"):
        """Load ``dict_file`` and select the decoder for its version.

        ``mode`` must be "r" or "w". It is accepted for backward
        compatibility only: the file is always opened read-only, because
        the constructor reads it immediately (opening it with 'wb' would
        truncate the dictionary and then fail on ``read()``).

        Raises:
            RuntimeError: if ``mode`` is neither "r" nor "w".
            LsdError: if the header magic is not 'LingVo' or the header
                version is not one of the supported values.
        """
        if mode not in ("r", "w"):
            raise RuntimeError('LsdFile() requires mode "r" or "w"')
        self.filename = dict_file
        self.parsed = False
        self.verbose = False
        # Always read-only: see the docstring for why 'wb' is never used.
        with open(dict_file, 'rb') as fp:
            self.bstr = BitStream(fp.read())
        self.header = Header(self.bstr)
        # only lsd
        if self.header.magic != 'LingVo':
            raise LsdError('Allow only Lsd "LingVo" ident: %s'
                           % repr(self.header.magic))
        self.bstr.seek(self.header.dictionary_encoder_offset)
        version = self.header.version
        if version == 0x142001:
            # user dictionaries
            self.decoder = decoder.UserDictionaryDecoder(self.bstr)
        elif version == 0x141004:
            # system dictionaries
            self.decoder = decoder.SystemDictionaryDecoder(self.bstr)
        elif version == 0x145001:
            # abbreviation dictionaries
            self.decoder = decoder.AbbreviationDictionaryDecoder(self.bstr)
        else:
            # Show the header before failing so the version is visible.
            self.header.dump(True)
            raise LsdError("Not supported dict version %s"
                           % hex(self.header.version))
        self.overlay = None   # set by parse()
        self.headings = []    # filled by parse()
        self.dict = []        # (heading text, article) pairs

    @property
    def pages_count(self):
        """Number of 512-byte pages between pages_offset and pages_end."""
        return (self.header.pages_end - self.header.pages_offset) // 512

    def get_page_offset(self, page_number):
        """Return the stream offset of page ``page_number``."""
        return self.header.pages_offset + 512 * page_number

    def collect_headings(self):
        """Return the headings of all pages, in page order."""
        res = []
        for i in range(self.pages_count):
            res.extend(self.collect_heading_from_page(i))
        return res

    def collect_heading_from_page(self, page_number):
        """Return the headings stored on one page (empty for non-leaf pages)."""
        res = []
        self.bstr.seek(self.get_page_offset(page_number))
        page = CachePage(self.bstr)
        if page.is_leaf:
            # Each heading is constructed with the previous heading's text.
            prefix = ""
            for _ in range(page.headings_count):
                heading = ArticleHeading(self.decoder, self.bstr, prefix,
                                         self.header.version)
                prefix = heading.text
                res.append(heading)
        return res

    def decode_article(self, reference):
        """Decode the article at ``articles_offset + reference``."""
        self.bstr.seek(self.header.articles_offset + reference)
        size = self.bstr.read_bits(16)
        if size == 0xFFFF:
            # 0xFFFF marks that the real size follows as a 32-bit value.
            size = self.bstr.read_bits(32)
        res = self.decoder.decode_article(size)
        return res

    def parse(self):
        """Decode the overlay, all headings and all articles.

        Raises:
            Exception: if the number of decoded headings differs from
                ``header.entries_count``.
        """
        print("decoding dictionary..")
        self.overlay = OverlayReader(self.bstr, self.header.overlay_data)
        print("decoding headings: %d" % self.header.entries_count)
        self.headings = self.collect_headings()
        if len(self.headings) != self.header.entries_count:
            raise Exception("Decoded not all entries %d != %d"
                            % (len(self.headings), self.header.entries_count))
        print("decoding articles..")
        for h in self.headings:
            self.dict.append((h.ext_text, self.decode_article(h.reference)))
        self.parsed = True
        print("OK")

    @property
    def annotation(self):
        """Decode and return the annotation ("" when the seek fails)."""
        res = ""
        if self.bstr.seek(self.header.annotation_offset):
            size = self.bstr.read_bits(16)
            res = self.decoder.decode_article(size)
        return res

    def dump(self):
        """Dump the header and decoder; in verbose mode also the overlay."""
        self.header.dump(self.verbose)
        self.decoder.dump(self.verbose)
        # The overlay exists only after parse(); skip it until then.
        if self.verbose and self.overlay is not None:
            self.overlay.dump()

    def make_filename(self, path, ext):
        """Build an output name from the source file name and ``ext``.

        With an empty ``path`` the file lands next to the source file,
        otherwise inside ``path``.
        """
        base, orig_ext = os.path.splitext(self.filename)
        if path != "":
            base = os.path.join(path, os.path.basename(base))
        return base + '.' + ext

    def write_icon(self, path=""):
        """Write the dictionary icon to a ``.bmp`` file, if there is one."""
        if self.header.icon_size == 0:
            return
        ico_file = self.make_filename(path, "bmp")
        # Binary mode: the icon is raw data, not text.
        with open(ico_file, 'wb') as ico:
            ico.write(self.header.icon)
        print('Write icon: %s' % ico_file)

    def write_annotation(self, path=""):
        """Write the annotation to a UTF-16 ``.ann`` file, if not empty."""
        # Decode once: every access to the property decodes again.
        annotation = self.annotation
        if annotation == "":
            return
        ann_file = self.make_filename(path, "ann")
        with codecs.open(ann_file, 'w', encoding='utf-16') as ann:
            ann.write(annotation)
        print('Write annotation: %s' % ann_file)

    def write_prefix(self, path=""):
        """Write the decoder prefix to a UTF-8 ``.pref`` file, if not empty."""
        prefix = self.decoder.prefix
        if prefix == "":
            return
        pref_file = self.make_filename(path, "pref")
        with codecs.open(pref_file, 'w', encoding='utf-8') as pref:
            pref.write(prefix)
        print('Write prefix: %s' % pref_file)

    def write_overlay(self, path=""):
        """Placeholder: overlay export is not implemented."""
        pass

    @staticmethod
    def normalize_article(article):
        """Indent continuation lines of ``article`` with a tab."""
        res = article.replace(u'\n', u'\n\t')
        return res

    def write_dsl(self, path=""):
        """Write the parsed dictionary to a UTF-16 ``.dsl`` file."""
        if not self.dict:
            print("Nothing writing to dsl!")
            return
        dsl_file = self.make_filename(path, "dsl")
        with codecs.open(dsl_file, 'w', encoding='utf-16') as dsl:
            dsl.write(u"#NAME\t\"" + self.header.name + u"\"\n")
            dsl.write(u"#INDEX_LANGUAGE\t\""
                      + tools.lang_map[self.header.source_language]
                      + u"\"\n")
            dsl.write(u"#CONTENTS_LANGUAGE\t\""
                      + tools.lang_map[self.header.target_language]
                      + u"\"\n")
            if self.header.icon_size > 0:
                base, orig_ext = os.path.splitext(
                    os.path.basename(self.filename))
                dsl.write(u"#ICON_FILE\t\"" + base + '.' + "bmp" + u"\"\n")
            dsl.write(u"\n")
            for h, r in self.dict:
                dsl.write(h)
                dsl.write(u"\n\t")
                dsl.write(self.normalize_article(r))
                dsl.write(u"\n")
        print('Write dsl: %s' % dsl_file)

    def write(self, path=""):
        """Save all outputs of a parsed dictionary into ``path``.

        Raises:
            LsdError: if ``parse()`` has not been called yet.
        """
        if not self.parsed:
            raise LsdError("Must parsed first!")
        self.write_icon(path)
        self.write_annotation(path)
        self.write_overlay(path)
        self.write_dsl(path)
        if self.verbose:
            self.write_prefix(path)