class Reader(object):
    """Reader for an MDict ``.mdx`` dictionary and its sibling ``.mdd`` files.

    ``open()`` scans the MDX once to collect ``@@@LINK=`` redirect entries,
    so that iteration can yield every main word together with its
    alternative headwords.  After the word entries, iteration yields one
    data entry per resource stored in the MDD files.
    """

    _encoding: str = ""
    _substyle: bool = True

    def __init__(self, glos):
        """Bind the reader to glossary object ``glos`` and reset its state."""
        self._glos = glos
        self.clear()
        self._re_internal_link = re.compile('href=(["\'])(entry://|[dx]:)')

    def clear(self):
        """Reset all per-file state to the "not open" defaults."""
        self._filename = ""
        self._mdx = None
        self._mdd = []
        self._wordCount = 0
        self._dataEntryCount = 0
        # dict of mainWord -> newline-separated alternatives
        self._linksDict = {}  # type: Dict[str, str]

    def open(self, filename):
        """Open ``filename`` (an .mdx file) and any matching .mdd files.

        Also copies the header's title/description into the glossary info
        and pre-scans the link entries via ``loadLinks()``.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD

        self._filename = filename
        self._mdx = MDX(filename, self._encoding, self._substyle)

        # Multiple MDD files are supported with this naming schema:
        #   FILE.mdx
        #   FILE.mdd
        #   FILE.1.mdd
        #   FILE.2.mdd
        #   FILE.3.mdd
        filenameNoExt = splitext(self._filename)[0]
        mddBase = "".join([filenameNoExt, extsep])
        # FILE.mdd and FILE.1.mdd are each optional ...
        for fname in (f"{mddBase}mdd", f"{mddBase}1.mdd"):
            if isfile(fname):
                self._mdd.append(MDD(fname))
        # ... while FILE.2.mdd onwards must be consecutive.
        mddN = 2
        while isfile(f"{mddBase}{mddN}.mdd"):
            self._mdd.append(MDD(f"{mddBase}{mddN}.mdd"))
            mddN += 1

        dataEntryCount = sum(len(mdd) for mdd in self._mdd)
        self._dataEntryCount = dataEntryCount
        log.info(
            f"Found {len(self._mdd)} mdd files with {dataEntryCount} entries"
        )

        log.debug("mdx.header = " + pformat(self._mdx.header))
        try:
            title = self._mdx.header[b"Title"]
        except KeyError:
            pass
        else:
            self._glos.setInfo("name", title)
        desc = self._mdx.header.get(b"Description", "")
        if desc:
            self._glos.setInfo("description", desc)

        self.loadLinks()

    def loadLinks(self):
        """Scan the MDX once, collecting link entries and counting words.

        An entry whose definition starts with ``@@@LINK=`` is recorded in
        ``self._linksDict`` (target word -> newline-separated alternatives)
        and is not counted; every other entry increments the word count.
        """
        from pyglossary.plugin_lib.readmdict import MDX

        log.info("extracting links...")
        linkPrefix = "@@@LINK="
        linksDict = {}
        wordCount = 0
        for b_word, b_defi in self._mdx.items():
            word = b_word.decode("utf-8")
            defi = b_defi.decode("utf-8").strip()
            if not defi.startswith(linkPrefix):
                wordCount += 1
                continue
            if not word:
                log.warning(f"unexpected defi: {defi}")
                continue
            mainWord = defi[len(linkPrefix):]
            if mainWord in linksDict:
                linksDict[mainWord] += "\n" + word
            else:
                linksDict[mainWord] = word

        log.info(
            "extracting links done, "
            f"sizeof(linksDict)={sys.getsizeof(linksDict)}"
        )
        log.info(f"wordCount = {wordCount}")
        self._linksDict = linksDict
        self._wordCount = wordCount
        # Create a fresh MDX object for the later pass in __iter__
        # (presumably because the scan above consumed the first one).
        self._mdx = MDX(self._filename, self._encoding, self._substyle)

    def __iter__(self):
        """Yield word entries from the MDX, then data entries from the MDDs.

        Link entries are skipped; their words are attached to the target
        entry as alternatives.  The reader is exhausted afterwards
        (``_mdx`` is set to None and ``_mdd`` is emptied).
        """
        if self._mdx is None:
            log.error("trying to iterate on a closed MDX file")
            return

        glos = self._glos
        linksDict = self._linksDict
        for b_word, b_defi in self._mdx.items():
            word = b_word.decode("utf-8")
            defi = b_defi.decode("utf-8").strip()
            if defi.startswith("@@@LINK="):
                continue
            # Rewrite MDict-internal hrefs to bword:// links.
            defi = self._re_internal_link.sub(r'href=\1bword://', defi)
            defi = defi.replace(' src="file://', ' src="')
            words = word
            altsStr = linksDict.get(word, "")
            if altsStr:
                words = [word] + altsStr.split("\n")
            yield glos.newEntry(words, defi)

        # Drop the MDX and the links table before reading the MDD files.
        self._mdx = None
        del linksDict
        self._linksDict = {}
        gc.collect()

        for mdd in self._mdd:
            try:
                for b_fname, b_data in mdd.items():
                    fname = toStr(b_fname)
                    fname = fname.replace("\\", os.sep).lstrip(os.sep)
                    yield glos.newDataEntry(fname, b_data)
            except Exception:
                # Best effort: log the failure and go on to the next MDD.
                log.exception(f"Error reading {mdd.filename}")
        self._mdd = []

    def __len__(self):
        """Return the number of word entries plus MDD data entries."""
        return self._wordCount + self._dataEntryCount

    def close(self):
        """Release the open files by resetting the reader state."""
        self.clear()
class Reader(object):
    """Reader for an MDict ``.mdx`` file and its sibling ``.mdd`` files.

    Iteration yields one glossary entry per MDX item, followed by one data
    entry per item of every MDD file found next to the MDX.
    """

    def __init__(self, glos):
        """Remember the glossary object and start in the closed state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Put every per-file attribute back to its default."""
        self._filename = ""
        self._encoding = ""
        self._substyle = True
        self._mdx = None
        self._mdd = []

    def open(self, filename, encoding="", substyle=True):
        """Open the MDX at ``filename`` and collect its MDD companions.

        MDD files are looked up as FILE.mdd, FILE.1.mdd, then FILE.2.mdd,
        FILE.3.mdd, ... until a number is missing.  The header's title and
        description (when present) are stored in the glossary info.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD

        self._filename = filename
        self._encoding = encoding
        self._substyle = substyle
        self._mdx = MDX(filename, self._encoding, self._substyle)

        stem = splitext(self._filename)[0]
        mddBase = "".join([stem, extsep])

        # The unnumbered file and number 1 are both optional.
        for candidate in (f"{mddBase}mdd", f"{mddBase}1.mdd"):
            if isfile(candidate):
                self._mdd.append(MDD(candidate))

        # From number 2 on, stop at the first gap.
        index = 2
        while True:
            candidate = f"{mddBase}{index}.mdd"
            if not isfile(candidate):
                break
            self._mdd.append(MDD(candidate))
            index += 1

        log.info(f"Found {len(self._mdd)} mdd files")
        log.debug("mdx.header = " + pformat(self._mdx.header))

        try:
            title = self._mdx.header[b"Title"]
        except KeyError:
            pass
        else:
            self._glos.setInfo("name", title)

        desc = self._mdx.header.get(b"Description", "")
        if desc:
            self._glos.setInfo("description", desc)

    def __iter__(self):
        """Yield word entries, then MDD data entries; exhausts the reader."""
        if self._mdx is not None:
            for b_word, b_defi in self._mdx.items():
                word = toStr(b_word)
                defi = toStr(b_defi)
                yield self._glos.newEntry(word, defi)
            self._mdx = None
        else:
            log.error("trying to iterate on a closed MDX file")

        for mdd in self._mdd:
            for b_fname, b_data in mdd.items():
                # MDD keys use backslashes; make them relative OS paths.
                fname = toStr(b_fname).replace("\\", os.sep).lstrip(os.sep)
                yield self._glos.newDataEntry(fname, b_data)
        self._mdd = []

    def __len__(self):
        """Return the MDX entry count, or 0 (with an error log) if closed."""
        if self._mdx is not None:
            return len(self._mdx)
        log.error(
            "OctopusMdict: called len(reader) while reader is not open")
        return 0

    def close(self):
        """Close the reader by resetting its state."""
        self.clear()
class Reader(object):
    """Reader for an MDict ``.mdx`` file with an optional ``.mdd`` companion.

    On ``open()`` the resources stored in the MDD file (if one exists) are
    written to a data directory on disk; iteration then yields one
    ``Entry`` per MDX item.
    """

    def __init__(self, glos):
        """Remember the glossary object and start in the closed state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset all per-file state to the "not open" defaults."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """Open the MDX at ``filename`` and extract its MDD data files.

        Recognised options: ``encoding`` (default ''), ``substyle``
        (default True) and ``resPath`` (data directory, default
        ``<filename without extension>_files``).  A failure while saving
        the data files is logged and does not abort opening.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD

        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)

        filenameNoExt = splitext(self._filename)[0]
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename

        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header[b'Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo(
            'description',
            self._mdx.header.get(b'Description', ''),
        )

        # Best effort, but only for ordinary errors: KeyboardInterrupt and
        # SystemExit must still propagate.
        try:
            self.writeDataFiles()
        except Exception:
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """Write every MDD item to a file under the data directory.

        Does nothing when no MDD file was found.  Drops the MDD object
        once all items have been written.
        """
        if not self._mdd:
            return
        if not isdir(self._dataDir):
            os.makedirs(self._dataDir)
        for key, value in self._mdd.items():
            key = toStr(key)
            # The key is appended to the data dir without a separator, so
            # it presumably starts with a backslash.
            # NOTE(review): keys come from the MDD file and are not
            # normalised; a key containing ".." could escape the data
            # dir - confirm whether inputs are trusted.
            fpath = ''.join([self._dataDir, key.replace('\\', os.path.sep)])
            parent = dirname(fpath)
            if not isdir(parent):
                os.makedirs(parent)
            log.debug('saving MDict data file: %s' % fpath)
            # "with" closes the handle even if write() raises.
            with open(fpath, 'wb') as f:
                f.write(value)
        self._mdd = None

    def __iter__(self):
        """Yield an ``Entry`` per MDX item, then mark the MDX as consumed."""
        if self._mdx is None:
            log.error('trying to iterate on a closed MDX file')
        else:
            for word, defi in self._mdx.items():
                word = toStr(word)
                defi = toStr(defi)
                yield Entry(word, defi)
            self._mdx = None

    def __len__(self):
        """Return the MDX entry count, or 0 (with an error log) if closed."""
        if self._mdx is None:
            log.error(
                'OctopusMdict: called len(reader) while reader is not open')
            return 0
        return len(self._mdx)

    def close(self):
        """Close the reader by resetting its state."""
        self.clear()
class Reader(object):
    """Reader for an MDict ``.mdx`` file with an optional ``.mdd`` companion.

    On ``open()`` the resources stored in the MDD file (if one exists) are
    written to a data directory on disk; iteration yields one ``Entry``
    per MDX item.
    """

    def __init__(self, glos):
        """Remember the glossary object and start in the closed state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset all per-file state to the "not open" defaults."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """Open the MDX at ``filename`` and extract its MDD data files.

        Recognised options: ``encoding`` (default ''), ``substyle``
        (default True) and ``resPath`` (data directory, default
        ``<filename without extension>_files``).  A failure while saving
        the data files is logged and does not abort opening.
        """
        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)

        filenameNoExt = splitext(self._filename)[0]
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename

        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header['Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo(
            'description',
            self._mdx.header.get('Description', ''),
        )

        # Best effort, but only for ordinary errors: KeyboardInterrupt and
        # SystemExit must still propagate.
        try:
            self.writeDataFiles()
        except Exception:
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """Write every MDD item to a file under the data directory.

        Does nothing when no MDD file was found.
        """
        if not self._mdd:
            return
        if not isdir(self._dataDir):
            os.makedirs(self._dataDir)
        for key, value in self._mdd.items():
            # The key is appended to the data dir without a separator, so
            # it presumably starts with a backslash.
            # NOTE(review): keys come from the MDD file and are not
            # normalised; a key containing ".." could escape the data
            # dir - confirm whether inputs are trusted.
            fpath = ''.join([self._dataDir, key.replace('\\', os.path.sep)])
            parent = dirname(fpath)
            if not isdir(parent):
                os.makedirs(parent)
            log.debug('saving MDict data file: %s' % fpath)
            # "with" closes the handle even if write() raises.
            with open(fpath, 'wb') as f:
                f.write(value)

    def __iter__(self):
        """Yield an ``Entry`` per MDX item.

        There is no closed-reader check here: the reader must be open.
        """
        for word, defi in self._mdx.items():
            yield Entry(word, defi)

    def __len__(self):
        """Return the number of items in the MDX (the reader must be open)."""
        return len(self._mdx)

    def close(self):
        """Close the reader by resetting its state."""
        self.clear()
class Reader(object):
    """Reader for an MDict ``.mdx`` file with an optional ``.mdd`` companion.

    Iteration yields one glossary entry per MDX item, followed by one data
    entry per MDD item when an MDD file was found.
    """

    def __init__(self, glos):
        """Remember the glossary object and start in the closed state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Put every per-file attribute back to its default."""
        self._filename = ""
        self._encoding = ""
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ""

    def open(self, filename, **options):
        """Open the MDX at ``filename`` and FILE.mdd next to it, if present.

        Recognised options: ``encoding`` (default "") and ``substyle``
        (default True).  The header's title (when present) and description
        are stored in the glossary info.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD

        self._filename = filename
        self._encoding = options.get("encoding", "")
        self._substyle = options.get("substyle", True)
        self._mdx = MDX(filename, self._encoding, self._substyle)

        stem = splitext(self._filename)[0]
        candidate = "".join([stem, extsep, "mdd"])
        if isfile(candidate):
            self._mdd = MDD(candidate)
            self._mddFilename = candidate

        log.pretty(self._mdx.header, "mdx.header=")

        try:
            title = self._mdx.header[b"Title"]
        except KeyError:
            pass
        else:
            self._glos.setInfo("title", title)

        description = self._mdx.header.get(b"Description", "")
        self._glos.setInfo("description", description)

    def __iter__(self):
        """Yield word entries, then MDD data entries; exhausts the reader."""
        if self._mdx is not None:
            for b_word, b_defi in self._mdx.items():
                word = toStr(b_word)
                defi = toStr(b_defi)
                yield self._glos.newEntry(word, defi)
            self._mdx = None
        else:
            log.error("trying to iterate on a closed MDX file")

        if not self._mdd:
            return
        for b_fname, b_data in self._mdd.items():
            # MDD keys use backslashes; make them relative OS paths.
            fname = toStr(b_fname).replace("\\", os.sep).lstrip(os.sep)
            yield self._glos.newDataEntry(fname, b_data)
        self._mdd = None

    def __len__(self):
        """Return the MDX entry count, or 0 (with an error log) if closed."""
        if self._mdx is not None:
            return len(self._mdx)
        log.error(
            "OctopusMdict: called len(reader) while reader is not open"
        )
        return 0

    def close(self):
        """Close the reader by resetting its state."""
        self.clear()
class Reader(object):
    """Reader for an MDict ``.mdx`` file with an optional ``.mdd`` companion.

    On ``open()`` the resources stored in the MDD file (if one exists) are
    written to a data directory on disk; iteration then yields one
    ``Entry`` per MDX item.
    """

    def __init__(self, glos):
        """Remember the glossary object and start in the closed state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset all per-file state to the "not open" defaults."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """Open the MDX at ``filename`` and extract its MDD data files.

        Recognised options: ``encoding`` (default ''), ``substyle``
        (default True) and ``resPath`` (data directory, default
        ``<filename without extension>_files``).  A failure while saving
        the data files is logged and does not abort opening.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD

        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)

        filenameNoExt = splitext(self._filename)[0]
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename

        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header[b'Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo(
            'description',
            self._mdx.header.get(b'Description', ''),
        )

        # Best effort, but only for ordinary errors: KeyboardInterrupt and
        # SystemExit must still propagate.
        try:
            self.writeDataFiles()
        except Exception:
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """Write every MDD item to a file under the data directory.

        Does nothing when no MDD file was found.  Drops the MDD object
        once all items have been written.
        """
        if not self._mdd:
            return
        if not isdir(self._dataDir):
            os.makedirs(self._dataDir)
        for key, value in self._mdd.items():
            key = toStr(key)
            # The key is appended to the data dir without a separator, so
            # it presumably starts with a backslash.
            # NOTE(review): keys come from the MDD file and are not
            # normalised; a key containing ".." could escape the data
            # dir - confirm whether inputs are trusted.
            fpath = ''.join([self._dataDir, key.replace('\\', os.path.sep)])
            parent = dirname(fpath)
            if not isdir(parent):
                os.makedirs(parent)
            log.debug('saving MDict data file: %s'%fpath)
            # "with" closes the handle even if write() raises.
            with open(fpath, 'wb') as f:
                f.write(value)
        self._mdd = None

    def __iter__(self):
        """Yield an ``Entry`` per MDX item, then mark the MDX as consumed."""
        if self._mdx is None:
            log.error('trying to iterate on a closed MDX file')
        else:
            for word, defi in self._mdx.items():
                word = toStr(word)
                defi = toStr(defi)
                yield Entry(word, defi)
            self._mdx = None

    def __len__(self):
        """Return the MDX entry count, or 0 (with an error log) if closed."""
        if self._mdx is None:
            log.error(
                'OctopusMdict: called len(reader) while reader is not open'
            )
            return 0
        return len(self._mdx)

    def close(self):
        """Close the reader by resetting its state."""
        self.clear()
class Reader(object):
    """Reader for an MDict ``.mdx`` file with an optional ``.mdd`` companion.

    On ``open()`` the resources stored in the MDD file (if one exists) are
    written to a data directory on disk; iteration yields one ``Entry``
    per MDX item.
    """

    def __init__(self, glos):
        """Remember the glossary object and start in the closed state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset all per-file state to the "not open" defaults."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """Open the MDX at ``filename`` and extract its MDD data files.

        Recognised options: ``encoding`` (default ''), ``substyle``
        (default True) and ``resPath`` (data directory, default
        ``<filename without extension>_files``).  A failure while saving
        the data files is logged and does not abort opening.
        """
        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)

        filenameNoExt = splitext(self._filename)[0]
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename

        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header['Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo(
            'description',
            self._mdx.header.get('Description', ''),
        )

        # Best effort, but only for ordinary errors: KeyboardInterrupt and
        # SystemExit must still propagate.
        try:
            self.writeDataFiles()
        except Exception:
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """Write every MDD item to a file under the data directory.

        Does nothing when no MDD file was found.
        """
        if not self._mdd:
            return
        if not isdir(self._dataDir):
            os.makedirs(self._dataDir)
        for key, value in self._mdd.items():
            # The key is appended to the data dir without a separator, so
            # it presumably starts with a backslash.
            # NOTE(review): keys come from the MDD file and are not
            # normalised; a key containing ".." could escape the data
            # dir - confirm whether inputs are trusted.
            fpath = ''.join([self._dataDir, key.replace('\\', os.path.sep)])
            parent = dirname(fpath)
            if not isdir(parent):
                os.makedirs(parent)
            log.debug('saving MDict data file: %s' % fpath)
            # "with" closes the handle even if write() raises.
            with open(fpath, 'wb') as f:
                f.write(value)

    def __iter__(self):
        """Yield an ``Entry`` per MDX item.

        There is no closed-reader check here: the reader must be open.
        """
        for word, defi in self._mdx.items():
            yield Entry(word, defi)

    def __len__(self):
        """Return the number of items in the MDX (the reader must be open)."""
        return len(self._mdx)

    def close(self):
        """Close the reader by resetting its state."""
        self.clear()