Esempio n. 1
0
class Reader(object):
	"""
	Glossary reader for an MDX file and the MDD data files next to it.

	Entries whose definition starts with ``@@@LINK=`` are never yielded on
	their own: their headword is recorded as an alternate of the word named
	after the ``=`` sign, and is attached to that word's entry on iteration.
	"""
	# passed as the 2nd and 3rd positional arguments to MDX() in open()
	_encoding: str = ""
	_substyle: bool = True

	def __init__(self, glos):
		"""Remember the glossary object and start from a cleared state."""
		self._glos = glos
		self.clear()
		# matches the start of an href that points at "entry://", "d:" or "x:"
		self._re_internal_link = re.compile('href=(["\'])(entry://|[dx]:)')

	def clear(self):
		"""Reset every piece of per-file state to its empty value."""
		self._filename = ""
		self._mdx = None
		self._mdd = []
		self._wordCount = 0
		self._dataEntryCount = 0

		# dict of mainWord -> newline-separated alternatives
		self._linksDict = {}  # type: Dict[str, str]

	def open(self, filename):
		"""
		Open the MDX file ``filename`` and every MDD file found beside it.

		Multiple MDD files are supported with this naming schema:
			FILE.mdx
			FILE.mdd
			FILE.1.mdd
			FILE.2.mdd
			FILE.3.mdd

		The header's "Title" (if the key exists) and "Description" (if
		non-empty) are stored in the glossary info as "name" and
		"description". Finally loadLinks() is run to build the link table.
		"""
		from pyglossary.plugin_lib.readmdict import MDX, MDD
		self._filename = filename
		self._mdx = MDX(filename, self._encoding, self._substyle)

		filenameNoExt, _ = splitext(self._filename)
		mddBase = "".join([filenameNoExt, extsep])
		for fname in (f"{mddBase}mdd", f"{mddBase}1.mdd"):
			if isfile(fname):
				self._mdd.append(MDD(fname))
		# numbered files from 2 upward are picked up until the first gap
		mddN = 2
		while isfile(f"{mddBase}{mddN}.mdd"):
			self._mdd.append(MDD(f"{mddBase}{mddN}.mdd"))
			mddN += 1

		dataEntryCount = sum(len(mdd) for mdd in self._mdd)
		self._dataEntryCount = dataEntryCount
		log.info(f"Found {len(self._mdd)} mdd files with {dataEntryCount} entries")

		log.debug("mdx.header = " + pformat(self._mdx.header))
		try:
			title = self._mdx.header[b"Title"]
		except KeyError:
			pass
		else:
			self._glos.setInfo("name", title)
		desc = self._mdx.header.get(b"Description", "")
		if desc:
			self._glos.setInfo("description", desc)

		self.loadLinks()

	def loadLinks(self):
		"""
		Make one full pass over the MDX to collect ``@@@LINK=`` entries.

		Fills ``self._linksDict`` (target word -> newline-separated
		alternates) and ``self._wordCount`` (number of non-link entries),
		then creates a new MDX object for the same file.
		"""
		from pyglossary.plugin_lib.readmdict import MDX
		log.info("extracting links...")
		linksDict = {}
		wordCount = 0
		for b_word, b_defi in self._mdx.items():
			word = b_word.decode("utf-8")
			defi = b_defi.decode("utf-8").strip()
			if not defi.startswith("@@@LINK="):
				wordCount += 1
				continue
			if not word:
				# Logger.warn is a deprecated alias; use warning()
				log.warning(f"unexpected defi: {defi}")
				continue
			mainWord = defi[len("@@@LINK="):]
			if mainWord in linksDict:
				linksDict[mainWord] += "\n" + word
			else:
				linksDict[mainWord] = word

		log.info(
			"extracting links done, "
			f"sizeof(linksDict)={sys.getsizeof(linksDict)}"
		)
		log.info(f"wordCount = {wordCount}")
		self._linksDict = linksDict
		self._wordCount = wordCount
		# presumably re-created so that __iter__ reads from a fresh object
		# after the pass above -- TODO confirm against MDX.items()
		self._mdx = MDX(self._filename, self._encoding, self._substyle)

	def __iter__(self):
		"""
		Yield one entry per non-link MDX record, then one data entry per
		MDD record.

		Internal links (``entry://``, ``d:``, ``x:``) in hrefs are rewritten
		to ``bword://`` and a leading ``file://`` is dropped from ``src``
		attributes. The MDX object, the link table and the MDD list are
		released as soon as they have been consumed.
		"""
		if self._mdx is None:
			log.error("trying to iterate on a closed MDX file")
			return

		glos = self._glos
		linksDict = self._linksDict
		for b_word, b_defi in self._mdx.items():
			word = b_word.decode("utf-8")
			defi = b_defi.decode("utf-8").strip()
			if defi.startswith("@@@LINK="):
				continue
			defi = self._re_internal_link.sub(r'href=\1bword://', defi)
			defi = defi.replace(' src="file://', ' src="')
			words = word
			altsStr = linksDict.get(word, "")
			if altsStr:
				words = [word] + altsStr.split("\n")
			yield glos.newEntry(words, defi)

		# drop the big structures before moving on to the data files
		self._mdx = None
		del linksDict
		self._linksDict = {}
		gc.collect()

		for mdd in self._mdd:
			try:
				for b_fname, b_data in mdd.items():
					fname = toStr(b_fname)
					fname = fname.replace("\\", os.sep).lstrip(os.sep)
					yield glos.newDataEntry(fname, b_data)
			except Exception:
				# best effort: a broken MDD must not abort the others
				log.exception(f"Error reading {mdd.filename}")
		self._mdd = []

	def __len__(self):
		"""Return the non-link word count plus the MDD entry count."""
		return self._wordCount + self._dataEntryCount

	def close(self):
		"""Forget the opened file by resetting all state."""
		self.clear()
Esempio n. 2
0
class Reader(object):
    """Glossary reader for an MDX file plus any MDD data files beside it."""

    def __init__(self, glos):
        """Keep a reference to the glossary and start with empty state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Put every per-file attribute back to its initial value."""
        self._mdx = None
        self._mdd = []
        self._filename = ""
        self._encoding = ""
        self._substyle = True

    def open(self, filename, encoding="", substyle=True):
        """
        Open ``filename`` as MDX and collect the MDD files next to it.

        Looked-up data files are FILE.mdd and FILE.1.mdd, followed by
        FILE.2.mdd, FILE.3.mdd, ... until the first missing number.
        The header's "Title" and "Description" are copied to the glossary
        info as "name" and "description" when available.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD
        self._filename = filename
        self._encoding = encoding
        self._substyle = substyle
        self._mdx = MDX(filename, encoding, substyle)

        stem = splitext(filename)[0]
        prefix = stem + extsep
        for suffix in ("mdd", "1.mdd"):
            candidate = prefix + suffix
            if isfile(candidate):
                self._mdd.append(MDD(candidate))
        index = 2
        while True:
            candidate = f"{prefix}{index}.mdd"
            if not isfile(candidate):
                break
            self._mdd.append(MDD(candidate))
            index += 1
        log.info(f"Found {len(self._mdd)} mdd files")

        header = self._mdx.header
        log.debug("mdx.header = " + pformat(header))
        if b"Title" in header:
            self._glos.setInfo("name", header[b"Title"])
        description = header.get(b"Description", "")
        if description:
            self._glos.setInfo("description", description)

    def __iter__(self):
        """
        Yield the MDX entries first, then one data entry per MDD record.

        A closed MDX is only logged as an error; the MDD files are still
        walked afterwards. Both sources are released once consumed.
        """
        glos = self._glos
        mdx = self._mdx
        if mdx is not None:
            for raw_word, raw_defi in mdx.items():
                yield glos.newEntry(toStr(raw_word), toStr(raw_defi))
            self._mdx = None
        else:
            log.error("trying to iterate on a closed MDX file")

        for data_file in self._mdd:
            for raw_name, payload in data_file.items():
                # MDD names use backslashes; make them relative OS paths
                rel_path = toStr(raw_name).replace("\\", os.sep)
                rel_path = rel_path.lstrip(os.sep)
                yield glos.newDataEntry(rel_path, payload)
        self._mdd = []

    def __len__(self):
        """Return ``len()`` of the MDX object, or 0 (with an error log) if closed."""
        if self._mdx is not None:
            return len(self._mdx)
        log.error(
            "OctopusMdict: called len(reader) while reader is not open")
        return 0

    def close(self):
        """Drop all state; equivalent to clear()."""
        self.clear()
Esempio n. 3
0
class Reader(object):
    """
    Glossary reader for an MDX file.

    If a FILE.mdd sits next to FILE.mdx, its records are written to disk
    under a data directory while the reader is being opened.
    """

    def __init__(self, glos):
        """Keep a reference to the glossary and start with empty state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset every per-file attribute to its initial value."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """
        Open ``filename`` as MDX and extract the companion MDD, if any.

        Recognised options:
            encoding: passed to MDX(), default ''
            substyle: passed to MDX(), default True
            resPath: directory for extracted data files,
                default is FILE + '_files'

        Failures while writing data files are logged, not raised.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD
        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)
        ###
        filenameNoExt, _ = splitext(self._filename)
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename
        ###
        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header[b'Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo('description',
                           self._mdx.header.get(b'Description', ''))
        ###
        try:
            self.writeDataFiles()
        except Exception:
            # best effort, but let KeyboardInterrupt/SystemExit propagate
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """
        Write every record of the opened MDD below ``self._dataDir``.

        Does nothing when no MDD was found. Backslashes in record names
        are turned into the OS path separator and missing directories are
        created. The MDD object is released afterwards.
        """
        if not self._mdd:
            return
        os.makedirs(self._dataDir, exist_ok=True)
        for key, value in self._mdd.items():
            key = toStr(key)
            # NOTE(review): key comes straight from the MDD file and is
            # appended to the data dir unchecked (no ".." filtering) --
            # confirm this is acceptable for untrusted dictionaries
            fpath = ''.join([self._dataDir,
                             key.replace('\\', os.path.sep)])
            os.makedirs(dirname(fpath), exist_ok=True)
            log.debug('saving MDict data file: %s' % fpath)
            # "with" closes the handle even when write() raises
            with open(fpath, 'wb') as f:
                f.write(value)
        self._mdd = None

    def __iter__(self):
        """Yield an Entry per MDX record, then release the MDX object."""
        if self._mdx is None:
            log.error('trying to iterate on a closed MDX file')
        else:
            for word, defi in self._mdx.items():
                word = toStr(word)
                defi = toStr(defi)
                yield Entry(word, defi)
            self._mdx = None

    def __len__(self):
        """Return ``len()`` of the MDX object, or 0 (with an error log) if closed."""
        if self._mdx is None:
            log.error(
                'OctopusMdict: called len(reader) while reader is not open')
            return 0
        return len(self._mdx)

    def close(self):
        """Drop all state; equivalent to clear()."""
        self.clear()
Esempio n. 4
0
class Reader(object):
    """
    Glossary reader for an MDX file.

    If a FILE.mdd sits next to FILE.mdx, its records are written to disk
    under a data directory while the reader is being opened.
    """

    def __init__(self, glos):
        """Keep a reference to the glossary and start with empty state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset every per-file attribute to its initial value."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """
        Open ``filename`` as MDX and extract the companion MDD, if any.

        Recognised options:
            encoding: passed to MDX(), default ''
            substyle: passed to MDX(), default True
            resPath: directory for extracted data files,
                default is FILE + '_files'

        Failures while writing data files are logged, not raised.
        """
        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)
        ###
        filenameNoExt, _ = splitext(self._filename)
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename
        ###
        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header['Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo('description', self._mdx.header.get('Description', ''))
        ###
        try:
            self.writeDataFiles()
        except Exception:
            # best effort, but let KeyboardInterrupt/SystemExit propagate
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """
        Write every record of the opened MDD below ``self._dataDir``.

        Does nothing when no MDD was found. Backslashes in record names
        are turned into the OS path separator and missing directories are
        created.
        """
        if not self._mdd:
            return
        if not isdir(self._dataDir):
            os.makedirs(self._dataDir)
        for key, value in self._mdd.items():
            # NOTE(review): key comes straight from the MDD file and is
            # appended to the data dir unchecked (no ".." filtering) --
            # confirm this is acceptable for untrusted dictionaries
            fpath = ''.join([self._dataDir, key.replace('\\', os.path.sep)])
            if not isdir(dirname(fpath)):
                os.makedirs(dirname(fpath))
            log.debug('saving MDict data file: %s' % fpath)
            # "with" closes the handle even when write() raises
            with open(fpath, 'wb') as f:
                f.write(value)

    def __iter__(self):
        """Yield an Entry per MDX record; log an error if not open."""
        if self._mdx is None:
            # a cleared/closed reader used to fail with AttributeError here
            log.error('trying to iterate on a closed MDX file')
            return
        for word, defi in self._mdx.items():
            yield Entry(word, defi)

    def __len__(self):
        """Return ``len()`` of the MDX object, or 0 (with an error log) if closed."""
        if self._mdx is None:
            # len(None) would raise TypeError; report and return 0 instead
            log.error(
                'OctopusMdict: called len(reader) while reader is not open')
            return 0
        return len(self._mdx)

    def close(self):
        """Drop all state; equivalent to clear()."""
        self.clear()
Esempio n. 5
0
class Reader(object):
	"""Glossary reader for an MDX file and an optional FILE.mdd beside it."""

	def __init__(self, glos):
		"""Keep a reference to the glossary and start with empty state."""
		self._glos = glos
		self.clear()

	def clear(self):
		"""Put every per-file attribute back to its initial value."""
		self._mdx = None
		self._mdd = None
		self._mddFilename = ""
		self._filename = ""
		self._encoding = ""
		self._substyle = True

	def open(self, filename, **options):
		"""
		Open ``filename`` as MDX and, when present, the matching MDD.

		Recognised options are "encoding" (default "") and "substyle"
		(default True); both are handed to MDX(). The header's "Title" is
		stored as glossary info "title" when the key exists, and
		"description" is always set (to "" when the header has none).
		"""
		from pyglossary.plugin_lib.readmdict import MDX, MDD
		encoding = options.get("encoding", "")
		substyle = options.get("substyle", True)
		self._filename = filename
		self._encoding = encoding
		self._substyle = substyle
		self._mdx = MDX(filename, encoding, substyle)

		stem = splitext(filename)[0]
		mddPath = stem + extsep + "mdd"
		if isfile(mddPath):
			self._mdd = MDD(mddPath)
			self._mddFilename = mddPath

		header = self._mdx.header
		log.pretty(header, "mdx.header=")
		if b"Title" in header:
			self._glos.setInfo("title", header[b"Title"])
		description = header.get(b"Description", "")
		self._glos.setInfo("description", description)

	def __iter__(self):
		"""
		Yield the MDX entries first, then one data entry per MDD record.

		A closed MDX is only logged as an error; the MDD, if any, is still
		walked afterwards. Both sources are released once consumed.
		"""
		glos = self._glos
		mdx = self._mdx
		if mdx is not None:
			for rawWord, rawDefi in mdx.items():
				yield glos.newEntry(toStr(rawWord), toStr(rawDefi))
			self._mdx = None
		else:
			log.error("trying to iterate on a closed MDX file")

		if not self._mdd:
			return
		for rawName, payload in self._mdd.items():
			# MDD names use backslashes; make them relative OS paths
			relPath = toStr(rawName).replace("\\", os.sep)
			relPath = relPath.lstrip(os.sep)
			yield glos.newDataEntry(relPath, payload)
		self._mdd = None

	def __len__(self):
		"""Return ``len()`` of the MDX object, or 0 (with an error log) if closed."""
		if self._mdx is not None:
			return len(self._mdx)
		log.error(
			"OctopusMdict: called len(reader) while reader is not open"
		)
		return 0

	def close(self):
		"""Drop all state; equivalent to clear()."""
		self.clear()
Esempio n. 6
0
class Reader(object):
    """
    Glossary reader for an MDX file.

    If a FILE.mdd sits next to FILE.mdx, its records are written to disk
    under a data directory while the reader is being opened.
    """

    def __init__(self, glos):
        """Keep a reference to the glossary and start with empty state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset every per-file attribute to its initial value."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """
        Open ``filename`` as MDX and extract the companion MDD, if any.

        Recognised options:
            encoding: passed to MDX(), default ''
            substyle: passed to MDX(), default True
            resPath: directory for extracted data files,
                default is FILE + '_files'

        Failures while writing data files are logged, not raised.
        """
        from pyglossary.plugin_lib.readmdict import MDX, MDD
        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)
        ###
        filenameNoExt, _ = splitext(self._filename)
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename
        ###
        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header[b'Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo('description', self._mdx.header.get(b'Description', ''))
        ###
        try:
            self.writeDataFiles()
        except Exception:
            # best effort, but let KeyboardInterrupt/SystemExit propagate
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """
        Write every record of the opened MDD below ``self._dataDir``.

        Does nothing when no MDD was found. Backslashes in record names
        are turned into the OS path separator and missing directories are
        created. The MDD object is released afterwards.
        """
        if not self._mdd:
            return
        os.makedirs(self._dataDir, exist_ok=True)
        for key, value in self._mdd.items():
            key = toStr(key)
            # NOTE(review): key comes straight from the MDD file and is
            # appended to the data dir unchecked (no ".." filtering) --
            # confirm this is acceptable for untrusted dictionaries
            fpath = ''.join([self._dataDir, key.replace('\\', os.path.sep)])
            os.makedirs(dirname(fpath), exist_ok=True)
            log.debug('saving MDict data file: %s'%fpath)
            # "with" closes the handle even when write() raises
            with open(fpath, 'wb') as f:
                f.write(value)
        self._mdd = None

    def __iter__(self):
        """Yield an Entry per MDX record, then release the MDX object."""
        if self._mdx is None:
            log.error('trying to iterate on a closed MDX file')
        else:
            for word, defi in self._mdx.items():
                word = toStr(word)
                defi = toStr(defi)
                yield Entry(word, defi)
            self._mdx = None

    def __len__(self):
        """Return ``len()`` of the MDX object, or 0 (with an error log) if closed."""
        if self._mdx is None:
            log.error(
                'OctopusMdict: called len(reader) while reader is not open'
            )
            return 0
        return len(self._mdx)

    def close(self):
        """Drop all state; equivalent to clear()."""
        self.clear()
Esempio n. 7
0
class Reader(object):
    """
    Glossary reader for an MDX file.

    If a FILE.mdd sits next to FILE.mdx, its records are written to disk
    under a data directory while the reader is being opened.
    """

    def __init__(self, glos):
        """Keep a reference to the glossary and start with empty state."""
        self._glos = glos
        self.clear()

    def clear(self):
        """Reset every per-file attribute to its initial value."""
        self._filename = ''
        self._encoding = ''
        self._substyle = True
        self._mdx = None
        self._mdd = None
        self._mddFilename = ''
        self._dataDir = ''

    def open(self, filename, **options):
        """
        Open ``filename`` as MDX and extract the companion MDD, if any.

        Recognised options:
            encoding: passed to MDX(), default ''
            substyle: passed to MDX(), default True
            resPath: directory for extracted data files,
                default is FILE + '_files'

        Failures while writing data files are logged, not raised.
        """
        self._filename = filename
        self._encoding = options.get('encoding', '')
        self._substyle = options.get('substyle', True)
        self._mdx = MDX(filename, self._encoding, self._substyle)
        ###
        filenameNoExt, _ = splitext(self._filename)
        self._dataDir = options.get('resPath', filenameNoExt + '_files')
        mddFilename = ''.join([filenameNoExt, extsep, 'mdd'])
        if isfile(mddFilename):
            self._mdd = MDD(mddFilename)
            self._mddFilename = mddFilename
        ###
        log.pretty(self._mdx.header, 'mdx.header=')
        try:
            title = self._mdx.header['Title']
        except KeyError:
            pass
        else:
            self._glos.setInfo('title', title)
        self._glos.setInfo('description',
                           self._mdx.header.get('Description', ''))
        ###
        try:
            self.writeDataFiles()
        except Exception:
            # best effort, but let KeyboardInterrupt/SystemExit propagate
            log.exception('error while saving MDict data files')

    def writeDataFiles(self):
        """
        Write every record of the opened MDD below ``self._dataDir``.

        Does nothing when no MDD was found. Backslashes in record names
        are turned into the OS path separator and missing directories are
        created.
        """
        if not self._mdd:
            return
        if not isdir(self._dataDir):
            os.makedirs(self._dataDir)
        for key, value in self._mdd.items():
            # NOTE(review): key comes straight from the MDD file and is
            # appended to the data dir unchecked (no ".." filtering) --
            # confirm this is acceptable for untrusted dictionaries
            fpath = ''.join([self._dataDir,
                             key.replace('\\', os.path.sep)])
            if not isdir(dirname(fpath)):
                os.makedirs(dirname(fpath))
            log.debug('saving MDict data file: %s' % fpath)
            # "with" closes the handle even when write() raises
            with open(fpath, 'wb') as f:
                f.write(value)

    def __iter__(self):
        """Yield an Entry per MDX record; log an error if not open."""
        if self._mdx is None:
            # a cleared/closed reader used to fail with AttributeError here
            log.error('trying to iterate on a closed MDX file')
            return
        for word, defi in self._mdx.items():
            yield Entry(word, defi)

    def __len__(self):
        """Return ``len()`` of the MDX object, or 0 (with an error log) if closed."""
        if self._mdx is None:
            # len(None) would raise TypeError; report and return 0 instead
            log.error(
                'OctopusMdict: called len(reader) while reader is not open')
            return 0
        return len(self._mdx)

    def close(self):
        """Drop all state; equivalent to clear()."""
        self.clear()