Example #1
0
 def nextPair(self):
     """Read the next tab-separated line and return a (word, defi) pair.

     Returns None for blank lines and for malformed (tab-less) lines;
     raises StopIteration when the reader is closed or the file ends.
     """
     if not self._file:
         raise StopIteration
     line = self._file.readline()
     if not line:
         raise StopIteration
     # Remove only the trailing newline: str.strip() would also eat
     # leading/trailing tabs and spaces, corrupting the word/defi split.
     line = line.rstrip("\n")
     if not line:
         return
     ###
     word, tab, defi = line.partition('\t')
     if not tab:
         log.error('Warning: line starting with "%s" has no tab!'%line[:10])
         return
     ###
     # With alternates enabled, the word field may hold several
     # bar-separated headwords; a single headword is unwrapped to str.
     if self._glos.getPref('enable_alts', True):
         word = splitByBarUnescapeNTB(word)
         if len(word)==1:
             word = word[0]
     else:
         word = unescapeNTB(word, bar=True)
     ###
     defi = unescapeNTB(defi)
     ###
     return word, defi
Example #2
0
 def nextPair(self) -> "Tuple[str, str]":
     """Parse one line from the reader into a (word, defi) pair.

     Blank lines and lines without a tab produce None; a closed reader
     or an exhausted file raises StopIteration.
     """
     if not self._file:
         raise StopIteration
     rawLine = self.readline()
     if not rawLine:
         raise StopIteration
     rawLine = rawLine.rstrip("\n")
     if not rawLine:
         return
     word, tab, defi = rawLine.partition("\t")
     if not tab:
         log.error(f"Warning: line starting with {rawLine[:10]!r} has no tab!")
         return
     if not self._glos.getConfig("enable_alts", True):
         word = unescapeNTB(word, bar=False)
     else:
         # bar-separated alternates; unwrap a single headword to str
         alts = splitByBarUnescapeNTB(word)
         word = alts[0] if len(alts) == 1 else alts
     return word, unescapeNTB(defi)
Example #3
0
	def nextPair(self) -> Tuple[str, str]:
		"""Read the next line and split it into a (word, defi) pair.

		Returns None for blank lines and for lines with no tab;
		raises StopIteration when the reader is closed or exhausted.
		"""
		if not self._file:
			raise StopIteration
		line = self._file.readline()
		if not line:
			raise StopIteration
		line = line.strip()  # This also removes the trailing newline
		if not line:
			return
		###
		word, tab, defi = line.partition("\t")
		if not tab:
			log.error(
				"Warning: line starting with \"%s\" has no tab!" % line[:10]
			)
			return
		###
		# With alternates enabled, the word field may contain several
		# bar-separated headwords; a single headword is unwrapped to str.
		if self._glos.getPref("enable_alts", True):
			word = splitByBarUnescapeNTB(word)
			if len(word) == 1:
				word = word[0]
		else:
			word = unescapeNTB(word, bar=True)
		###
		defi = unescapeNTB(defi)
		###
		return word, defi
Example #4
0
 def __next__(self):
     """Return the next Entry, following the linked list of entry files.

     Raises StopIteration when the reader is not open or when the
     terminal "END" marker is reached.
     """
     path = self._nextPath
     if not path:
         log.error('iterating over a reader which is not open')
         raise StopIteration
     if path == 'END':
         # end of the chain: reconcile the declared word count
         if self._pos != self._len:
             log.warning('%s words found, wordCount in info.json was %s'%(self._pos, self._len))
             self._len = self._pos
         raise StopIteration
     self._pos += 1
     entryFile = open(join(self._filename, path), 'r', encoding=self._encoding)
     try:
         # line 1: next path, line 2: word, rest: definition
         self._nextPath = entryFile.readline().rstrip()
         word = entryFile.readline().rstrip()
         defi = entryFile.read().rstrip()
     finally:
         entryFile.close()
     if not self._glos.getPref('enable_alts', True):
         word = unescapeNTB(word, bar=True)
     else:
         # bar-separated alternates; unwrap a single headword to str
         alts = splitByBarUnescapeNTB(word)
         word = alts[0] if len(alts) == 1 else alts
     #defi = unescapeNTB(defi)
     return Entry(word, defi)
Example #5
0
    def __iter__(self) -> Iterator[BaseEntry]:
        """Yield glossary entries by walking the linked entry files.

        Yields None as a skip/progress marker for empty entries, then
        yields a data entry for every resource file.
        """
        if not self._rootPath:
            log.error("iterating over a reader which is not open")
            # PEP 479: a StopIteration raised inside a generator is
            # converted to RuntimeError on Python 3.7+; returning ends
            # the iteration cleanly instead.
            return

        wordCount = 0
        nextPath = self._rootPath
        while nextPath != "END":
            wordCount += 1
            # before or after reading word and defi
            # (and skipping empty entry)? FIXME

            with open(
                    join(self._filename, nextPath),
                    "r",
                    encoding=self._encoding,
            ) as fromFile:
                # header holds "prevPath nextPath" or just "nextPath"
                header = fromFile.readline().rstrip()
                if self._havePrevLink:
                    self._prevPath, nextPath = header.split(" ")
                else:
                    nextPath = header
                word = fromFile.readline()
                if not word:
                    yield None  # update progressbar
                    continue
                defi = fromFile.read()
                if not defi:
                    log.warning(
                        f"Edlin Reader: no definition for word {word!r}"
                        f", skipping")
                    yield None  # update progressbar
                    continue
                word = word.rstrip()
                defi = defi.rstrip()

            # with alternates enabled, the word line may hold several
            # bar-separated headwords; unwrap a single headword to str
            if self._glos.getPref("enable_alts", True):
                word = splitByBarUnescapeNTB(word)
                if len(word) == 1:
                    word = word[0]
            else:
                word = unescapeNTB(word, bar=True)

            # defi = unescapeNTB(defi)
            yield self._glos.newEntry(word, defi)

        if wordCount != self._wordCount:
            log.warning(f"{wordCount} words found, "
                        f"wordCount in info.json was {self._wordCount}")
            self._wordCount = wordCount

        resDir = self._resDir
        for fname in self._resFileNames:
            with open(join(resDir, fname), "rb") as fromFile:
                yield self._glos.newDataEntry(
                    fname,
                    fromFile.read(),
                )
Example #6
0
 def __next__(self):
     """Advance along the linked entry files and return the next Entry.

     Returns None (a skip marker) for entries with an empty word or
     definition; raises StopIteration at the "END" marker or when the
     reader is not open.
     """
     path = self._nextPath
     if not path:
         log.error('iterating over a reader which is not open')
         raise StopIteration
     if path == 'END':
         # end of the chain: reconcile the declared word count
         if self._pos != self._len:
             log.warning('%s words found, wordCount in info.json was %s' %
                         (self._pos, self._len))
             self._len = self._pos
         raise StopIteration
     self._pos += 1  ## before or after reading word and defi (and skipping empty entry)? FIXME
     with open(join(self._filename, path),
               'r',
               encoding=self._encoding) as entryFile:
         # header holds "prevPath nextPath" or just "nextPath"
         header = entryFile.readline().rstrip()
         if not self._havePrevLink:
             self._nextPath = header
         else:
             self._prevPath, self._nextPath = header.split(' ')
         word = entryFile.readline()
         if not word:
             return
         defi = entryFile.read()
         if not defi:
             log.warning(
                 'Edlin Reader: no definition for word "%s", skipping' %
                 word)
             return
         word = word.rstrip()
         defi = defi.rstrip()
     if not self._glos.getPref('enable_alts', True):
         word = unescapeNTB(word, bar=True)
     else:
         # bar-separated alternates; unwrap a single headword to str
         alts = splitByBarUnescapeNTB(word)
         word = alts[0] if len(alts) == 1 else alts
     #defi = unescapeNTB(defi)
     return Entry(word, defi)
Example #7
0
 def __next__(self):
     """Advance along the linked entry files and return the next Entry.

     Returns None (a skip marker) for entries with an empty word or
     definition; raises StopIteration at the "END" marker or when the
     reader is not open.
     """
     if not self._nextPath:
         log.error('iterating over a reader which is not open')
         raise StopIteration
     if self._nextPath == 'END':
         # end of the chain: reconcile the declared word count
         if self._pos != self._len:
             log.warning('%s words found, wordCount in info.json was %s'%(self._pos, self._len))
             self._len = self._pos
         raise StopIteration
     ###
     self._pos += 1 ## before or after reading word and defi (and skipping empty entry)? FIXME
     ###
     with open(join(self._filename, self._nextPath), 'r', encoding=self._encoding) as fp:
         # header holds "prevPath nextPath" or just "nextPath"
         header = fp.readline().rstrip()
         if self._havePrevLink:
             self._prevPath, self._nextPath = header.split(' ')
         else:
             self._nextPath = header
         word = fp.readline()
         if not word:
             return
         defi = fp.read()
         if not defi:
             log.warning('Edlin Reader: no definition for word "%s", skipping'%word)
             return
         word = word.rstrip()
         defi = defi.rstrip()
     ###
     # with alternates enabled, the word line may hold several
     # bar-separated headwords; unwrap a single headword to str
     if self._glos.getPref('enable_alts', True):
         word = splitByBarUnescapeNTB(word)
         if len(word)==1:
             word = word[0]
     else:
         word = unescapeNTB(word, bar=True)
     ###
     #defi = unescapeNTB(defi)
     ###
     return Entry(word, defi)
Example #8
0
    def fixLinks(self, linkTargetSet):
        """Resolve and rewrite intra-glossary links in the written files.

        Builds a word -> [(filename, entryIndex), ...] map from index.txt,
        resolves every href recorded in links.txt into per-output-file
        "links{fileIndex}" buffers, then streams each output file through
        a "{filename}.new" copy with the hrefs patched in place.
        Unresolved links are marked class="broken" (pointing at the
        glossary's "entry_url" info value when one is configured).
        """
        import gc
        from cachetools import LRUCache

        gc.collect()
        dirn = self._filename

        filenameList = self._filenameList

        # map each link-target word to the places that define it
        fileByWord = {}
        for line in open(join(dirn, "index.txt"), encoding="utf-8"):
            line = line.rstrip("\n")
            if not line:
                continue
            entryIndex, wordEsc, filename, _ = line.split("\t")
            entryIndex = int(entryIndex)
            # entryId = f"entry{entryIndex}"
            word = unescapeNTB(wordEsc)
            if word not in linkTargetSet:
                continue
            if word in fileByWord:
                # log.info(f"fileByWord[{word}]={fileByWord[word]}, filename={filename}")
                fileByWord[word].append((filename, entryIndex))
            else:
                fileByWord[word] = [(filename, entryIndex)]

        # LRU-bounded cache of open per-file link buffers (limits open FDs)
        linksByFile = LRUCache(maxsize=100)

        # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile:
        # 	json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t")

        def getLinksByFile(fileIndex):
            # return (opening in append mode if needed) the buffer file
            # holding resolved links for output file number `fileIndex`
            _file = linksByFile.get(fileIndex)
            if _file is not None:
                return _file
            _file = open(
                join(dirn, f"links{fileIndex}"),
                mode="a",
                encoding="utf-8",
            )
            linksByFile[fileIndex] = _file
            return _file

        log.info("")
        for line in open(join(dirn, "links.txt"), encoding="utf-8"):
            line = line.rstrip("\n")
            if not line:
                continue
            target, fileIndex, x_start, x_size = line.split("\t")
            target = unescapeNTB(target)
            if target not in fileByWord:
                targetNew = ""
            else:
                targetFilename, targetEntryIndex = fileByWord[target][0]
                # NOTE(review): `filename` here is left over from the
                # index.txt loop above — confirm this is the intended
                # same-file check and not a stale variable.
                if targetFilename == filename:
                    continue
                targetNew = f"{targetFilename}#entry{targetEntryIndex}"
            _file = getLinksByFile(int(fileIndex))
            _file.write(f"{x_start}\t{x_size}\t{targetNew}\n")
            _file.flush()

        for _, _file in linksByFile.items():
            _file.close()
        del linksByFile

        # free the large lookup structures before the rewrite pass
        linkTargetSet.clear()
        del fileByWord, linkTargetSet
        gc.collect()

        entry_url_fmt = self._glos.getInfo("entry_url")

        re_href = re.compile(
            b' href="[^<>"]*?"',
            re.I,
        )

        for fileIndex, filename in enumerate(filenameList):
            if not isfile(join(dirn, f"links{fileIndex}")):
                continue
            with open(join(dirn, filename), mode="rb") as inFile:
                # write the patched copy next to the original, then
                # atomically replace it via os.rename below
                with open(join(dirn, f"{filename}.new"), mode="wb") as outFile:
                    for linkLine in open(join(dirn, f"links{fileIndex}"),
                                         "rb"):
                        outFile.flush()
                        linkLine = linkLine.rstrip(b"\n")
                        # x_start/x_size are hex byte offsets into inFile
                        x_start, x_size, target = linkLine.split(b"\t")
                        outFile.write(
                            inFile.read(int(x_start, 16) - inFile.tell()))
                        curLink = inFile.read(int(x_size, 16))

                        if target:
                            # resolved: point href at "./file#entryN"
                            outFile.write(
                                re_href.sub(
                                    b' href="./' + target + b'"',
                                    curLink,
                                ))
                            continue

                        if not entry_url_fmt:
                            # unresolved, no external url: mark broken
                            outFile.write(
                                curLink.replace(
                                    b' href="#',
                                    b' class="broken" href="#',
                                ))
                            continue

                        # unresolved but entry_url is configured: point
                        # the broken link at the external entry url
                        _st = curLink.decode("utf-8")
                        i = _st.find('href="#')
                        j = _st.find('"', i + 7)
                        word = _st[i + 7:j]
                        url = entry_url_fmt.format(word=word)
                        outFile.write(
                            (_st[:i] + f'class="broken" href="{url}"' +
                             _st[j + 1:]).encode("utf-8"))

                    outFile.write(inFile.read())

            os.rename(join(dirn, f"{filename}.new"), join(dirn, filename))
            os.remove(join(dirn, f"links{fileIndex}"))
Example #9
0
	def fixLinks(self, linkTargetSet):
		"""Resolve and rewrite intra-glossary links in the written files.

		Builds a word -> filename map from index.txt, resolves every
		href recorded in links.txt into per-output-file "links{N}"
		buffers, then streams each output file through a
		"{filename}.new" copy with the hrefs patched in place.
		Unresolved links are marked class="broken" (pointing at the
		glossary's "entry_url" info value when one is configured).
		"""
		import gc
		from cachetools import LRUCache

		gc.collect()
		dirn = self._filename

		filenameList = self._filenameList

		# map each link-target word to the file that defines it
		fileByWord = {}
		for line in open(join(dirn, "index.txt"), encoding="utf-8"):
			line = line.rstrip("\n")
			if not line:
				continue
			word, filename, _ = line.split("\t")
			word = unescapeNTB(word)
			if word not in linkTargetSet:
				continue
			fileByWord[word] = filename

		# LRU-bounded cache of open per-file link buffers (limits open FDs)
		linksByFile = LRUCache(maxsize=100)

		def getLinksByFile(fileIndex):
			# return (opening in append mode if needed) the buffer file
			# holding resolved links for output file number `fileIndex`
			_file = linksByFile.get(fileIndex)
			if _file is not None:
				return _file
			_file = open(
				join(dirn, f"links{fileIndex}"),
				mode="a",
				encoding="utf-8",
			)
			linksByFile[fileIndex] = _file
			return _file

		log.info("")
		for line in open(join(dirn, "links.txt"), encoding="utf-8"):
			line = line.rstrip("\n")
			if not line:
				continue
			target, fileIndex, x_start, x_size = line.split("\t")
			target = unescapeNTB(target)
			if target not in fileByWord:
				targetFilename = ""
			else:
				targetFilename = fileByWord[target]
				# NOTE(review): `filename` here is left over from the
				# index.txt loop above — confirm this is the intended
				# same-file check and not a stale variable.
				if targetFilename == filename:
					continue
			_file = getLinksByFile(int(fileIndex))
			_file.write(
				f"{x_start}\t{x_size}\t{targetFilename}\n"
			)
			_file.flush()

		for _, _file in linksByFile.items():
			_file.close()
		del linksByFile

		# free the large lookup structures before the rewrite pass
		linkTargetSet.clear()
		del fileByWord, linkTargetSet
		gc.collect()

		entry_url_fmt = self._glos.getInfo("entry_url")

		for fileIndex, filename in enumerate(filenameList):
			with open(join(dirn, filename), mode="rb") as inFile:
				# write the patched copy next to the original, then
				# atomically replace it via os.rename below
				with open(join(dirn, f"{filename}.new"), mode="wb") as outFile:
					for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"):
						outFile.flush()
						linkLine = linkLine.rstrip(b"\n")
						# x_start/x_size are hex byte offsets into inFile
						x_start, x_size, targetFilename = linkLine.split(b"\t")
						outFile.write(inFile.read(
							int(x_start, 16) - inFile.tell()
						))
						curLink = inFile.read(int(x_size, 16))

						if targetFilename:
							# resolved: point href at "./file#anchor"
							outFile.write(curLink.replace(
								b' href="#',
								b' href="./' + targetFilename + b'#',
							))
							continue

						if not entry_url_fmt:
							# unresolved, no external url: mark broken
							outFile.write(curLink.replace(
								b' href="#',
								b' class="broken" href="#',
							))
							continue

						# unresolved but entry_url is configured: point
						# the broken link at the external entry url
						_st = curLink.decode("utf-8")
						i = _st.find('href="#')
						j = _st.find('"', i + 7)
						word = _st[i + 7:j]
						url = entry_url_fmt.format(word=word)
						outFile.write((
							_st[:i] +
							f'class="broken" href="{url}"' +
							_st[j + 1:]
						).encode("utf-8"))

					outFile.write(inFile.read())

			os.rename(join(dirn, f"{filename}.new"), join(dirn, filename))
			os.remove(join(dirn, f"links{fileIndex}"))
Example #10
0
	def __iter__(self):
		"""Yield glossary entries by walking the linked entry files.

		Yields None as a skip/progress marker for empty entries, then
		yields a data entry for every resource file.
		"""
		if not self._rootPath:
			log.error("iterating over a reader which is not open")
			# PEP 479: a StopIteration raised inside a generator is
			# converted to RuntimeError on Python 3.7+; returning ends
			# the iteration cleanly instead.
			return

		wordCount = 0
		nextPath = self._rootPath
		while nextPath != "END":
			wordCount += 1
			# before or after reading word and defi
			# (and skipping empty entry)? FIXME

			with open(
				join(self._filename, nextPath),
				"r",
				encoding=self._encoding,
			) as fromFile:
				# header holds "prevPath nextPath" or just "nextPath"
				header = fromFile.readline().rstrip()
				if self._havePrevLink:
					self._prevPath, nextPath = header.split(" ")
				else:
					nextPath = header
				word = fromFile.readline()
				if not word:
					yield None  # update progressbar
					continue
				defi = fromFile.read()
				if not defi:
					log.warning(
						"Edlin Reader: no definition for word %r" % word +
						", skipping"
					)
					yield None  # update progressbar
					continue
				word = word.rstrip()
				defi = defi.rstrip()

			# with alternates enabled, the word line may hold several
			# bar-separated headwords; unwrap a single headword to str
			if self._glos.getPref("enable_alts", True):
				word = splitByBarUnescapeNTB(word)
				if len(word) == 1:
					word = word[0]
			else:
				word = unescapeNTB(word, bar=True)

			# defi = unescapeNTB(defi)
			yield self._glos.newEntry(word, defi)

		if wordCount != self._wordCount:
			log.warning(
				"%s words found, " % wordCount +
				"wordCount in info.json was %s" % self._wordCount
			)
			self._wordCount = wordCount

		resDir = self._resDir
		for fname in self._resFileNames:
			with open(join(resDir, fname), "rb") as fromFile:
				yield self._glos.newDataEntry(
					fname,
					fromFile.read(),
				)