Beispiel #1
0
    def write(
        self,
        filename: str,
        resources: bool = True,
    ) -> Generator[None, "BaseEntry", None]:
        glos = self._glos
        fileObj = open(filename, "w", encoding="utf-8")
        title = glos.getInfo("name")
        author = glos.getInfo("author")
        # didn't find any tag for author in existing glossaries
        publisher = glos.getInfo("publisher")
        copyright = glos.getInfo("copyright")
        creationTime = glos.getInfo("creationTime")

        fileObj.write(f"""<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
	<title>{title}</title>
	<respStmt><resp>converted with</resp><name>PyGlossary</name></respStmt>
</titleStmt>
<publicationStmt>
	<author>{author}</author>
	<publisher>{publisher}</publisher>
	<availability><p>{copyright}</p></availability>
	<date>{creationTime}</date>
</publicationStmt>
<sourceDesc><p>{filename}</p></sourceDesc>
</fileDesc>
</teiHeader>
<text><body>""")

        while True:
            entry = yield
            if entry is None:
                break
            if entry.isData():
                if resources:
                    entry.save(f"{filename}_res")
                continue
            word = xml_escape(entry.s_word)
            defi = xml_escape(entry.defi)
            fileObj.write(f"""<entry>
<form><orth>{word}</orth></form>
<trans><tr>{defi}</tr></trans>
</entry>""")
        fileObj.write("</body></text></TEI>")
        fileObj.close()
Beispiel #2
0
def write(glos, filename):
    """
    Write the glossary to `filename` as a `<words>` XML document.

    The document starts with an `<xfardic>` header holding one element per
    key in the module-level `infoKeys`, followed by one `<word>` element per
    entry: `<in>` for the first word, `<alt>` for each further word and
    `<out>` for the definition.

    Args:
        glos: object providing `getInfo(key)` and iteration over entries;
            every entry must provide `getWords()` and `getDefi()`.
        filename: path of the XML file to create (written as UTF-8).
    """
    # Text mode with an explicit encoding: writing `str` to a file opened
    # with "wb" raises TypeError on Python 3. newline="\n" keeps the line
    # endings exactly as written, as the binary mode did.
    with open(filename, "w", encoding="utf-8", newline="\n") as fp:
        fp.write('<?xml version="1.0" encoding="utf-8" ?>\n<words>\n<xfardic>')
        for item in infoKeys:
            # info values are free text, so escape them like entry text
            value = xml_escape(str(glos.getInfo(item)))
            fp.write(f"<{item}>{value}</{item}>")
        fp.write("</xfardic>\n")
        for entry in glos:
            words = entry.getWords()
            word, alts = words[0], words[1:]
            defi = entry.getDefi()
            fp.write(f"<word>\n    <in>{xml_escape(word)}</in>\n")
            for alt in alts:
                fp.write(f"    <alt>{xml_escape(alt)}</alt>\n")
            fp.write(f"    <out>{xml_escape(defi)}</out>\n</word>\n")
        fp.write("</words>\n")
Beispiel #3
0
def write(
    glos: GlossaryType,
    filename: str,
    resources: bool = True,
):
    """
    Write the glossary to `filename` as a TEI (`<tei.2>`) XML document.

    Args:
        glos: glossary providing `getInfo(key)` and iteration over entries.
        filename: path of the XML file to create (written as UTF-8).
        resources: when true, data (resource) entries are saved under the
            path `filename + "_res"`; when false they are skipped.
    """

    def info(key: str) -> str:
        # Metadata ends up inside XML text nodes, so it needs the same
        # escaping as the entry text below.
        return xml_escape(str(glos.getInfo(key)))

    title = info("name")
    # NOTE(review): the <publisher> element is filled from the "author"
    # info key, as in the original code -- confirm this is intended.
    publisher = info("author")
    copyright = info("copyright")
    creationTime = info("creationTime")
    sourceName = xml_escape(filename)

    # `with` guarantees the file is closed even if an entry raises.
    with open(filename, "w", encoding="utf-8") as fp:
        # A single "%" in the ENTITY declaration: "%%" was an escape for
        # %-formatting and would be emitted literally by an f-string.
        fp.write(f"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE TEI.2 PUBLIC "-//TEI P3//DTD Main Document Type//EN"
"/usr/share/sgml/tei-3/tei2.dtd" [
<!ENTITY % TEI.dictionaries "INCLUDE" > ]>
<tei.2>
<teiHeader>
<fileDesc>
<titleStmt>
	<title>{title}</title>
	<respStmt><resp>converted with</resp><name>PyGlossary</name></respStmt>
</titleStmt>
<publicationStmt>
	<publisher>{publisher}</publisher>
	<availability><p>{copyright}</p></availability>
	<date>{creationTime}</date>
</publicationStmt>
<sourceDesc><p>{sourceName}</p></sourceDesc>
</fileDesc>
</teiHeader>
<text><body>""")

        for entry in glos:
            if entry.isData():
                if resources:
                    entry.save(filename + "_res")
                continue
            word = xml_escape(entry.getWord())
            defi = xml_escape(entry.getDefi())
            fp.write(f"""<entry>
<form><orth>{word}</orth></form>
<trans><tr>{defi}</tr></trans>
</entry>""")
        fp.write("</body></text></tei.2>")
Beispiel #4
0
def write(glos, filename):
    """
    Write the glossary to `filename` as a `<words>` XML document.

    Output layout: an `<xfardic>` header with one element per key of the
    module-level `infoKeys`, then one `<word>` element per entry containing
    `<in>` (first word), zero or more `<alt>` (remaining words) and `<out>`
    (definition).

    Args:
        glos: object providing `getInfo(key)` and iteration over entries;
            every entry must provide `getWords()` and `getDefi()`.
        filename: path of the XML file to create (written as UTF-8).
    """
    # The file used to be opened with 'wb', but everything written is `str`,
    # which raises TypeError on Python 3. Use text mode with an explicit
    # encoding; newline="\n" prevents platform newline translation so the
    # bytes on disk match what binary mode produced.
    with open(filename, "w", encoding="utf-8", newline="\n") as fp:
        fp.write('<?xml version="1.0" encoding="utf-8" ?>\n<words>\n<xfardic>')
        for item in infoKeys:
            # info values are free text and must be escaped for XML
            value = xml_escape(str(glos.getInfo(item)))
            fp.write("<" + item + ">" + value + "</" + item + ">")
        fp.write("</xfardic>\n")
        for entry in glos:
            words = entry.getWords()
            word, alts = words[0], words[1:]
            defi = entry.getDefi()
            fp.write("<word>\n    <in>%s</in>\n" % xml_escape(word))
            for alt in alts:
                fp.write("    <alt>%s</alt>\n" % xml_escape(alt))
            fp.write("    <out>%s</out>\n</word>\n" % xml_escape(defi))
        fp.write("</words>\n")
Beispiel #5
0
def replaceHtmlEntryCB(u_match):
    """
    Substitution callback: resolve one matched HTML entity and escape it.

    u_match: regular expression match object for the entity

    Delegates to replaceHtmlEntryNoEscapeCB. When that returns the matched
    text unchanged (conversion failed), the text is returned as is;
    otherwise the converted text is passed through xml_escape
    (only <, >, & characters are escaped).
    """
    converted = replaceHtmlEntryNoEscapeCB(u_match)
    conversionFailed = u_match.group(0) == converted
    return converted if conversionFailed else xml_escape(converted)
Beispiel #6
0
def replaceHtmlEntryCB(u_match):
	"""
	Escaping variant of replaceHtmlEntryNoEscapeCB.

	u_match: regular expression match object for one HTML entity

	The replacement text comes from replaceHtmlEntryNoEscapeCB. It is run
	through xml_escape (only <, >, & characters are escaped), unless the
	conversion failed, in which case the matched text is returned untouched.
	"""
	replacement = replaceHtmlEntryNoEscapeCB(u_match)
	if u_match.group(0) != replacement:
		return xml_escape(replacement)
	# conversion failed: the entity text came back unchanged
	return replacement
Beispiel #7
0
	def processDefi(self, b_defi, b_key):
		"""
		Decode the fields of a raw definition and render them as one string.

		b_defi: bytes, raw definition data handed to collectDefiFields
		b_key: bytes, key of the entry (used for field collection,
			statistics and debug messages)

		return: u_defi_format, a str built from these parts, in this order,
			each one only when it is non-empty:
			- part of speech inside a <font> tag colored with
			  self.partOfSpeechColor, and/or the title, followed by <br>
			- title translation, followed by <br>
			- transcription from field 50, in square brackets
			- transcription from field 60, in square brackets
			- the definition body
		"""

		def decodeSourceField(b_text):
			# Shared pipeline for the auxiliary fields: decode with the
			# source encoding, resolve HTML entities, drop control chars.
			# The "single encoding" flag of decodeCharsetTags is not needed.
			u_text, _ = self.decodeCharsetTags(b_text, self.sourceEncoding)
			u_text = replaceHtmlEntries(u_text)
			return removeControlChars(u_text)

		fields = DefinitionFields()
		self.collectDefiFields(b_defi, b_key, fields)

		# The definition body is the only field decoded with targetEncoding.
		fields.u_defi, fields.singleEncoding = self.decodeCharsetTags(
			fields.b_defi,
			self.targetEncoding,
		)
		if fields.singleEncoding:
			fields.encoding = self.targetEncoding
		fields.u_defi = fixImgLinks(fields.u_defi)
		fields.u_defi = replaceHtmlEntries(fields.u_defi)
		fields.u_defi = removeControlChars(fields.u_defi)
		fields.u_defi = normalizeNewlines(fields.u_defi)
		fields.u_defi = fields.u_defi.strip()

		if fields.b_title:
			fields.u_title = decodeSourceField(fields.b_title)

		if fields.b_title_trans:
			# sourceEncoding or targetEncoding ?
			fields.u_title_trans = decodeSourceField(fields.b_title_trans)

		if fields.b_transcription_50:
			if fields.code_transcription_50 == 0x10:
				# contains values like this (char codes):
				# 00 18 00 19 00 1A 00 1B 00 1C 00 1D 00 1E 00 40 00 07
				# this is not utf-16
				# what is this?
				pass
			elif fields.code_transcription_50 == 0x1b:
				fields.u_transcription_50 = decodeSourceField(
					fields.b_transcription_50,
				)
			elif fields.code_transcription_50 == 0x18:
				# incomplete text like:
				# t c=T>02D0;</charset>g<charset c=T>0259;</charset>-
				# This defi normally contains fields.b_transcription_60
				# in this case.
				pass
			else:
				# arguments are passed separately so the message is only
				# formatted when debug logging is enabled
				log.debug(
					"processDefi(%s)\n"
					"b_key = %s:\n"
					"defi field 50, "
					"unknown code: %#.2x",
					b_defi,
					b_key,
					fields.code_transcription_50,
				)

		if fields.b_transcription_60:
			if fields.code_transcription_60 == 0x1b:
				fields.u_transcription_60 = decodeSourceField(
					fields.b_transcription_60,
				)
			else:
				# same wording as the field 50 message, including the
				# ", " separator that used to be missing here
				log.debug(
					"processDefi(%s)\n"
					"b_key = %s:\n"
					"defi field 60, "
					"unknown code: %#.2x",
					b_defi,
					b_key,
					fields.code_transcription_60,
				)

		if fields.b_field_1a:
			# Only decoded: it is not part of the returned string, but it
			# is available on `fields` for processDefiStat below.
			fields.u_field_1a, _ = self.decodeCharsetTags(
				fields.b_field_1a,
				self.sourceEncoding,
			)

		self.processDefiStat(fields, b_defi, b_key)

		u_defi_format = ""
		if fields.partOfSpeech or fields.u_title:
			if fields.partOfSpeech:
				u_defi_format += '<font color="#%s">%s</font>' % (
					self.partOfSpeechColor,
					xml_escape(fields.partOfSpeech),
				)
			if fields.u_title:
				if u_defi_format:
					u_defi_format += " "
				u_defi_format += fields.u_title
			u_defi_format += "<br>\n"
		if fields.u_title_trans:
			u_defi_format += fields.u_title_trans + "<br>\n"
		if fields.u_transcription_50:
			u_defi_format += "[%s]<br>\n" % fields.u_transcription_50
		if fields.u_transcription_60:
			u_defi_format += "[%s]<br>\n" % fields.u_transcription_60
		if fields.u_defi:
			u_defi_format += fields.u_defi
		return u_defi_format
Beispiel #8
0
	def processDefi(self, b_defi, b_key):
		"""
		Decode the fields of a raw definition and render them as one string.

		b_defi: bytes, raw definition data handed to collectDefiFields
		b_key: bytes, key of the entry (used for field collection,
			statistics and debug messages)

		return: u_defi_format, a str built from these parts, in this order,
			each one only when it is non-empty:
			- part of speech inside a <font> tag colored with
			  self.partOfSpeechColor, and/or the title, followed by <br>
			- title translation, followed by <br>
			- transcription from field 50, in square brackets
			- transcription from field 60, in square brackets
			- the definition body
		"""

		def decodeSourceField(b_text):
			# Shared pipeline for the auxiliary fields: decode with the
			# source encoding, resolve HTML entities, drop control chars.
			# The "single encoding" flag of decodeCharsetTags is not needed.
			u_text, _ = self.decodeCharsetTags(b_text, self.sourceEncoding)
			u_text = replaceHtmlEntries(u_text)
			return removeControlChars(u_text)

		fields = DefinitionFields()
		self.collectDefiFields(b_defi, b_key, fields)

		# The definition body is the only field decoded with targetEncoding.
		fields.u_defi, fields.singleEncoding = self.decodeCharsetTags(
			fields.b_defi,
			self.targetEncoding,
		)
		if fields.singleEncoding:
			fields.encoding = self.targetEncoding
		fields.u_defi = fixImgLinks(fields.u_defi)
		fields.u_defi = replaceHtmlEntries(fields.u_defi)
		fields.u_defi = removeControlChars(fields.u_defi)
		fields.u_defi = normalizeNewlines(fields.u_defi)
		fields.u_defi = fields.u_defi.strip()

		if fields.b_title:
			fields.u_title = decodeSourceField(fields.b_title)

		if fields.b_title_trans:
			# sourceEncoding or targetEncoding ?
			fields.u_title_trans = decodeSourceField(fields.b_title_trans)

		if fields.b_transcription_50:
			if fields.code_transcription_50 == 0x10:
				# contains values like this (char codes):
				# 00 18 00 19 00 1A 00 1B 00 1C 00 1D 00 1E 00 40 00 07
				# this is not utf-16
				# what is this?
				pass
			elif fields.code_transcription_50 == 0x1b:
				fields.u_transcription_50 = decodeSourceField(
					fields.b_transcription_50,
				)
			elif fields.code_transcription_50 == 0x18:
				# incomplete text like:
				# t c=T>02D0;</charset>g<charset c=T>0259;</charset>-
				# This defi normally contains fields.b_transcription_60
				# in this case.
				pass
			else:
				# "#04x": the width counts the "0x" prefix, so 4 is needed
				# to zero-pad to two hex digits ("#02x" never pads)
				log.debug(
					f"processDefi({b_defi})\nb_key = {b_key}"
					f":\ndefi field 50"
					f", unknown code: {fields.code_transcription_50:#04x}"
				)

		if fields.b_transcription_60:
			if fields.code_transcription_60 == 0x1b:
				fields.u_transcription_60 = decodeSourceField(
					fields.b_transcription_60,
				)
			else:
				# see the note on "#04x" for field 50 above
				log.debug(
					f"processDefi({b_defi})\nb_key = {b_key}"
					f":\ndefi field 60"
					f", unknown code: {fields.code_transcription_60:#04x}"
				)

		if fields.b_field_1a:
			# Only decoded: it is not part of the returned string, but it
			# is available on `fields` for processDefiStat below.
			fields.u_field_1a, _ = self.decodeCharsetTags(
				fields.b_field_1a,
				self.sourceEncoding,
			)

		self.processDefiStat(fields, b_defi, b_key)

		u_defi_format = ""
		if fields.partOfSpeech or fields.u_title:
			if fields.partOfSpeech:
				pos = xml_escape(fields.partOfSpeech)
				posColor = self.partOfSpeechColor
				u_defi_format += f'<font color="#{posColor}">{pos}</font>'
			if fields.u_title:
				if u_defi_format:
					u_defi_format += " "
				u_defi_format += fields.u_title
			u_defi_format += "<br>\n"
		if fields.u_title_trans:
			u_defi_format += fields.u_title_trans + "<br>\n"
		if fields.u_transcription_50:
			u_defi_format += f"[{fields.u_transcription_50}]<br>\n"
		if fields.u_transcription_60:
			u_defi_format += f"[{fields.u_transcription_60}]<br>\n"
		if fields.u_defi:
			u_defi_format += fields.u_defi
		return u_defi_format