def detokenize(self, markambiguous):
    # Wrap every vowel (long or short) in a bare <span>, so the web
    # interface can style or attach handlers to individual letters.
    def enspancharacters(text):
        result = ""
        for char in text:
            if char in u"āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY":
                result = result + '<span>' + char + '</span>'
            else:
                result = result + char
        return result
    #enddef
    result = ""
    enclitic = ""
    for token in self.tokens:
        if token.isreordered:
            # An enclitic that was split off and moved ahead for tagging:
            # remember it, and reattach it to the token that follows.
            enclitic = token.macronized
        else:
            if token.token.lower() == "ne" and len(enclitic) > 0:  ## Not nēque...
                result += token.token + enclitic
            else:
                unicodetext = postags.unicodeaccents(token.macronized)
                if markambiguous:
                    unicodetext = enspancharacters(unicodetext)
                    if token.isambiguous:
                        unicodetext = '<span class="ambig">' + unicodetext + '</span>'
                    elif token.isunknown:
                        unicodetext = '<span class="unknown">' + unicodetext + '</span>'
                    else:
                        unicodetext = '<span class="auto">' + unicodetext + '</span>'
                result += unicodetext + enclitic
            enclitic = ""
    return result
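
# A minimal standalone sketch (not part of the original file) mirroring the
# enspancharacters helper above, to show its output on a sample word; the
# word u"vīta" and the name _enspan_demo are illustrative only.
def _enspan_demo(text):
    result = ""
    for char in text:
        if char in u"āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY":
            result += '<span>' + char + '</span>'
        else:
            result += char
    return result

assert _enspan_demo(u"vīta") == u'v<span>ī</span>t<span>a</span>'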
def detokenize(self, markambiguous):
    # Assumes the module-level imports: re, postags, and an HTML escape
    # helper such as xml.sax.saxutils.escape.
    result = []
    for token in self.tokens:
        if token.isword:
            unicodetext = postags.unicodeaccents(token.macronized)
            if markambiguous:
                # Wrap each vowel in a bare <span>, then mark the whole word
                # according to how confident the macronization is.
                unicodetext = re.sub(r"([āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY])", "<span>\\1</span>", unicodetext)
                if token.isunknown:
                    unicodetext = '<span class="unknown">%s</span>' % unicodetext
                elif len(set([x.replace("^", "") for x in token.accented])) > 1:
                    # More than one candidate accentuation: ambiguous.
                    unicodetext = '<span class="ambig">%s</span>' % unicodetext
                else:
                    unicodetext = '<span class="auto">%s</span>' % unicodetext
            result.append(unicodetext)
        else:
            if markambiguous:
                result.append(escape(token.macronized))
            else:
                result.append(token.macronized)
    return "".join(result)
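
# A quick standalone sketch (hypothetical example word) of the re.sub call in
# detokenize above: capture group 1 matches each vowel, long or short, and is
# replayed inside a bare <span>.
import re

assert re.sub(r"([āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY])", "<span>\\1</span>", u"amāre") \
    == u"<span>a</span>m<span>ā</span>r<span>e</span>"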
#!/usr/bin/python
# -*- coding: utf-8 -*-

import postags
import codecs

macronsfile = codecs.open("macrons.txt", "r", "utf8")
lexicon = codecs.open("rftagger-lexicon.txt", "w", "utf8")

tagtoaccents = {}

for line in macronsfile:
    [wordform, tag, lemma, accented] = line.split()
    # Strip the hidden-quantity markers before recording the accented form.
    accented = accented.replace("_^", "").replace("^", "")
    tagtoaccents[tag] = tagtoaccents.get(tag, []) + [postags.unicodeaccents(accented)]
    if accented[0].isupper():
        wordform = wordform.title()
    # RFTagger expects the positional tag split into dot-separated features.
    tag = '.'.join(list(tag))
    lexicon.write(wordform + '\t' + tag + '\t' + lemma + '\n')

def escapedaccents(txt):
    for replacement, source in [("a_", u"ā"), ("e_", u"ē"), ("i_", u"ī"), ("o_", u"ō"), ("u_", u"ū"), ("y_", u"ȳ"),
                                ("A_", u"Ā"), ("E_", u"Ē"), ("I_", u"Ī"), ("O_", u"Ō"), ("U_", u"Ū"), ("Y_", u"Ȳ")]:
        txt = txt.replace(source, replacement)
    return txt

endingsfile = codecs.open("macronized-endings.txt", "w", "utf8")

for tag in tagtoaccents:
    endingfreqs = {}
    for accented in tagtoaccents[tag]:
        for i in range(1, min(len(accented) - 3, 12)):
            # (Loop body reconstructed from the later revision of this
            # script below.) Count word endings of length 1 to 11.
            ending = accented[-i:]
            endingfreqs[ending] = endingfreqs.get(ending, 0) + 1
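
# A quick sketch (hypothetical input, not in the original script) exercising
# escapedaccents above: macron vowels are rewritten to the "a_"-style escapes.
assert escapedaccents(u"rēgīna") == u"re_gi_na"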
import postags
import codecs
from collections import defaultdict
import xml.etree.ElementTree as ET
import pprint

pp = pprint.PrettyPrinter()

tag_to_accents = defaultdict(list)

with codecs.open('macrons.txt', 'r', 'utf8') as macrons_file, \
        codecs.open('rftagger-lexicon.txt', 'w', 'utf8') as lexicon_file:
    for line in macrons_file:
        [wordform, tag, lemma, accented] = line.split()
        accented = accented.replace('_^', '').replace('^', '')
        tag_to_accents[tag].append(postags.unicodeaccents(accented))
        if accented[0].isupper():
            wordform = wordform.title()
        tag = '.'.join(list(tag))
        lexicon_file.write("%s\t%s\t%s\n" % (wordform, tag, lemma))

with codecs.open('macronized_endings.py', 'w', 'utf8') as endings_file:
    endings_file.write('tag_to_endings = {\n')
    for tag in sorted(tag_to_accents):
        ending_freqs = defaultdict(int)
        for accented in tag_to_accents[tag]:
            for i in range(1, min(len(accented) - 3, 12)):
                ending = accented[-i:]
                ending_freqs[ending] += 1
        relevant_endings = []
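
# A small illustration (toy words, not from macrons.txt) of the ending
# counting above: for each accented form, every suffix of length 1 up to
# min(len - 4, 11) is tallied, so frequent macronized endings can be learned.
from collections import defaultdict

demo_freqs = defaultdict(int)
for accented in [u"amāmus", u"laudāmus"]:
    for i in range(1, min(len(accented) - 3, 12)):
        demo_freqs[accented[-i:]] += 1

assert demo_freqs == {u"s": 2, u"us": 2, u"mus": 1, u"āmus": 1}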