Code example #1
File: macronizer.py  Project: gridl/latin-macronizer
    def detokenize(self, markambiguous):
        def enspancharacters(text):
            result = ""
            for char in text:
                if char in u"āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY":
                    result = result + '<span>' + char + '</span>'
                else:
                    result = result + char
            return result

        #enddef
        result = ""
        enclitic = ""
        for token in self.tokens:
            if token.isreordered:
                enclitic = token.macronized
            else:
                if token.token.lower() == "ne" and len(enclitic) > 0:  ## Not nēque...
                    result += token.token + enclitic
                else:
                    unicodetext = postags.unicodeaccents(token.macronized)
                    if markambiguous:
                        unicodetext = enspancharacters(unicodetext)
                        if token.isambiguous:
                            unicodetext = '<span class="ambig">' + unicodetext + '</span>'
                        elif token.isunknown:
                            unicodetext = '<span class="unknown">' + unicodetext + '</span>'
                        else:
                            unicodetext = '<span class="auto">' + unicodetext + '</span>'
                    result += unicodetext + enclitic
                enclitic = ""
        return result
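
A minimal standalone sketch of the vowel-wrapping step used above (the name wrap_vowels and the sample word are illustrative, not part of the project): each plain or macronized vowel gets its own <span>, and the whole token is then wrapped in a span classed "ambig", "unknown" or "auto".

# Sketch only: reimplements the enspancharacters() idea from the example above.
VOWELS = u"āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY"

def wrap_vowels(text):
    return "".join('<span>%s</span>' % c if c in VOWELS else c for c in text)

print(wrap_vowels(u"rōsa"))
# -> r<span>ō</span>s<span>a</span>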
Code example #2
File: macronizer.py  Project: mk270/latin-macronizer
    def detokenize(self, markambiguous):
        def enspancharacters(text):
            result = ""
            for char in text:
                if char in u"āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY":
                    result = result + '<span>' + char + '</span>'
                else:
                    result = result + char
            return result
        #enddef
        result = ""
        enclitic = ""
        for token in self.tokens:
            if token.isreordered:
                enclitic = token.macronized
            else:
                if token.token.lower() == "ne" and len(enclitic) > 0:  ## Not nēque...
                    result += token.token + enclitic
                else:
                    unicodetext = postags.unicodeaccents(token.macronized)
                    if markambiguous:
                        unicodetext = enspancharacters(unicodetext)
                        if token.isambiguous:
                            unicodetext = '<span class="ambig">' + unicodetext + '</span>'
                        elif token.isunknown:
                            unicodetext = '<span class="unknown">' + unicodetext + '</span>'
                        else:
                            unicodetext = '<span class="auto">' + unicodetext + '</span>'
                    result += unicodetext + enclitic
                enclitic = ""
        return result
Code example #3
File: macronizer.py  Project: vbskr/latin-macronizer
    def detokenize(self, markambiguous):
        # `re`, `postags` and `escape` (presumably xml.sax.saxutils.escape)
        # are imported at module level in the original file.
        result = []
        for token in self.tokens:
            if token.isword:
                unicodetext = postags.unicodeaccents(token.macronized)
                if markambiguous:
                    unicodetext = re.sub(r"([āēīōūȳĀĒĪŌŪȲaeiouyAEIOUY])", "<span>\\1</span>", unicodetext)
                    if token.isunknown:
                        unicodetext = '<span class="unknown">%s</span>' % unicodetext
                    elif len(set([x.replace("^", "") for x in token.accented])) > 1:
                        unicodetext = '<span class="ambig">%s</span>' % unicodetext
                    else:
                        unicodetext = '<span class="auto">%s</span>' % unicodetext
                result.append(unicodetext)
            else:
                if markambiguous:
                    result.append(escape(token.macronized))
                else:
                    result.append(token.macronized)
        return "".join(result)
Code example #4
#!/usr/bin/python
# -*- coding: utf-8 -*-

import postags
import codecs

macronsfile = codecs.open("macrons.txt","r","utf8")
lexicon = codecs.open("rftagger-lexicon.txt","w","utf8")

tagtoaccents = {}

for line in macronsfile:
    [wordform, tag, lemma, accented] = line.split()
    accented = accented.replace("_^", "").replace("^", "")
    tagtoaccents[tag] = tagtoaccents.get(tag,[]) + [postags.unicodeaccents(accented)]
    if accented[0].isupper():
        wordform = wordform.title()
    tag = '.'.join(list(tag))
    lexicon.write(wordform + '\t' + tag + '\t' + lemma + '\n')

def escapedaccents(txt):
    for replacement, source in [("a_",u"ā"),("e_",u"ē"),("i_",u"ī"),("o_",u"ō"),("u_",u"ū"),("y_",u"ȳ"),
                                ("A_",u"Ā"),("E_",u"Ē"),("I_",u"Ī"),("O_",u"Ō"),("U_",u"Ū"),("Y_",u"Ȳ")]:
        txt = txt.replace(source,replacement)
    return txt

endingsfile = codecs.open("macronized-endings.txt","w","utf8")
for tag in tagtoaccents:
    endingfreqs = {}
    for accented in tagtoaccents[tag]:
        for i in range(1,min(len(accented)-3, 12)):
Code example #5
import postags
import codecs
from collections import defaultdict
import xml.etree.ElementTree as ET
import pprint

pp = pprint.PrettyPrinter()

tag_to_accents = defaultdict(list)
with codecs.open('macrons.txt', 'r', 'utf8') as macrons_file, \
     codecs.open('rftagger-lexicon.txt', 'w', 'utf8') as lexicon_file:
    for line in macrons_file:
        [wordform, tag, lemma, accented] = line.split()
        accented = accented.replace('_^', '').replace('^', '')
        tag_to_accents[tag].append(postags.unicodeaccents(accented))
        if accented[0].isupper():
            wordform = wordform.title()
        tag = '.'.join(list(tag))
        lexicon_file.write("%s\t%s\t%s\n" % (wordform, tag, lemma))


with codecs.open('macronized_endings.py', 'w', 'utf8') as endings_file:
    endings_file.write('tag_to_endings = {\n')
    for tag in sorted(tag_to_accents):
        ending_freqs = defaultdict(int)
        for accented in tag_to_accents[tag]:
            for i in range(1, min(len(accented)-3, 12)):
                ending = accented[-i:]
                ending_freqs[ending] += 1
        relevant_endings = []
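
The loop above tallies how often each ending occurs for a given tag. A standalone sketch of just that counting step, with illustrative sample forms (not taken from macrons.txt):

from collections import defaultdict

sample_forms = [u"puellārum", u"rosārum", u"dominōrum"]  # illustrative only

ending_freqs = defaultdict(int)
for accented in sample_forms:
    for i in range(1, min(len(accented) - 3, 12)):
        ending_freqs[accented[-i:]] += 1

print(dict(ending_freqs))
# "m", "um" and "rum" are counted for all three forms; longer endings less often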
Code example #6
#!/usr/bin/python
# -*- coding: utf-8 -*-

import postags
import codecs

macronsfile = codecs.open("macrons.txt", "r", "utf8")
lexicon = codecs.open("rftagger-lexicon.txt", "w", "utf8")

tagtoaccents = {}

for line in macronsfile:
    [wordform, tag, lemma, accented] = line.split()
    tagtoaccents[tag] = tagtoaccents.get(
        tag, []) + [postags.unicodeaccents(accented)]
    tag = '.'.join(list(tag))
    lexicon.write(wordform + '\t' + tag + '\t' + lemma + '\n')


def escapedaccents(txt):
    for replacement, source in [("a_", u"ā"), ("e_", u"ē"), ("i_", u"ī"),
                                ("o_", u"ō"), ("u_", u"ū"), ("y_", u"ȳ"),
                                ("A_", u"Ā"), ("E_", u"Ē"), ("I_", u"Ī"),
                                ("O_", u"Ō"), ("U_", u"Ū"), ("Y_", u"Ȳ")]:
        txt = txt.replace(source, replacement)
    return txt


#enddef
endingsfile = codecs.open("macronized-endings.txt", "w", "utf8")
for tag in tagtoaccents:
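
As a quick self-contained check of the escapedaccents() helper defined in this example: it rewrites each macronized vowel as the plain letter followed by an underscore. The sample word below is illustrative only.

# -*- coding: utf-8 -*-
def escapedaccents(txt):
    # copied from the example above so this demo runs on its own
    for replacement, source in [("a_", u"ā"), ("e_", u"ē"), ("i_", u"ī"),
                                ("o_", u"ō"), ("u_", u"ū"), ("y_", u"ȳ"),
                                ("A_", u"Ā"), ("E_", u"Ē"), ("I_", u"Ī"),
                                ("O_", u"Ō"), ("U_", u"Ū"), ("Y_", u"Ȳ")]:
        txt = txt.replace(source, replacement)
    return txt

print(escapedaccents(u"puellārum"))
# -> puella_rum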