Example #1
0
def evaluate(goldstandard, macronizedtext):
    """Score a macronized text against a gold standard, character by character.

    The two texts are walked in lockstep (stopping at the end of the shorter
    one). Every character whose ``postags.removemacrons`` form is one of
    ``AEIOUYaeiouy`` counts as a vowel; it counts as correct when the two
    characters are identical.

    Args:
        goldstandard: The reference text.
        macronizedtext: The text to be evaluated.

    Returns:
        A tuple ``(accuracy, html)``: ``accuracy`` is the fraction of counted
        vowels that are identical in both texts, and ``html`` is
        ``macronizedtext`` HTML-escaped, with every character that differs
        from the gold standard (after ``toascii``/``touiorthography``
        normalization) wrapped in ``<span class="wrong">``.

    Raises:
        Exception: If the two texts differ in more than macrons/orthography.
        ZeroDivisionError: If no vowel was counted (e.g. empty input).
    """
    vowelcount = 0
    lengthcorrect = 0
    outtext = []
    for a, b in zip(goldstandard, macronizedtext):
        plaina = postags.removemacrons(a)
        plainb = postags.removemacrons(b)
        # Apart from macrons and orthography the texts must be identical,
        # otherwise the character-wise alignment is meaningless.
        if touiorthography(toascii(plaina)) != touiorthography(toascii(plainb)):
            raise Exception("Error: Text mismatch.")
        if plaina in "AEIOUYaeiouy":
            vowelcount += 1
            if a == b:
                lengthcorrect += 1
        if toascii(touiorthography(a)) == toascii(touiorthography(b)):
            outtext.append(escape(b))
        else:
            # Escape here as well: the character goes into HTML output just
            # like in the matching branch above.
            outtext.append('<span class="wrong">%s</span>' % escape(b))
    return lengthcorrect / float(vowelcount), "".join(outtext)
Example #2
0
 def __init__(self, token):
     """Create a token with empty annotations and default flags.

     Args:
         token: The token text. It is stored in ``self.token`` after being
             passed through ``postags.removemacrons``; the word/space
             tests below are run on the original argument.
     """
     self.tag = ""
     self.lemma = ""
     self.accented = ""
     self.macronized = ""
     self.token = postags.removemacrons(token)
     # Raw strings keep \W, \d and \s as regex escapes instead of (invalid)
     # string escapes. Both attributes hold the re.match result (a match
     # object or None), so they are meant to be used in truth tests.
     # isword: the token starts with a Unicode letter.
     self.isword = re.match(r"[^\W\d_]", token, flags=re.UNICODE)
     # isspace: the token starts with a whitespace character.
     self.isspace = re.match(r"\s", token, flags=re.UNICODE)
     self.isreordered = False
     self.startssentence = False
     self.endssentence = False
     self.isunknown = False
     self.isambiguous = False
Example #3
0
 def __init__(self, token):
     """Set up a token: blank annotations, all status flags switched off.

     Args:
         token: The token text. ``self.token`` receives the result of
             ``postags.removemacrons(token)``; the regex tests below use
             the argument as given.
     """
     self.tag = ""
     self.lemma = ""
     self.accented = ""
     self.macronized = ""
     self.token = postags.removemacrons(token)
     # The patterns are raw strings so that \W, \d and \s reach the regex
     # engine untouched rather than being parsed as string escapes.
     # re.match returns a match object or None; both attributes therefore
     # serve as truthy/falsy flags.
     self.isword = re.match(r"[^\W\d_]", token, flags=re.UNICODE)  # starts with a letter
     self.isspace = re.match(r"\s", token, flags=re.UNICODE)  # starts with whitespace
     self.isreordered = False
     self.startssentence = False
     self.endssentence = False
     self.isunknown = False
     self.isambiguous = False
Example #4
0
 def __init__(self, text):
     """Create a token with empty annotations and default flags.

     Args:
         text: The token text. It is stored in ``self.text`` after being
             passed through ``postags.removemacrons``; the word/space
             tests below are run on the original argument.
     """
     self.tag = ""
     self.lemma = ""
     self.accented = [""]
     self.macronized = ""
     self.text = postags.removemacrons(text)
     # Raw strings keep \W, \d and \s as regex escapes instead of (invalid)
     # string escapes.
     # isword: True when the text starts with a Unicode letter.
     self.isword = re.match(r"[^\W\d_]", text, flags=re.UNICODE) is not None
     # isspace: True when the text starts with a whitespace character.
     self.isspace = re.match(r"\s", text, flags=re.UNICODE) is not None
     self.hasenclitic = False
     self.isenclitic = False
     self.startssentence = False
     self.endssentence = False
     self.isunknown = False
    tagtoaccents[tag] = tagtoaccents.get(tag,[]) + [postags.unicodeaccents(accented)]
    if accented[0].isupper():
        wordform = wordform.title()
    tag = '.'.join(list(tag))
    lexicon.write(wordform + '\t' + tag + '\t' + lemma + '\n')

def escapedaccents(txt):
    """Return ``txt`` with every macronized vowel written as vowel + ``_``.

    Covers ā ē ī ō ū ȳ and their uppercase forms (for example ``ā`` becomes
    ``a_``); every other character is left untouched.
    """
    macron_to_escape = (
        (u"ā", "a_"), (u"ē", "e_"), (u"ī", "i_"),
        (u"ō", "o_"), (u"ū", "u_"), (u"ȳ", "y_"),
        (u"Ā", "A_"), (u"Ē", "E_"), (u"Ī", "I_"),
        (u"Ō", "O_"), (u"Ū", "U_"), (u"Ȳ", "Y_"),
    )
    for macronized, escaped in macron_to_escape:
        txt = txt.replace(macronized, escaped)
    return txt

# Write one line per tag to "macronized-endings.txt": the tag followed by the
# tab-separated, escaped word endings that are relevant for that tag.
# The ``with`` block makes sure the file is flushed and closed when done.
with codecs.open("macronized-endings.txt", "w", "utf8") as endingsfile:
    for tag in tagtoaccents:
        # Count every suffix of length 1 .. min(len - 3, 12) - 1 over all
        # accented forms recorded for this tag.
        endingfreqs = {}
        for accented in tagtoaccents[tag]:
            for i in range(1, min(len(accented) - 3, 12)):
                ending = accented[-i:]
                endingfreqs[ending] = endingfreqs.get(ending, 0) + 1
        endingsfile.write(tag)
        relevantendings = []
        for ending in endingfreqs:
            endingwithoutmacrons = postags.removemacrons(ending)
            # Keep an ending only if its first character is changed by
            # removemacrons and it is more frequent than the ending with
            # macrons removed (counted as 1 when that form never occurs).
            if (ending[0] != endingwithoutmacrons[0]
                    and endingfreqs[ending] > endingfreqs.get(endingwithoutmacrons, 1)):
                relevantendings.append(ending)
        # Longest endings first. key/reverse works on Python 2 and 3 and is
        # stable, so ties keep the same relative order as the old cmp sort.
        relevantendings.sort(key=len, reverse=True)
        for ending in relevantendings:
            endingsfile.write('\t' + escapedaccents(ending))
        endingsfile.write('\n')

Example #6
0
            wordform = wordform.title()
        tag = '.'.join(list(tag))
        lexicon_file.write("%s\t%s\t%s\n" % (wordform, tag, lemma))


# Generate the module "macronized_endings.py", which defines the dict
# ``tag_to_endings``: tag -> list of escaped endings, longest first.
with codecs.open('macronized_endings.py', 'w', 'utf8') as endings_file:
    endings_file.write('tag_to_endings = {\n')
    for tag in sorted(tag_to_accents):
        # Tally every suffix of length 1 .. min(len - 3, 12) - 1 of the
        # accented forms recorded for this tag.
        ending_freqs = defaultdict(int)
        for accented in tag_to_accents[tag]:
            upper_bound = min(len(accented) - 3, 12)
            for size in range(1, upper_bound):
                ending_freqs[accented[-size:]] += 1

        relevant_endings = []
        for ending in ending_freqs:
            ending_without_macrons = postags.removemacrons(ending)
            # Skip endings whose first character removemacrons leaves alone.
            if ending[0] == ending_without_macrons[0]:
                continue
            # .get() avoids inserting the plain form into the defaultdict;
            # a form that never occurs is counted as 1.
            if ending_freqs[ending] > ending_freqs.get(ending_without_macrons, 1):
                relevant_endings.append(ending)

        # Longest endings first, ties broken alphabetically.
        ordered_endings = sorted(relevant_endings, key=lambda x: (-len(x), x))
        cleaned_list = [str(postags.escape_macrons(ending)) for ending in ordered_endings]
        endings_file.write("  '%s': %s,\n" % (str(tag), cleaned_list))
    endings_file.write('}\n')


with codecs.open('ldt-corpus.txt', 'w', 'utf8') as pos_corpus_file:
    xsegment = ''
    xsegmentbehind = ''
    for f in ['1999.02.0010',
              '2008.01.0002',
              '2007.01.0001',
              '1999.02.0060',
              'phi0448.phi001.perseus-lat1',
Example #7
0
 # Emit the end of the HTML form: the "itoj" checkbox (pre-checked when
 # performitoj is truthy), the submit button, and the closing tags.
 print '<input type="checkbox" name="itoj" value="on" %s> Convert i to j.<br>' % ("checked" if performitoj else "")
 print '<input type="submit" value="Submit"> (Please be patient!)<br>'
 print '</p></form>'
 
 # Result section: printed only when macronizedtext is not the empty string.
 if macronizedtext != "":
     print '<h2>Result</h2>'
     print '<p>(Ambiguous forms are marked <span class="ambig">yellow</span>; unknown forms are <span class="unknown">orange</span>. You may click on a vowel to add or remove a macron.)</p>'
     # Newlines in the detokenized text become <br> so line breaks survive
     # in HTML. NOTE(review): presumably detokenize(True) returns HTML
     # markup -- confirm against tokenization.detokenize.
     print tokenization.detokenize(True).replace("\n","<br>")
 
 # Evaluation section: runs only when macronization was requested and the
 # submitted text already contains at least one of the characters āēīōū.
 # The submitted text is then compared character by character with the
 # macronized output (presumably serving as the gold standard).
 if domacronize and any(i in texttomacronize for i in u"āēīōū"):
     print '<h2>Evaluation</h2>'
     # Written without a trailing newline, directly before the text itself.
     sys.stdout.write('<div style="white-space: pre-wrap;">')
     vowelcount = 0
     lengthcorrect = 0
     # zip stops at the end of the shorter text.
     for (a,b) in zip(list(texttomacronize),list(macronizedtext)):
         clean = postags.removemacrons(b)
         # Apart from macrons and orthography, both texts must be identical.
         if touiorthography(toascii(clean)) != touiorthography(toascii(postags.removemacrons(a))):
             raise Exception("Error: Text mismatch.")
         # Only vowels are counted; a vowel is correct when both characters
         # are exactly equal.
         if clean in "AEIOUYaeiouy":
             vowelcount += 1
             if a == b:
                 lengthcorrect += 1
         # Characters that differ after normalization are marked as wrong.
         if toascii(touiorthography(a)) == toascii(touiorthography(b)):
             sys.stdout.write(escape(b))
         else:
             sys.stdout.write('<span class="wrong">'+escape(b)+'</span>')
     print '</div>'
     # The trailing commas keep the three prints on one output line.
     print '<p>Accuracy:',
     # Percentage of correct vowels, with two decimals.
     # NOTE(review): raises ZeroDivisionError if no vowel was counted
     # (e.g. macronizedtext is empty) -- confirm this cannot happen.
     print "{0:.2f}".format(100 * lengthcorrect / float(vowelcount)),
     print '</p>'
 
Example #8
0

def escapedaccents(txt):
    """Replace each macronized vowel in ``txt`` by the plain vowel plus ``_``.

    The vowels ā ē ī ō ū ȳ and their uppercase counterparts are rewritten
    (``ā`` -> ``a_``, ``Ā`` -> ``A_``, ...); anything else passes through
    unchanged.
    """
    plain_vowels = "aeiouyAEIOUY"
    macron_vowels = u"āēīōūȳĀĒĪŌŪȲ"
    # The two strings are aligned position by position.
    for plain, with_macron in zip(plain_vowels, macron_vowels):
        txt = txt.replace(with_macron, plain + "_")
    return txt


#enddef
# For each tag, write a line to "macronized-endings.txt" consisting of the tag
# and the tab-separated, escaped endings that are relevant for it.
# Opening the file in a ``with`` block guarantees it is closed afterwards.
with codecs.open("macronized-endings.txt", "w", "utf8") as endingsfile:
    for tag in tagtoaccents:
        # Frequency of every suffix of length 1 .. min(len - 3, 12) - 1 among
        # the accented forms recorded for this tag.
        endingfreqs = {}
        for accented in tagtoaccents[tag]:
            for i in range(1, min(len(accented) - 3, 12)):
                ending = accented[-i:]
                endingfreqs[ending] = endingfreqs.get(ending, 0) + 1
        endingsfile.write(tag)
        relevantendings = []
        for ending in endingfreqs:
            endingwithoutmacrons = postags.removemacrons(ending)
            # An ending is relevant when removemacrons changes its first
            # character and it occurs more often than the ending with
            # macrons removed (an unseen form counts as 1).
            if (ending[0] != endingwithoutmacrons[0]
                    and endingfreqs[ending] > endingfreqs.get(endingwithoutmacrons, 1)):
                relevantendings.append(ending)
        # Longest first; the key-based sort replaces the Python-2-only cmp
        # argument and, being stable, orders ties exactly as before.
        relevantendings.sort(key=len, reverse=True)
        for ending in relevantendings:
            endingsfile.write('\t' + escapedaccents(ending))
        endingsfile.write('\n')