Ejemplo n.º 1
0
 def test_pos_crf_tagger_old_english(self):
     """Check that CRF tagging of an Old English sentence gives a truthy result."""
     sentence = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
     result = POSTag('old_english').tag_crf(sentence)
     # Only truthiness is checked; the individual tags are not inspected.
     self.assertTrue(result)
Ejemplo n.º 2
0
 def test_pos_crf_tagger_old_english(self):
     """CRF-tag an Old English sentence and assert the output is truthy."""
     old_english_tagger = POSTag('old_english')
     text = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
     # The test passes as long as the tagger returns something truthy.
     self.assertTrue(old_english_tagger.tag_crf(text))
Ejemplo n.º 3
0
def get_words_from_file(path, file_dict, new_system):
    """Collect word occurrences from the TEI XML file at ``path``.

    Every ``div`` of type ``edition`` is processed first, followed by every
    ``div`` of type ``translation``. Words coming from a translation get the
    edition type ``"translation"`` and the file language suffixed with
    ``"-transl"``.

    Args:
        path: Path of the XML file to read. It is also used as the key into
            ``file_dict`` and is passed on to every created word occurrence.
        file_dict: Mapping that is updated in place with
            ``file_dict[path] = iip_file(path, <region text>)`` when the
            document contains a ``region`` element.
        new_system: When truthy, words are obtained through
            ``get_words_from_element`` and part-of-speech tagged; otherwise
            they are collected by ``add_element_to_word_list``.

    Returns:
        A list of ``iip_word_occurrence`` objects whose text, after removing
        every pattern in ``IGNORE``, is not blank.
    """
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so depending on the platform's locale encoding would be inconsistent.
    with open(path, "r", encoding="utf-8") as path_file:
        file_string = path_file.read().replace("...", " ").encode("utf-8")
    root = etree.fromstring(file_string)
    nsmap = {'tei': "http://www.tei-c.org/ns/1.0"}
    textLang = root.find('.//' + TEI_NS + 'textLang')
    textRegion = root.find('.//' + TEI_NS + 'region')

    # A document without a <region> element previously raised AttributeError
    # on ``textRegion.text`` further down; the region is now passed as None.
    # NOTE(review): confirm iip_word_occurrence / add_element_to_word_list
    # accept None as the region argument.
    region_text = None
    if textRegion is not None:
        region_text = textRegion.text
        file_dict[path] = iip_file(path, region_text)

    # Language of the document itself; "unk" when missing or blank.
    base_lang = ""
    if textLang is not None:
        base_lang = textLang.attrib.get('mainLang', "")
    if base_lang.strip() == "":
        base_lang = "unk"

    editions = (
        root.findall(".//tei:div[@type='edition']", namespaces=nsmap) +
        root.findall(".//tei:div[@type='translation']", namespaces=nsmap))

    words = []
    for edition in editions:
        edition_type = edition.attrib.get('subtype', "")
        # The language is derived per div from the document language, so
        # several translation divs do not stack "-transl" suffixes.
        edition_lang = base_lang
        if edition.attrib["type"] == "translation":
            edition_type = "translation"
            edition_lang = base_lang + "-transl"

        if new_system:
            retrieved_words = get_words_from_element(edition)
            # Same text as before: every word followed by a single space.
            combined_words = "".join(e.text + " " for e in retrieved_words)

            tagged_words = None
            if "-transl" in edition_lang:
                # Translations are tagged with NLTK; this took precedence
                # over the Latin/Greek taggers in the original code as well.
                tagged_words = nltk.pos_tag(nltk.word_tokenize(combined_words))
            elif edition_lang in LATIN_CODES:
                tagged_words = POSTag('latin').tag_crf(combined_words)
            elif edition_lang in GREEK_CODES:
                tagged_words = POSTag('greek').tag_crf(combined_words)

            # Map token -> tag; for a repeated token the last tag wins, which
            # matches the result of scanning the whole tagged list per word.
            pos_by_text = {}
            if tagged_words is not None:
                for tagged_word in tagged_words:
                    pos_by_text[tagged_word[0]] = tagged_word[1]

            new_words = []
            for e in retrieved_words:
                occurrence = iip_word_occurrence(edition_type, edition_lang,
                                                 e.text, path, region_text,
                                                 e.surrounding_elements)
                occurrence.internal_elements = e.internal_elements
                occurrence.alternatives = e.alternatives
                occurrence.preceding = e.preceding
                occurrence.following = e.following
                if e.text in pos_by_text:
                    occurrence.pos = standardize_pos(pos_by_text[e.text])
                new_words.append(occurrence)
        else:
            # Seed the list with one empty occurrence and hand it to
            # add_element_to_word_list together with the edition.
            new_words = [
                iip_word_occurrence(edition_type, edition_lang, "", path,
                                    region_text, [])
            ]
            add_element_to_word_list(edition, new_words, edition,
                                     edition_lang, path, region_text, [])
        words.extend(new_words)

    # Normalise each word and keep only those with non-blank text.
    kept_words = []
    for word in words:
        word.text = str(word.text)
        for pattern in IGNORE:
            word.text = word.text.replace(pattern, "")
        if word.language.strip() == "":
            word.language = "unk"
        if word.text.strip() != "":
            kept_words.append(word)
    return kept_words
Ejemplo n.º 4
0
 def test_pos_crf_tagger_latin(self):
     """Check that CRF tagging of a Latin sentence gives a truthy result."""
     sentence = 'Gallia est omnis divisa in partes tres'
     result = POSTag('latin').tag_crf(sentence)
     # Only truthiness is checked; the individual tags are not inspected.
     self.assertTrue(result)
Ejemplo n.º 5
0
def tag(form):
    """Tag ``form`` with the Latin CRF tagger and return the result as a JSON string."""
    tagged = POSTag("latin").tag_crf(form)
    return json.dumps(tagged)
Ejemplo n.º 6
0
 def test_pos_crf_tagger_latin(self):
     """CRF-tag a Latin sentence and assert the output is truthy."""
     latin_tagger = POSTag('latin')
     text = 'Gallia est omnis divisa in partes tres'
     # The test passes as long as the tagger returns something truthy.
     self.assertTrue(latin_tagger.tag_crf(text))
Ejemplo n.º 7
0
from cltk.tag.pos import POSTag

# Latin verse excerpt used as tagger input. The literal is kept exactly as
# written, including the trailing "5" at the end of its fifth line.
aen = """arma virumque cano, Troiae qui primus ab oris
Italiam, fato profugus, Laviniaque venit
litora, multum ille et terris iactatus et alto
vi superum saevae memorem Iunonis ob iram;
multa quoque et bello passus, dum conderet urbem,               5
inferretque deos Latio, genus unde Latinum,
Albanique patres, atque altae moenia Romae."""

# Flatten the excerpt onto one line: every line break becomes a space.
aen = ' '.join(aen.split('\n'))

tagger = POSTag('latin')

# Tag the excerpt with tag_ngram_123_backoff and print the result.
aen_tagged = tagger.tag_ngram_123_backoff(aen)
print(aen_tagged)

# Tag a short Latin sentence with the CRF tagger and print the result.
cae_tagged = tagger.tag_crf('Gallia est omnis divisa in partes tres')
print(cae_tagged)