def test_pos_crf_tagger_old_english(self):
    """Check that CRF tagging of an Old English sentence yields a truthy result."""
    sentence = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
    crf_tagger = POSTag('old_english')
    result = crf_tagger.tag_crf(sentence)
    # Only truthiness is checked; the individual tags are not inspected.
    self.assertTrue(result)
def test_pos_crf_tagger_old_english(self):
    """Tag an Old English sentence with the CRF tagger and expect a truthy result."""
    old_english_text = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
    pos_tagger = POSTag('old_english')
    tag_output = pos_tagger.tag_crf(old_english_text)
    # The assertion passes for any truthy return value.
    self.assertTrue(tag_output)
def get_words_from_file(path, file_dict, new_system):
    """Extract the word occurrences contained in one TEI XML file.

    The file is parsed, then every ``div`` of type ``edition`` followed by
    every ``div`` of type ``translation`` is visited and turned into
    ``iip_word_occurrence`` objects.

    Args:
        path: Path of the XML file to read (decoded as UTF-8).
        file_dict: Mapping updated in place. When the file contains a
            ``region`` element, ``file_dict[path]`` is set to
            ``iip_file(path, <region text>)``.
        new_system: When truthy, words are obtained from
            ``get_words_from_element`` and POS-tagged; otherwise the word
            list is filled by ``add_element_to_word_list``.

    Returns:
        A list of word occurrences. Every pattern in ``IGNORE`` is removed
        from each word's text, words whose text is blank afterwards are
        dropped, and a blank ``language`` is replaced by ``"unk"``.

    Note:
        A file without a ``region`` element no longer raises
        ``AttributeError``; its occurrences are created with a region of
        ``None`` and ``file_dict`` is left untouched for that path.
    """
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right
    # below, so relying on the platform default encoding could corrupt it.
    with open(path, "r", encoding="utf-8") as path_file:
        file_string = path_file.read().replace("...", " ").encode("utf-8")
    root = etree.fromstring(file_string)
    nsmap = {'tei': "http://www.tei-c.org/ns/1.0"}

    text_lang = root.find('.//' + TEI_NS + 'textLang')
    text_region = root.find('.//' + TEI_NS + 'region')

    # NOTE(review): a missing region is passed on as None, the same value an
    # empty <region/> element already produces -- confirm downstream code
    # (iip_word_occurrence and users of file_dict) tolerates it.
    region_text = None
    if text_region is not None:
        region_text = text_region.text
        file_dict[path] = iip_file(path, region_text)

    main_lang = ""
    if text_lang is not None:
        main_lang = text_lang.attrib['mainLang']
    if main_lang.strip() == "":
        main_lang = "unk"

    editions = root.findall(".//tei:div[@type='edition']", namespaces=nsmap)
    translations = root.findall(
        ".//tei:div[@type='translation']", namespaces=nsmap)

    words = []
    for edition in editions + translations:
        edition_type = edition.attrib.get('subtype', "")
        # Derive the language per div instead of mutating the shared value,
        # so several translation divs do not accumulate "-transl" suffixes.
        edition_lang = main_lang
        if edition.attrib["type"] == "translation":
            edition_type = "translation"
            edition_lang = main_lang + "-transl"

        if new_system:
            retrieved_words = get_words_from_element(edition)
            # Every word is followed by a single space, including the last.
            combined_words = "".join(e.text + " " for e in retrieved_words)

            tagged_words = None
            if edition_lang in LATIN_CODES:
                tagged_words = POSTag('latin').tag_crf(combined_words)
            elif edition_lang in GREEK_CODES:
                tagged_words = POSTag('greek').tag_crf(combined_words)
            if "-transl" in edition_lang:
                tagged_words = nltk.pos_tag(
                    nltk.word_tokenize(combined_words))

            # Token -> tag lookup. When a token occurs more than once, the
            # last tag wins, matching a full scan without early exit.
            pos_by_token = {}
            if tagged_words is not None:
                pos_by_token = {
                    tagged[0]: tagged[1] for tagged in tagged_words
                }

            new_words = []
            for e in retrieved_words:
                occurrence = iip_word_occurrence(
                    edition_type, edition_lang, e.text, path, region_text,
                    e.surrounding_elements)
                occurrence.internal_elements = e.internal_elements
                occurrence.alternatives = e.alternatives
                occurrence.preceding = e.preceding
                occurrence.following = e.following
                if e.text in pos_by_token:
                    occurrence.pos = standardize_pos(pos_by_token[e.text])
                new_words.append(occurrence)
        else:
            new_words = [
                iip_word_occurrence(
                    edition_type, edition_lang, "", path, region_text, [])
            ]
            add_element_to_word_list(
                edition, new_words, edition, edition_lang, path,
                region_text, [])
        words += new_words

    # Clean up the collected words and remember the ones left without text.
    null_words = []
    for word in words:
        word.text = str(word.text)
        for pattern in IGNORE:
            word.text = word.text.replace(pattern, "")
        if word.text.strip() == "":
            null_words.append(word)
        if word.language.strip() == "":
            word.language = "unk"
    return [x for x in words if x not in null_words]
def test_pos_crf_tagger_latin(self):
    """Check that CRF tagging of a Latin sentence yields a truthy result."""
    sentence = 'Gallia est omnis divisa in partes tres'
    crf_tagger = POSTag('latin')
    result = crf_tagger.tag_crf(sentence)
    # Only truthiness is checked; the individual tags are not inspected.
    self.assertTrue(result)
def tag(form):
    """Tag *form* with the Latin CRF tagger and return the result as JSON.

    Args:
        form: Text handed to ``POSTag("latin").tag_crf``.

    Returns:
        The tagger output serialized with ``json.dumps``.
    """
    latin_tagger = POSTag("latin")
    tagged_form = latin_tagger.tag_crf(form)
    return json.dumps(tagged_form)
def test_pos_crf_tagger_latin(self):
    """Tag a Latin sentence with the CRF tagger and expect a truthy result."""
    latin_text = 'Gallia est omnis divisa in partes tres'
    pos_tagger = POSTag('latin')
    tag_output = pos_tagger.tag_crf(latin_text)
    # The assertion passes for any truthy return value.
    self.assertTrue(tag_output)
from cltk.tag.pos import POSTag

# Sample Latin verse passage used as input for the n-gram backoff tagger.
aen = """arma virumque cano, Troiae qui primus ab oris Italiam, fato profugus, Laviniaque venit litora, multum ille et terris iactatus et alto vi superum saevae memorem Iunonis ob iram; multa quoque et bello passus, dum conderet urbem, 5 inferretque deos Latio, genus unde Latinum, Albanique patres, atque altae moenia Romae."""

# Replace any line breaks in the passage with spaces so it is tagged as a
# single string.
aen = aen.replace('\n', ' ')

# One Latin tagger instance is shared by both tagging calls below.
tagger = POSTag('latin')

# Tag the verse passage via tag_ngram_123_backoff and print the result.
aen_tagged = tagger.tag_ngram_123_backoff(aen)
print(aen_tagged)

# Tag a short prose sentence via tag_crf and print the result.
cae_tagged = tagger.tag_crf('Gallia est omnis divisa in partes tres')
print(cae_tagged)