Beispiel #1
0
 def test_pos_crf_tagger_old_english(self):
     """CRF tagging of Old English should yield a non-empty result."""
     sample = "Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon."
     self.assertTrue(POSTag("ang").tag_crf(sample))
Beispiel #2
0
def POSTagger(wordList):
    """Tag *wordList* with the CLTK Greek TnT tagger and keep selected words.

    :param wordList: iterable of word strings to tag
    :return: list of words whose tag starts with 'N', 'V', or 'Unk'
    """
    if opts.Progress:
        print('Going for the POSTagger.')

    tagger = POSTag('greek')

    # First pass: tag every word and keep the raw (word, tag) pairs.
    listWithTags = []
    for word in wordList:
        taggedItem = tagger.tag_tnt(word)
        listWithTags.append(taggedItem)
        print(taggedItem)

    # Second pass: keep only nouns, verbs, and unknown-tag words.
    listWithSelected = []
    for entry in listWithTags:
        for word, tag in entry:
            if tag is None:
                # An untagged word stops processing of this entry
                # (preserves the original early-break behavior).
                break
            if tag.startswith(('N', 'V', 'Unk')):
                listWithSelected.append(word)

    return listWithSelected
Beispiel #3
0
 def test_pos_ngram123_tagger_latin(self):
     """Latin 1/2/3-gram backoff tagging should yield a non-empty result."""
     sample = "Gallia est omnis divisa in partes tres"
     self.assertTrue(POSTag("lat").tag_ngram_123_backoff(sample))
Beispiel #4
0
 def test_pos_ngram123_tagger_greek(self):
     """Greek 1/2/3-gram backoff tagging should yield a non-empty result."""
     sample = "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"
     self.assertTrue(POSTag("grc").tag_ngram_123_backoff(sample))
Beispiel #5
0
 def test_pos_trigram_greek(self):
     """Greek trigram tagging should yield a non-empty result."""
     sample = "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"
     self.assertTrue(POSTag("grc").tag_trigram(sample))
Beispiel #6
0
    def tag(self, mode='123'):
        """Gives words marked up with parts-of-speech.

        Delegates to the POS taggers provided by cltk. Two different
        tagging backends are available.

        Args:
            mode (:obj:`str`) Tagging mode, either '123' (1-, 2-, 3-gram
                backoff tagger) or 'tnt' (TnT statistical tagger)

        Returns:
            :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech

        Raises:
            ValueError: If an unknown tagging mode is specified.

        Example:
            >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.')
            >>> print(text.tag())
            [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')]
        """ # noqa
        mode = mode.lower()
        # Validate before importing/instantiating the tagger so a bad
        # mode fails fast without any heavy model loading.
        if mode not in ('123', 'tnt'):
            raise ValueError(
                'Invalid part of speech tagging mode specified.'
            )
        from cltk.tag.pos import POSTag
        tagger = POSTag(self.options['language'])
        if mode == '123':
            return tagger.tag_ngram_123_backoff(self.data)
        return tagger.tag_tnt(self.data)
Beispiel #7
0
 def test_pos_trigram_old_english(self):
     """Old English trigram tagging should yield a non-empty result."""
     sample = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
     self.assertTrue(POSTag('old_english').tag_trigram(sample))
Beispiel #8
0
 def test_pos_perceptron_tagger_old_english(self):
     """Old English Perceptron tagging should yield a non-empty result."""
     sample = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
     self.assertTrue(POSTag('old_english').tag_perceptron(sample))
Beispiel #9
0
 def test_pos_ngram12_tagger_middle_low_german(self):
     """Middle Low German 1/2-gram backoff tagging should yield a result."""
     sample = 'Jck Johannes preister verwarer vnde voirs tender des Juncfrouwen kloisters to Mariendale'
     self.assertTrue(POSTag('middle_low_german').tag_ngram_12_backoff(sample))
Beispiel #10
0
 def test_pos_ngram123_tagger_old_english(self):
     """Old English 1/2/3-gram backoff tagging should yield a result."""
     sample = "Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon."
     self.assertTrue(POSTag("ang").tag_ngram_123_backoff(sample))
Beispiel #11
0
 def test_middle_high_german_trigram_pos_tagger(self):
     """Trigram tagger output for MHG should match the expected sequence."""
     expected = [
         ('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'),
         ('alten', 'ADJA'), ('mæren', 'NA'), ('wunders', 'NA'),
         ('vil', None), ('geseit', None),
     ]
     observed = POSTag("middle_high_german").tag_trigram(
         "uns ist in alten mæren wunders vil geseit")
     self.assertEqual(expected, observed)
Beispiel #12
0
 def tag(self, mode='123'):
     """POS-tag ``self.data`` with CLTK's ``POSTag``.

     :param mode: tagging backend, '123' (1/2/3-gram backoff) or 'tnt'
         (TnT statistical tagger); case-insensitive
     :return: list of (word, pos-tag) 2-tuples
     :raises ValueError: if *mode* is not a recognized backend
     """
     mode = mode.lower()
     # Validate before constructing the tagger so a bad mode fails fast.
     if mode not in ('123', 'tnt'):
         raise ValueError('Invalid part of speech tagging mode specified.')
     tagger = POSTag(self.language)
     if mode == '123':
         return tagger.tag_ngram_123_backoff(self.data)
     return tagger.tag_tnt(self.data)
Beispiel #13
0
    def _get_pos_tags(self, tokens):
        """Iterate through list of tokens and use POS tagger to build
        a corresponding list of tags.

        :param tokens: List of tokens to be POS-tagged
        :return: List with POS-tag for each token
        """
        # TODO: move the import (and tagger construction) up with other imports?
        from cltk.tag.pos import POSTag
        tagger = POSTag('latin')
        joined = " ".join(tokens)
        tagged = tagger.tag_ngram_123_backoff(joined)
        # Keep only the first tag character, lowercased; pass falsy tags
        # (e.g. None) through unchanged.
        return [pair[1][0].lower() if pair[1] else pair[1] for pair in tagged]
Beispiel #14
0
    def _get_pos_tags(self, tokens):
        """Iterate through list of tokens and use POS tagger to build
        a corresponding list of tags.

        :param tokens: List of tokens to be POS-tagged
        :return: List with POS-tag for each token
        """
        # TODO: import (and define tagger) with other imports?
        from cltk.tag.pos import POSTag
        text = " ".join(tokens)
        tagged = POSTag('latin').tag_ngram_123_backoff(text)
        result = []
        for pair in tagged:
            # First tag character lowercased; falsy tags (None) unchanged.
            result.append(pair[1][0].lower() if pair[1] else pair[1])
        return result
Beispiel #15
0
 def test_pos_tnt_middle_high_german(self):
     """TnT tagging of Middle High German should match the expected tags."""
     expected = [
         ("uns", "PPER"),
         ("ist", "VAFIN"),
         ("in", "APPR"),
         ("alten", "ADJA"),
         ("mæren", "ADJA"),
         ("wunders", "NA"),
         ("vil", "AVD"),
         ("geseit", "VVPP"),
     ]
     observed = POSTag("gmh").tag_tnt("uns ist in alten mæren wunders vil geseit")
     self.assertEqual(expected, observed)
Beispiel #16
0
    def _retrieve_tag(self, text):
        """Tag text with chosen tagger and clean tags.

        Tag format: [('word', 'tag')]

        :param text: string
        :return: list of tuples, with each tuple containing the word and its pos tag
        :rtype : list
        """
        # tag_ngram_123_backoff data format: Perseus Style
        # (see https://github.com/cltk/latin_treebank_perseus)
        recognized = ('tag_ngram_123_backoff', 'tag_tnt', 'tag_crf')
        if self.tagger in recognized:
            tagged = getattr(POSTag('latin'), self.tagger)(text.lower())
            return [(item[0], item[1]) for item in tagged]
        # Unrecognized tagger names fall through and return None,
        # matching the original if/elif chain.
Beispiel #17
0
def lemmatizeList(list):
    """Lemmatize Greek words, dropping stopwords and numbers.

    :param list: Greek words to lemmatize (parameter name kept for
        backward compatibility even though it shadows the builtin)
    :return: list of lowercased lemmata with stopwords and numbers removed
    """
    # NOTE: the unused POSTag('greek') instantiation was removed; it was
    # never referenced and only cost model-loading time.
    lemmatizer = LemmaReplacer('greek')
    lemmWords = lemmatizer.lemmatize(list)

    # Remove stopwords and numbers, and lowercase all words.
    lemmWords = [w.lower() for w in lemmWords if w not in STOPS_LIST]
    lemmWords = removeNumbers(lemmWords)

    return lemmWords
Beispiel #18
0
    def _retrieve_tag(self, text: str) -> List[Tuple[str, str]]:
        """Tag text with chosen tagger and clean tags.

        Tag format: ``[('word', 'tag')]``

        :param text: string

        :return: list of tuples, with each tuple containing the word and its pos tag

        """
        # tag_ngram_123_backoff data format: Perseus Style
        # (see https://github.com/cltk/latin_treebank_perseus)
        for name in ("tag_ngram_123_backoff", "tag_tnt", "tag_crf"):
            if self.tagger == name:
                tagged = getattr(POSTag("lat"), name)(text.lower())
                return [(pair[0], pair[1]) for pair in tagged]
        # Unrecognized tagger names return None, as in the original chain.
Beispiel #19
0
    def post(self):
        """Parse request arguments, POS-tag the input string, and return
        the tags as a JSON-serializable dict."""
        self.reqparse = reqparse.RequestParser()
        self.reqparse.add_argument('string', required=True)
        self.reqparse.add_argument('lang', required=True, choices=POS_METHODS.keys())
        self.reqparse.add_argument('method', required=False,
                                   default=DEFAULT_POS_METHOD)

        args = self.reqparse.parse_args()
        string = args['string']
        lang = args['lang']
        method = args['method']

        if method not in POS_METHODS[lang]:
            return {'message': {'method': method + ' is not a valid choice'}}

        tagger = POSTag(lang)
        # Map method names to tagger callables instead of an if/elif chain.
        dispatch = {
            'unigram': tagger.tag_unigram,
            'bigram': tagger.tag_bigram,
            'trigram': tagger.tag_trigram,
            'ngram123': tagger.tag_ngram_123_backoff,
            'tnt': tagger.tag_tnt,
        }
        tagged = dispatch[method](string) if method in dispatch else []

        def entry(word, tag):
            # Stringify missing tags so the payload stays uniform.
            return {'word': word, 'tag': tag} if tag is not None \
                else {'word': word, 'tag': 'None'}

        return {'tags': [entry(word, tag) for word, tag in tagged]}
    def lemmatizeList(self, lines):
        """Normalize, lemmatize, and filter a Greek text string.

        :param lines: raw Greek text
        :return: space-joined string of lowercased lemmata, stopwords removed
        """
        from cltk.corpus.utils.formatter import cltk_normalize

        # NOTE: an unused POSTag('greek') instantiation and dead
        # commented-out debug code were removed.
        lemmatizer = LemmaReplacer('greek')

        # Normalization can help when using certain texts (per the CLTK docs).
        lines = cltk_normalize(lines)
        lines = lemmatizer.lemmatize(lines)

        # Remove stopwords and lowercase all words.
        lines = [w.lower() for w in lines if w not in STOPS_LIST]

        return ' '.join(lines)
Beispiel #21
0
    def post(self):
        """Parse request arguments, POS-tag the input string with the
        requested method, and return the tags as a JSON payload."""
        self.reqparse = reqparse.RequestParser()
        self.reqparse.add_argument('string', required=True)
        self.reqparse.add_argument('lang',
                                   required=True,
                                   choices=POS_METHODS.keys())
        self.reqparse.add_argument('method',
                                   required=False,
                                   default=DEFAULT_POS_METHOD)

        args = self.reqparse.parse_args()
        string = args['string']
        lang = args['lang']
        method = args['method']

        if method not in POS_METHODS[lang]:
            return {'message': {'method': method + ' is not a valid choice'}}

        tagger = POSTag(lang)
        # (name, callable) pairs replace the original if/elif chain.
        backends = (
            ('unigram', tagger.tag_unigram),
            ('bigram', tagger.tag_bigram),
            ('trigram', tagger.tag_trigram),
            ('ngram123', tagger.tag_ngram_123_backoff),
            ('tnt', tagger.tag_tnt),
        )
        tagged = []
        for name, tag_fn in backends:
            if method == name:
                tagged = tag_fn(string)
                break

        payload = []
        for word, tag in tagged:
            # Stringify missing tags so every entry has the same shape.
            payload.append({'word': word,
                            'tag': tag if tag is not None else 'None'})
        return {'tags': payload}
Beispiel #22
0
import re
import timeit
from cltk.tag.pos import POSTag

# Initialize the Latin POS tagger once.
tagger = POSTag('latin')

# Get the raw text to POS tag; ``with`` closes the file promptly.
with open('../ov_met_1_raw.txt') as infile:
    raw = infile.read()

# Parse every token and write xml as text strings.
# Using ``with`` fixes the original's leaked output handle, which was
# never closed or flushed.
with open('../ov_met_1_xml.txt', 'w+') as outfile:
    for (x, y) in tagger.tag_ngram_123_backoff(raw):
        outfile.write("<token postag='" + str(y) + "' cite=''>" + str(x) + '</token>')
Beispiel #23
0
 def getPos(self):
   """Return unigram POS tags for this instance's Latin text."""
   return POSTag('latin').tag_unigram(self.text)
# Latin lemmatizer: load pickled training sentences from cltk_data when
# available; fall back to an empty training set otherwise.
# NOTE(review): relies on ``path`` and ``file`` defined earlier in the
# file — confirm they point at the lemmatized-sentences pickle.
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
la_lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

# Greek lemmatizer: import the Greek models corpus, then build the replacer.
grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')

# Initialize POS taggers once outside of the loop,
# then select based on language inside the loop -- get_words_from_file()
tagLat = POSTag('latin')
tagGrk = POSTag('greek')


def lemmatize(word_list, copy):
	for word in word_list:
		if copy:
			word.lemmatization = word.text
			return
		if word.language in LATIN_CODES:
			word.lemmatization = \
			    remove_digits(la_lemmatizer.lemmatize([word.text])[0][1])
				# [] are needed to turn the string into a list.
				# Otherwise the lemmatizer splits up individual characters and produces garbage.
				# The [0][1] is needed because this new lemmatizer outputs a tuple
		elif word.language in GREEK_CODES:
Beispiel #25
0
 def test_pos_bigram_latin(self):
     """Latin bigram tagging should yield a non-empty result."""
     self.assertTrue(POSTag('latin').tag_bigram('Gallia est omnis divisa in partes tres'))
Beispiel #26
0
 def test_pos_ngram123_tagger_old_english(self):
     """Old English 1/2/3-gram backoff tagging should yield a result."""
     sample = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
     self.assertTrue(POSTag('old_english').tag_ngram_123_backoff(sample))
Beispiel #27
0
 def test_pos_crf_tagger_latin(self):
     """Latin CRF tagging should yield a non-empty result."""
     self.assertTrue(POSTag('latin').tag_crf('Gallia est omnis divisa in partes tres'))
Beispiel #28
0
def get_words_from_file(path, file_dict, new_system):
    """Parse a TEI epigraphy XML file and return its words.

    :param path: path to the TEI XML file to read
    :param file_dict: dict mapping file paths to ``iip_file`` records;
        updated in place when the file declares a <region> element
    :param new_system: truthy selects element-based extraction plus POS
        tagging (CLTK CRF for Latin/Greek, NLTK for translations);
        falsy uses the legacy ``add_element_to_word_list`` path
    :return: list of ``iip_word_occurrence`` objects with empty words removed
    """
    with open(path, "r") as path_file:
        # "..." marks gaps in the source text; treat them as spaces.
        file_string = path_file.read().replace("...", " ").encode("utf-8")
    root = etree.fromstring(file_string)
    words = []
    nsmap = {'tei': "http://www.tei-c.org/ns/1.0"}
    bodies = root.findall('.//' + TEI_NS + 'body')
    textLang = root.find('.//' + TEI_NS + 'textLang')
    textRegion = root.find('.//' + TEI_NS + 'region')
    if textRegion != None:
        file_dict[path] = iip_file(path, textRegion.text)
    mainLang = ""
    if (textLang != None):
        mainLang = textLang.attrib['mainLang']
    # Process both the edition and the translation divisions.
    for edition in (
            root.findall(".//tei:div[@type='edition']", namespaces=nsmap) +
            root.findall(".//tei:div[@type='translation']", namespaces=nsmap)):
        if mainLang.strip() == "":
            mainLang = "unk"
        edition_type = ""
        if 'subtype' in edition.attrib:
            edition_type = edition.attrib['subtype']
        if edition.attrib["type"] == "translation":
            edition_type = "translation"
            mainLang += "-transl"
        new_words = []
        if new_system:
            retrieved_words = get_words_from_element(edition)
            # Join the word texts into one string so the tagger sees context.
            combined_words = ""
            for e in retrieved_words:
                combined_words += e.text + " "
            tagged_words = None
            if mainLang in LATIN_CODES:
                tagger = POSTag('latin')
                tagged_words = tagger.tag_crf(combined_words)
            elif mainLang in GREEK_CODES:
                tagger = POSTag('greek')
                tagged_words = tagger.tag_crf(combined_words)
            # Translations are tagged with NLTK's English tagger instead.
            if "-transl" in mainLang:
                tagged_words = nltk.pos_tag(nltk.word_tokenize(combined_words))
            for e in retrieved_words:
                # NOTE(review): textRegion.text raises AttributeError if the
                # file has no <region> element — confirm inputs always have one.
                new_words.append(
                    iip_word_occurrence(edition_type, mainLang, e.text, path,
                                        textRegion.text,
                                        e.surrounding_elements))
                new_words[-1].internal_elements = e.internal_elements
                new_words[-1].alternatives = e.alternatives
                new_words[-1].preceding = e.preceding
                new_words[-1].following = e.following
                if tagged_words != None:
                    # Assign a standardized POS by matching on surface form;
                    # the last matching tag wins.
                    for tagged_word in tagged_words:
                        if tagged_word[0] == e.text:
                            new_words[-1].pos = standardize_pos(tagged_word[1])
            #endloop
        else:
            new_words = [
                iip_word_occurrence(edition_type, mainLang, "", path,
                                    textRegion.text, [])
            ]
            add_element_to_word_list(edition, new_words, edition, mainLang,
                                     path, textRegion.text, [])
        words += new_words
        #endif
    #endloop
    # Strip ignorable patterns, then drop words that end up empty and
    # default missing languages to "unk".
    null_words = []
    for word in words:
        word.text = str(word.text)
        for pattern in IGNORE:
            word.text = word.text.replace(pattern, "")
        if (word.text.strip() == ""):
            null_words.append(word)
        if word.language.strip() == "":
            word.language = "unk"
    words = [x for x in words if x not in null_words]
    return words
Beispiel #29
0
 def test_pos_tnt_tagger_old_norse(self):
     """Old Norse TnT tagging should yield a non-empty result."""
     result = POSTag('old_norse').tag_tnt('Hlióðs bið ek allar.')
     print(result)
     self.assertTrue(result)
Beispiel #30
0
 def test_pos_crf_tagger_latin(self):
     """CRF tagging of Latin should produce a truthy result."""
     observed = POSTag('latin').tag_crf('Gallia est omnis divisa in partes tres')
     self.assertTrue(observed)
Beispiel #31
0
 def test_pos_ngram123_tagger_latin(self):
     """Latin 1/2/3-gram backoff tagging should yield a result."""
     sample = 'Gallia est omnis divisa in partes tres'
     self.assertTrue(POSTag('latin').tag_ngram_123_backoff(sample))
Beispiel #32
0


# Import modules

# For XML
from xml.dom.minidom import parse, parseString
import codecs
# For CLTK
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.tag.pos import POSTag

# Initialize CLTK: Latin lemmatizer, POS tagger, and i/j-u/v normalizer.
lemmatizer = LemmaReplacer('latin')
tagger = POSTag('latin')
j = JVReplacer()

# Parse XML and collect every <w> (word) element for processing below.

xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')

for w in wordElementList:
        form = w.attributes['ana'].value
        print(form)
        # Parse the inflected word
        try:
            lemmaList = lemmatizer.lemmatize(form.lower())
            lemma = lemmaList[0].replace('v', 'u')
Beispiel #33
0
 def test_pos_tnt_tagger_old_norse(self):
     """TnT tagging of Old Norse should produce a truthy result."""
     tagged = POSTag('old_norse').tag_tnt('Hlióðs bið ek allar.')
     print(tagged)
     self.assertTrue(tagged)
Beispiel #34
0
 def test_pos_unigram_latin(self):
     """Latin unigram tagging should yield a non-empty result."""
     self.assertTrue(POSTag('latin').tag_unigram('Gallia est omnis divisa in partes tres'))
Beispiel #35
0
 def test_pos_tnt_tagger_greek(self):
     """Greek TnT tagging should yield a non-empty result."""
     sample = 'θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος'
     self.assertTrue(POSTag('greek').tag_tnt(sample))
Beispiel #36
0
def tag(form):
    """POS-tag *form* with the CLTK Latin CRF tagger.

    :param form: text string to tag
    :return: JSON string encoding a list of (word, tag) pairs
    """
    tagger = POSTag("latin")
    # Renamed local: the original shadowed the built-in ``list``.
    tagged = tagger.tag_crf(form)
    return json.dumps(tagged)
Beispiel #37
0
# script to figure out frequency of each type of subjunctive

from cltk.tag.pos import POSTag
from cltk.tokenize.word import WordTokenizer
from os import listdir

# Latin tagger and tokenizer, shared by all files processed below.
tagger = POSTag('latin')
wt = WordTokenizer('latin')

# Metamorphoses text files, sorted so books are processed in order.
filelist = sorted([f for f in listdir('./ovid_metamorphoses') if f.endswith('txt')])

# Running totals and collected example lines per subjunctive tense.
present, imperfect, perfect, pluperfect = 0, 0, 0, 0
pres_ex, impf_ex, perf_ex, plup_ex = [], [], [], []

def count_subj(filename):

    global present, imperfect, perfect, pluperfect
    global pres_ex, impf_ex, perf_ex, plup_ex

    infile = open(filename)
    raw = infile.read()
    infile.close()

    tokenized = wt.tokenize(raw)
    tokenized = [t for t in tokenized if not None]

    for t in tokenized:

        tagged = tagger.tag_crf(t)

        if len(tagged) > 1 or len(tagged) == 0 or tagged[0][1] == None:
Beispiel #38
0
THIS ONE READS COLUMN FORMAT. THIS PROBABLY AFFECTS THE TAGGER (CONTEXT).

(venv) durian:lemmatiser_new pberck
python3 lemmatiser_cltk2.py  -f hdt_Books_forFrog.col.nutt
'''

debug = False  # flip to True to enable DBG() output on stderr
def DBG(*strs):
    """Write the given values to stderr when ``debug`` is enabled.

    Fix: the original ``"".join(str(strs))`` stringified the whole
    argument tuple (printing e.g. "('a', 1)"); join each element instead.
    """
    if debug:
        sys.stderr.write("DBG:" + "".join(str(s) for s in strs) + "\n")

# Load the CLTK Greek lemmatizer and POS tagger; exit cleanly if the
# toolkit (or its models) cannot be loaded.
try:
    from cltk.stem.lemma import LemmaReplacer
    cltk_lemmatiser = LemmaReplacer('greek')
    from cltk.tag.pos import POSTag
    cltk_tagger = POSTag('greek')
except Exception:
    # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
    # are no longer swallowed; still covers ImportError and model errors.
    print(" No CLTK toolkit found." )
    sys.exit(1)

print( "CLTK ok." )

# Command-line parsing: -f <filename> selects the input file.
filename = None

try:
    opts, args = getopt.getopt(sys.argv[1:], "f:", [])
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(1)
for o, a in opts:
    # NOTE(review): ("-f") is a plain string, not a 1-tuple; the ``in``
    # test is substring matching here but still matches "-f" correctly.
    if o in ("-f"):
Beispiel #39
0
 def test_pos_ngram12_tagger_middle_low_german(self):
     """Middle Low German 1/2-gram backoff tagging should succeed."""
     sample = 'Jck Johannes preister verwarer vnde voirs tender des Juncfrouwen kloisters to Mariendale'
     self.assertTrue(POSTag('middle_low_german').tag_ngram_12_backoff(sample))
Beispiel #40
0
 def test_pos_tnt_tagger_latin(self):
     """Latin TnT tagging should yield a non-empty result."""
     self.assertTrue(POSTag("lat").tag_tnt("Gallia est omnis divisa in partes tres"))
Beispiel #41
0
 def test_pos_trigram_old_english(self):
     """Trigram tagging of Old English should produce a truthy result."""
     sample = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
     self.assertTrue(POSTag('old_english').tag_trigram(sample))
Beispiel #42
0
 def test_pos_ngram123_tagger_greek(self):
     """Greek 1/2/3-gram backoff tagging should produce a truthy result."""
     sample = 'θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος'
     self.assertTrue(POSTag('greek').tag_ngram_123_backoff(sample))
Beispiel #43
0
 def test_pos_perceptron_tagger_old_english(self):
     """Perceptron tagging of Old English should produce a truthy result."""
     sample = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
     self.assertTrue(POSTag('old_english').tag_perceptron(sample))
Beispiel #44
0
 def test_pos_tnt_tagger_greek(self):
     """TnT tagging of Greek should produce a truthy result."""
     observed = POSTag('greek').tag_tnt('θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος')
     self.assertTrue(observed)