def test_pos_ngram123_tagger_old_english(self):
    """Test tagging Old English POS with a 1-, 2-, and 3-gram backoff tagger."""
    text = (
        "Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, "
        "hu ða æþelingas ellen fremedon."
    )
    backoff_tagger = POSTag("ang")
    # Any non-empty tagging result satisfies the smoke test.
    self.assertTrue(backoff_tagger.tag_ngram_123_backoff(text))
def test_pos_ngram123_tagger_greek(self):
    """Test tagging Greek POS with a 1-, 2-, and 3-gram backoff tagger."""
    text = "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"
    backoff_tagger = POSTag("grc")
    # Any non-empty tagging result satisfies the smoke test.
    self.assertTrue(backoff_tagger.tag_ngram_123_backoff(text))
def tag(self, mode='123'):
    """Gives words marked up with parts-of-speech.

    Overrides the cltk POS tagger and uses cltk's instead. Has different
    methods for providing a POS tagger, if desired.

    Args:
        mode (:obj:`str`) Tagging mode, either '123', or 'tnt'
            (case-insensitive).

    Returns:
        :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech

    Raises:
        ValueError: If ``mode`` is neither '123' nor 'tnt'.

    Example:
        >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.')
        >>> print(text.tag())
        [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')]
    """ # noqa
    mode = mode.lower()
    # Validate before the (heavy) cltk import so bad input fails fast.
    if mode not in ('123', 'tnt'):
        raise ValueError(
            'Invalid part of speech tagging mode specified.'
        )
    from cltk.tag.pos import POSTag
    tagger = POSTag(self.options['language'])
    if mode == '123':
        return tagger.tag_ngram_123_backoff(self.data)
    return tagger.tag_tnt(self.data)
def test_pos_ngram123_tagger_latin(self):
    """Test tagging Latin POS with a 1-, 2-, and 3-gram backoff tagger."""
    backoff_tagger = POSTag("lat")
    # Any non-empty tagging result satisfies the smoke test.
    self.assertTrue(
        backoff_tagger.tag_ngram_123_backoff("Gallia est omnis divisa in partes tres")
    )
def post(self):
    """POS-tag ``string`` in language ``lang`` with the requested ``method``.

    Expects form/JSON arguments ``string`` (required), ``lang`` (required,
    one of POS_METHODS' keys) and ``method`` (optional, defaults to
    DEFAULT_POS_METHOD). Returns ``{'tags': [...]}`` on success or an error
    ``message`` payload when the method is unsupported for the language.
    """
    self.reqparse = reqparse.RequestParser()
    self.reqparse.add_argument('string', required=True)
    self.reqparse.add_argument('lang', required=True,
                               choices=POS_METHODS.keys())
    self.reqparse.add_argument('method', required=False,
                               default=DEFAULT_POS_METHOD)
    args = self.reqparse.parse_args()
    string = args['string']
    lang = args['lang']
    method = args['method']
    # Guard: the method must be supported for the requested language.
    if method not in POS_METHODS[lang]:
        return {'message': {'method': method + ' is not a valid choice'}}
    tagger = POSTag(lang)
    # Dispatch table replaces the previous if/elif chain.
    dispatch = {
        'unigram': tagger.tag_unigram,
        'bigram': tagger.tag_bigram,
        'trigram': tagger.tag_trigram,
        'ngram123': tagger.tag_ngram_123_backoff,
        'tnt': tagger.tag_tnt,
    }
    tag_func = dispatch.get(method)
    tagged = tag_func(string) if tag_func is not None else []
    # Serialize None tags as the string 'None', matching the original API.
    return {'tags': [{'word': word, 'tag': tag if tag is not None else 'None'}
                     for word, tag in tagged]}
def tag(self, mode='123'):
    """Return (word, POS-tag) 2-tuples for ``self.data``.

    Args:
        mode (str): Tagging mode, either '123' (1/2/3-gram backoff) or
            'tnt' (case-insensitive).

    Returns:
        list of 2-tuples: (word, part-of-speech) pairs.

    Raises:
        ValueError: If ``mode`` is neither '123' nor 'tnt'.
    """
    mode = mode.lower()
    # Validate before constructing the tagger so bad input fails fast.
    if mode not in ('123', 'tnt'):
        raise ValueError('Invalid part of speech tagging mode specified.')
    tagger = POSTag(self.language)
    if mode == '123':
        return tagger.tag_ngram_123_backoff(self.data)
    return tagger.tag_tnt(self.data)
def _get_pos_tags(self, tokens):
    """Iterate through list of tokens and use POS tagger to build a
    corresponding list of tags.

    :param tokens: List of tokens to be POS-tagged
    :return: List with POS-tag for each token
    """
    # Import (and define tagger) with other imports?
    from cltk.tag.pos import POSTag
    latin_tagger = POSTag('latin')
    # The tagger expects a single string, so rejoin the token list.
    tagged_pairs = latin_tagger.tag_ngram_123_backoff(" ".join(tokens))
    # Reduce each tag to its lower-cased first character; leave
    # falsy tags (e.g. None for unknown words) untouched.
    reduced_tags = []
    for _, full_tag in tagged_pairs:
        reduced_tags.append(full_tag[0].lower() if full_tag else full_tag)
    return reduced_tags
def post(self):
    """POS-tag ``string`` in language ``lang`` with the requested ``method``.

    Expects form/JSON arguments ``string`` (required), ``lang`` (required,
    one of POS_METHODS' keys) and ``method`` (optional, defaults to
    DEFAULT_POS_METHOD). Returns ``{'tags': [...]}`` on success or an error
    ``message`` payload when the method is unsupported for the language.
    """
    self.reqparse = reqparse.RequestParser()
    self.reqparse.add_argument('string', required=True)
    self.reqparse.add_argument('lang', required=True,
                               choices=POS_METHODS.keys())
    self.reqparse.add_argument('method', required=False,
                               default=DEFAULT_POS_METHOD)
    args = self.reqparse.parse_args()
    string = args['string']
    lang = args['lang']
    method = args['method']
    # Guard: the method must be supported for the requested language.
    if method not in POS_METHODS[lang]:
        return {'message': {'method': method + ' is not a valid choice'}}
    tagger = POSTag(lang)
    # Dispatch table replaces the previous if/elif chain.
    dispatch = {
        'unigram': tagger.tag_unigram,
        'bigram': tagger.tag_bigram,
        'trigram': tagger.tag_trigram,
        'ngram123': tagger.tag_ngram_123_backoff,
        'tnt': tagger.tag_tnt,
    }
    tag_func = dispatch.get(method)
    tagged = tag_func(string) if tag_func is not None else []
    # Serialize None tags as the string 'None', matching the original API.
    return {
        'tags': [{'word': word, 'tag': tag if tag is not None else 'None'}
                 for word, tag in tagged]
    }
def test_pos_ngram123_tagger_old_english(self):
    """Test tagging Old English POS with a 1-, 2-, and 3-gram backoff tagger."""
    sample = ('Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, '
              'hu ða æþelingas ellen fremedon.')
    # Any non-empty tagging result satisfies the smoke test.
    self.assertTrue(POSTag('old_english').tag_ngram_123_backoff(sample))
import re
import timeit

from cltk.tag.pos import POSTag

# initialize POS tagger
tagger = POSTag('latin')

# get text to POS tag
with open('../ov_met_1_raw.txt') as f:
    raw = f.read()

# parse every token
# write xml as text strings (i know it's bad, sorry)
# Use a context manager so the output handle is flushed and closed even on
# error (the original opened it with 'w+' and never closed it).
with open('../ov_met_1_xml.txt', 'w') as out:
    for (x, y) in tagger.tag_ngram_123_backoff(raw):
        out.write("<token postag='" + str(y) + "' cite=''>" + str(x) + '</token>')
def test_pos_ngram123_tagger_greek(self):
    """Test tagging Greek POS with a 1-, 2-, and 3-gram backoff tagger."""
    sample = 'θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος'
    # Any non-empty tagging result satisfies the smoke test.
    self.assertTrue(POSTag('greek').tag_ngram_123_backoff(sample))
def test_pos_ngram123_tagger_latin(self):
    """Test tagging Latin POS with a 1-, 2-, and 3-gram backoff tagger."""
    sample = 'Gallia est omnis divisa in partes tres'
    # Any non-empty tagging result satisfies the smoke test.
    self.assertTrue(POSTag('latin').tag_ngram_123_backoff(sample))
# Demo script: POS-tag the opening of Vergil's Aeneid with CLTK's Latin taggers.
from cltk.tag.pos import POSTag

aen = """arma virumque cano, Troiae qui primus ab oris
Italiam, fato profugus, Laviniaque venit
litora, multum ille et terris iactatus et alto
vi superum saevae memorem Iunonis ob iram;
multa quoque et bello passus, dum conderet urbem, 5
inferretque deos Latio, genus unde Latinum,
Albanique patres, atque altae moenia Romae."""

#remove line breaks
aen = aen.replace('\n', ' ')

tagger = POSTag('latin')

# 1-, 2-, 3-gram backoff tagger over the whole excerpt.
aen_tagged = tagger.tag_ngram_123_backoff(aen)
print(aen_tagged)

# CRF tagger on a short Caesar sentence for comparison.
cae_tagged = tagger.tag_crf('Gallia est omnis divisa in partes tres')
print(cae_tagged)
else: assert False, "unhandled option" if not filename: sys.exit(0) outfilename = filename + ".CLTK-wlt.txt" lc = 0 with open(filename, 'r') as f: with open(outfilename, 'w') as of: for l in f: l = l.strip() ln = normalize('NFC', l) lemmas = cltk_lemmatiser.lemmatize(ln) tags = cltk_tagger.tag_ngram_123_backoff(ln) words = ln.split() ''' print( "words" ) for w in words: print( w ) print( "lemmas" ) for l in lemmas: print( l ) print( "tags" ) for t in tags: print( t ) ''' assert len(words) == len(lemmas) == len( tags), "Truncated output? %r %r %r %r" % ( len(words), len(lemmas), len(tags), lc)
language="greek") philippians_reader._fileids = [ 'new-testament__letter-to-the-philippians__grc.json' ] # print(list(perseus_reader.sents())) sentences = list(philippians_reader.sents()) sentence = cltk_normalize(sentences[0]) lemmatizer = LemmaReplacer('greek') word_list = lemmatizer.lemmatize(sentence) tagger = POSTag('greek') parts_of_speech = tagger.tag_ngram_123_backoff(sentence) # This is not a great lemmatizer standard_list = lemmatizer.lemmatize(list(philippians_reader.words()), return_raw=True) lemmatizer2 = BackoffGreekLemmatizer() # this one seems better backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words())) # Find most names names_in_first_sentence = ner.tag_ner('greek', input_text=sentence, output_type=list)