def post(self):
    """Tag the posted string with the requested part-of-speech method.

    Request arguments (parsed with ``reqparse``):
        string: text to tag (required).
        lang: language key; must be one of the keys of ``POS_METHODS``
            (required).
        method: tagging method; falls back to ``DEFAULT_POS_METHOD``.

    Returns:
        ``{'tags': [{'word': ..., 'tag': ...}, ...]}`` where a missing tag
        (``None``) is reported as the string ``'None'``; or a
        ``{'message': {'method': ...}}`` dict when ``method`` is not listed
        in ``POS_METHODS[lang]``.
    """
    parser = reqparse.RequestParser()
    self.reqparse = parser
    parser.add_argument('string', required=True)
    parser.add_argument('lang', required=True, choices=POS_METHODS.keys())
    parser.add_argument('method', required=False, default=DEFAULT_POS_METHOD)
    args = parser.parse_args()

    string = args['string']
    lang = args['lang']
    method = args['method']

    if method not in POS_METHODS[lang]:
        # NOTE(review): this error dict is returned without an explicit
        # status code -- confirm whether a 4xx status was intended.
        return {'message': {'method': method + ' is not a valid choice'}}

    # Request value -> name of the tagger method that implements it.
    tagger_method_names = {
        'unigram': 'tag_unigram',
        'bigram': 'tag_bigram',
        'trigram': 'tag_trigram',
        'ngram123': 'tag_ngram_123_backoff',
        'tnt': 'tag_tnt',
    }

    tagger = POSTag(lang)
    attr_name = tagger_method_names.get(method)
    # A method that is allowed for the language but unknown here yields no tags.
    tagged = getattr(tagger, attr_name)(string) if attr_name is not None else []

    return {
        'tags': [
            {'word': word, 'tag': 'None' if tag is None else tag}
            for word, tag in tagged
        ]
    }
def tag(self, mode='123'):
    """Gives words marked up with parts-of-speech.

    Tags ``self.data`` with CLTK's ``POSTag`` tagger, built for the language
    in ``self.options['language']``. Has different methods for providing a
    POS tagger, if desired.

    The mode is validated before cltk is imported and the tagger is built,
    so an invalid mode fails immediately.

    Args:
        mode (:obj:`str`) Tagging mode, either '123', or 'tnt'
            (compared case-insensitively)

    Returns:
        :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech

    Raises:
        ValueError: If ``mode`` is neither '123' nor 'tnt'.

    Example:
        >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.')
        >>> print(text.tag())
        [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')]
    """  # noqa
    mode = mode.lower()
    if mode not in ('123', 'tnt'):
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(
            'Invalid part of speech tagging mode specified.'
        )

    # Imported here (not at module level) as in the original code; done only
    # after validation so a bad mode never pays for building the tagger.
    from cltk.tag.pos import POSTag
    tagger = POSTag(self.options['language'])

    if mode == '123':
        return tagger.tag_ngram_123_backoff(self.data)
    return tagger.tag_tnt(self.data)
def POSTagger(wordList):
    """Return the words whose TnT tag starts with "N", "V" or "Unk".

    Every item of ``wordList`` is tagged on its own with the Greek
    ``POSTag`` TnT tagger, and each tagging result is printed. The selected
    prefixes presumably stand for nouns, verbs and unknown words -- confirm
    against the tag set in use.

    Args:
        wordList: iterable of strings; each one is passed to ``tag_tnt``.

    Returns:
        list of the words (as returned by the tagger) whose tag matched.
    """
    if opts.Progress:
        print('Going for the POSTagger.')

    tagger = POSTag('greek')
    wantedPrefixes = ("N", "V", "Unk")

    # Create a list first with all the words with tag
    listWithTags = []
    for word in wordList:
        taggedItem = tagger.tag_tnt(word)
        listWithTags.append(taggedItem)
        print(taggedItem)

    # Select from this list only the words you want
    listWithSelected = []
    for entry in listWithTags:
        for word, tag in entry:
            if tag is None:
                # An untagged token ends the scan of this entry; the
                # remaining tokens of the entry are not considered.
                break
            if tag.startswith(wantedPrefixes):
                listWithSelected.append(word)
    return listWithSelected
def test_pos_tnt_tagger_greek(self):
    """The TnT tagger gives a truthy result for a Greek sentence."""
    sentence = "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"  # pylint: disable=line-too-long
    result = POSTag("grc").tag_tnt(sentence)
    self.assertTrue(result)
def test_middle_high_german_tnt_pos_tagger(self):
    """The TnT tagger reproduces the expected tags for a Middle High German line."""
    words = ['uns', 'ist', 'in', 'alten', 'mæren', 'wunders', 'vil', 'geseit']
    tags = ['PPER', 'VAFIN', 'APPR', 'ADJA', 'ADJA', 'NA', 'AVD', 'VVPP']
    # Pair every word with its expected tag: [('uns', 'PPER'), ...]
    expected = list(zip(words, tags))

    tagger = POSTag("middle_high_german")
    actual = tagger.tag_tnt("uns ist in alten mæren wunders vil geseit")

    self.assertEqual(expected, actual)
def tag(self, mode='123'):
    """Return ``self.data`` marked up with parts-of-speech.

    The mode is validated before the ``POSTag`` tagger is built, so an
    invalid mode fails immediately without constructing a tagger.

    Args:
        mode (str): Tagging mode, either '123' (n-gram 1-2-3 backoff
            tagger) or 'tnt' (TnT tagger); compared case-insensitively.

    Returns:
        Whatever the selected ``POSTag`` method returns for ``self.data``.

    Raises:
        ValueError: If ``mode`` is neither '123' nor 'tnt'.
    """
    mode = mode.lower()
    if mode not in ('123', 'tnt'):
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('Invalid part of speech tagging mode specified.')

    tagger = POSTag(self.language)
    if mode == '123':
        return tagger.tag_ngram_123_backoff(self.data)
    return tagger.tag_tnt(self.data)
def test_pos_tnt_middle_high_german(self):
    """TnT tagging of a Middle High German line yields the expected pairs."""
    sentence = "uns ist in alten mæren wunders vil geseit"
    expected_tags = ["PPER", "VAFIN", "APPR", "ADJA", "ADJA", "NA", "AVD", "VVPP"]
    # One expected tag per whitespace-separated word of the sentence.
    expected = list(zip(sentence.split(), expected_tags))

    actual = POSTag("gmh").tag_tnt(sentence)

    self.assertEqual(expected, actual)
def post(self):
    """Run the requested part-of-speech tagger over the posted string.

    Request arguments (parsed with ``reqparse``):
        string: text to tag (required).
        lang: language key; restricted to the keys of ``POS_METHODS``
            (required).
        method: tagging method; defaults to ``DEFAULT_POS_METHOD``.

    Returns:
        A dict with a ``'tags'`` list of ``{'word': ..., 'tag': ...}``
        entries, where a ``None`` tag is rendered as the string ``'None'``.
        If ``method`` is not listed in ``POS_METHODS[lang]``, a
        ``{'message': {'method': ...}}`` dict is returned instead.
    """
    self.reqparse = reqparse.RequestParser()
    argument_specs = (
        ('string', {'required': True}),
        ('lang', {'required': True, 'choices': POS_METHODS.keys()}),
        ('method', {'required': False, 'default': DEFAULT_POS_METHOD}),
    )
    for arg_name, arg_options in argument_specs:
        self.reqparse.add_argument(arg_name, **arg_options)
    args = self.reqparse.parse_args()

    string, lang, method = args['string'], args['lang'], args['method']

    if method not in POS_METHODS[lang]:
        # NOTE(review): returned without an explicit status code -- confirm
        # whether a 4xx status was intended for this error.
        return {'message': {'method': method + ' is not a valid choice'}}

    tagger = POSTag(lang)

    # Request value -> name of the tagger method implementing it. The
    # attribute is resolved only for the chosen method.
    method_names = {
        'unigram': 'tag_unigram',
        'bigram': 'tag_bigram',
        'trigram': 'tag_trigram',
        'ngram123': 'tag_ngram_123_backoff',
        'tnt': 'tag_tnt',
    }
    tagged = []
    if method in method_names:
        tagged = getattr(tagger, method_names[method])(string)

    tags = []
    for word, tag in tagged:
        tags.append({'word': word, 'tag': 'None' if tag is None else tag})
    return {'tags': tags}
def test_pos_tnt_tagger_old_norse(self):
    """The TnT tagger gives a truthy result for an Old Norse sentence."""
    sentence = 'Hlióðs bið ek allar.'
    result = POSTag('old_norse').tag_tnt(sentence)
    # The tagging result is echoed to stdout before being checked.
    print(result)
    self.assertTrue(result)
def test_pos_tnt_tagger_latin(self):
    """The TnT tagger gives a truthy result for a Latin sentence."""
    sentence = 'Gallia est omnis divisa in partes tres'
    result = POSTag('latin').tag_tnt(sentence)
    self.assertTrue(result)
else: assert False, "unhandled option" if not filename: sys.exit(0) outfilename = filename + ".CLTK-wlt.txt" lc = 0 with open(filename, 'r') as f: with open(outfilename, 'w') as of: for l in f: l = l.strip() bits = l.split() if len(bits) != 3: continue w = normalize('NFC', bits[0]) l = normalize('NFC', bits[1]) t = bits[2] lemma = cltk_lemmatiser.lemmatize( w )[0] #tag = cltk_tagger.tag_ngram_123_backoff( w )[0] tag = cltk_tagger.tag_tnt( w )[0] # tags are all caps # καὶ [('καὶ', 'C--------')] # δι’ [('δι', None), ('’', '---------')] if '#' in lemma: hidx = lemma.find('#') lemma = lemma[0:hidx] print( w, "\t", lemma, "\t", tag[1], file=of ) lc += 1
def test_pos_tnt_tagger_greek(self):
    """The TnT tagger gives a truthy result for a Greek sentence."""
    sentence = 'θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος'  # pylint: disable=line-too-long
    result = POSTag('greek').tag_tnt(sentence)
    self.assertTrue(result)
j = JVReplacer() # Parse XML xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml') #xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml') wordElementList = xmldoc.getElementsByTagName('w') for w in wordElementList: form = w.attributes['ana'].value print(form) # Parse the inflected word try: lemmaList = lemmatizer.lemmatize(form.lower()) lemma = lemmaList[0].replace('v', 'u') posList = tagger.tag_tnt(j.replace(form.lower())) pos = posList[0][1] w.setAttribute('n', form) w.setAttribute('lemma', lemma) w.setAttribute('ana', pos) except: raise """ with open('output.xml', 'w') as f: f = codecs.lookup("utf-8")[3](f) xmldoc.writexml(f, encoding="utf-8") """ f = open('output.xml', 'wb') f = codecs.lookup("utf-8")[3](f)
def test_pos_tnt_tagger_old_norse(self):
    """The TnT tagger gives a truthy result for an Old Norse sentence."""
    sentence = "Hlióðs bið ek allar."
    result = POSTag("non").tag_tnt(sentence)
    self.assertTrue(result)
from cltk.tag.pos import POSTag lemmatizer = LemmaReplacer('greek') tagger = POSTag('greek') else: import spacy nlp = spacy.load("en_core_web_trf") # English result = [] count = 0 for sentence in data: sentence = re.sub( r"[\.\?·;]\s*$", '', sentence ) # remove sentence-ending punctuation; all other punctuation has already been removed if language == 'grc': lemmas = lemmatizer.lemmatize(sentence) tagged = tagger.tag_tnt(sentence) tagged = [[w[1], w[0]] for w in tagged if not cltk_ignored(w[1], w[0])] a = [] i = 0 for w in lemmas: if i >= len(tagged): break pos = tagged[i][0] a.append([ tagged[i][1], lemmas[i], cltk_pos_code_to_pos(pos), f"cltk:{pos}" ]) # original, lemma, part of speech, cltk part of speech i = i + 1 if len(lemmas) != len(tagged): print(lemmas, "\n", tagged, "\n", len(lemmas), len(tagged), "\n", a)