Ejemplo n.º 1
0
 def test_pos_ngram123_tagger_old_english(self):
     """The 1-, 2-, 3-gram backoff tagger returns a truthy result for Old English."""
     sample = (
         "Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon."
     )  # pylint: disable=line-too-long
     result = POSTag("ang").tag_ngram_123_backoff(sample)
     # Only a non-empty (truthy) result is required; individual tags are not checked.
     self.assertTrue(result)
Ejemplo n.º 2
0
 def test_pos_ngram123_tagger_greek(self):
     """The 1-, 2-, 3-gram backoff tagger returns a truthy result for Greek."""
     sample = (
         "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"
     )  # pylint: disable=line-too-long
     result = POSTag("grc").tag_ngram_123_backoff(sample)
     # Only a non-empty (truthy) result is required; individual tags are not checked.
     self.assertTrue(result)
Ejemplo n.º 3
0
    def tag(self, mode='123'):
        """Gives words marked up with parts-of-speech.

        Delegates to cltk's ``POSTag`` for the language found in
        ``self.options['language']`` and runs it over ``self.data``. Two
        tagging methods are available, selected with ``mode``.

        Args:
            mode (:obj:`str`) Tagging mode, either '123' (cltk's
                ``tag_ngram_123_backoff``) or 'tnt' (cltk's ``tag_tnt``).
                Compared case-insensitively.

        Returns:
            :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech

        Raises:
            ValueError: If ``mode`` is neither '123' nor 'tnt'.

        Example:
            >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.')
            >>> print(text.tag())
            [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')]
        """ # noqa
        mode = mode.lower()
        # Validate the argument first so a bad mode fails fast, before cltk
        # is imported and a tagger is constructed. ValueError is still an
        # Exception, so callers catching Exception keep working.
        if mode not in ('123', 'tnt'):
            raise ValueError(
                'Invalid part of speech tagging mode specified.'
            )
        from cltk.tag.pos import POSTag
        tagger = POSTag(self.options['language'])
        if mode == '123':
            return tagger.tag_ngram_123_backoff(self.data)
        return tagger.tag_tnt(self.data)
Ejemplo n.º 4
0
 def test_pos_ngram123_tagger_latin(self):
     """The 1-, 2-, 3-gram backoff tagger returns a truthy result for Latin."""
     sample = (
         "Gallia est omnis divisa in partes tres"
     )  # pylint: disable=line-too-long
     result = POSTag("lat").tag_ngram_123_backoff(sample)
     # Only a non-empty (truthy) result is required; individual tags are not checked.
     self.assertTrue(result)
Ejemplo n.º 5
0
    def post(self):
        """Tag the posted ``string`` with the requested POS tagging method.

        Request arguments: ``string`` (required), ``lang`` (required, one of
        the keys of ``POS_METHODS``) and ``method`` (optional, defaults to
        ``DEFAULT_POS_METHOD``).

        Returns a ``{'message': {'method': ...}}`` dict when ``method`` is
        not listed for ``lang`` in ``POS_METHODS``; otherwise
        ``{'tags': [{'word': ..., 'tag': ...}, ...]}``, where a missing tag
        is reported as the string ``'None'``.
        """
        self.reqparse = reqparse.RequestParser()
        parser = self.reqparse
        parser.add_argument('string', required=True)
        parser.add_argument('lang', required=True, choices=POS_METHODS.keys())
        parser.add_argument('method', required=False,
                            default=DEFAULT_POS_METHOD)

        args = parser.parse_args()
        string, lang, method = args['string'], args['lang'], args['method']

        if method not in POS_METHODS[lang]:
            return {'message': {'method': method + ' is not a valid choice'}}

        tagger = POSTag(lang)
        # Map each API method name to the tagger method implementing it.
        # The attribute is looked up lazily, only for the chosen method.
        tagger_method_names = {
            'unigram': 'tag_unigram',
            'bigram': 'tag_bigram',
            'trigram': 'tag_trigram',
            'ngram123': 'tag_ngram_123_backoff',
            'tnt': 'tag_tnt',
        }
        chosen = tagger_method_names.get(method)
        # A method allowed by POS_METHODS but unknown here yields no tags.
        tagged = [] if chosen is None else getattr(tagger, chosen)(string)

        return {'tags': [{'word': word, 'tag': 'None' if tag is None else tag}
                         for word, tag in tagged]}
Ejemplo n.º 6
0
 def tag(self, mode='123'):
     """Return ``self.data`` tagged with parts of speech.

     Builds a ``POSTag`` for ``self.language`` and applies the tagging
     method selected by ``mode``.

     Args:
         mode: Tagging mode, either '123' (``tag_ngram_123_backoff``) or
             'tnt' (``tag_tnt``). Compared case-insensitively.

     Returns:
         Whatever the selected ``POSTag`` method returns for ``self.data``.

     Raises:
         ValueError: If ``mode`` is neither '123' nor 'tnt'.
     """
     mode = mode.lower()
     # Validate first so a bad mode fails fast, before a tagger is built.
     # ValueError is still an Exception, so callers catching Exception
     # keep working.
     if mode not in ('123', 'tnt'):
         raise ValueError('Invalid part of speech tagging mode specified.')
     tagger = POSTag(self.language)
     if mode == '123':
         return tagger.tag_ngram_123_backoff(self.data)
     return tagger.tag_tnt(self.data)
Ejemplo n.º 7
0
    def _get_pos_tags(self, tokens):
        """Tag the tokens with the Latin POS tagger and return short tag codes.

        The tokens are joined with single spaces and passed as one string to
        the 1-, 2-, 3-gram backoff tagger.

        :param tokens: List of tokens to be POS-tagged
        :return: List holding, for each item the tagger returns, the
            lower-cased first character of its tag, or the falsy tag value
            itself (e.g. ``None``) when the item received no tag
        """
        # Import (and define tagger) with other imports?
        from cltk.tag.pos import POSTag
        latin_tagger = POSTag('latin')
        sentence = " ".join(tokens)
        codes = []
        for entry in latin_tagger.tag_ngram_123_backoff(sentence):
            pos = entry[1]
            # Keep only the leading character of the tag string.
            codes.append(pos[0].lower() if pos else pos)
        return codes
Ejemplo n.º 8
0
    def _get_pos_tags(self, tokens):
        """Run the Latin POS tagger over the tokens and reduce each tag to one letter.

        The tokens are joined with single spaces and passed as one string to
        the 1-, 2-, 3-gram backoff tagger.

        :param tokens: List of tokens to be POS-tagged
        :return: List holding, for each item the tagger returns, the
            lower-cased first character of its tag, or the falsy tag value
            itself (e.g. ``None``) when the item received no tag
        """
        # Import (and define tagger) with other imports?
        from cltk.tag.pos import POSTag
        latin_tagger = POSTag('latin')
        joined = " ".join(tokens)
        tagged = latin_tagger.tag_ngram_123_backoff(joined)

        def short_code(pos):
            # Untagged items (falsy tag) are passed through unchanged.
            return pos[0].lower() if pos else pos

        return [short_code(item[1]) for item in tagged]
Ejemplo n.º 9
0
    def post(self):
        """Handle a POS tagging request.

        Reads ``string`` (required), ``lang`` (required, restricted to the
        keys of ``POS_METHODS``) and ``method`` (optional, defaulting to
        ``DEFAULT_POS_METHOD``) from the request.

        Returns a ``{'message': {'method': ...}}`` dict when ``method`` is
        not listed for ``lang`` in ``POS_METHODS``; otherwise
        ``{'tags': [...]}`` with one ``{'word', 'tag'}`` dict per tagged
        item, a missing tag being reported as the string ``'None'``.
        """
        self.reqparse = reqparse.RequestParser()
        parser = self.reqparse
        parser.add_argument('string', required=True)
        parser.add_argument('lang',
                            required=True,
                            choices=POS_METHODS.keys())
        parser.add_argument('method',
                            required=False,
                            default=DEFAULT_POS_METHOD)

        args = parser.parse_args()
        string = args['string']
        lang = args['lang']
        method = args['method']

        if method not in POS_METHODS[lang]:
            return {'message': {'method': method + ' is not a valid choice'}}

        tagger = POSTag(lang)
        # (API method name, tagger attribute) pairs; the attribute is only
        # resolved for the entry that matches the request.
        dispatch = (
            ('unigram', 'tag_unigram'),
            ('bigram', 'tag_bigram'),
            ('trigram', 'tag_trigram'),
            ('ngram123', 'tag_ngram_123_backoff'),
            ('tnt', 'tag_tnt'),
        )
        tagged = []
        for api_name, attr_name in dispatch:
            if method == api_name:
                tagged = getattr(tagger, attr_name)(string)
                break

        tags = []
        for word, tag in tagged:
            tags.append({
                'word': word,
                'tag': tag if tag is not None else 'None'
            })
        return {'tags': tags}
Ejemplo n.º 10
0
 def test_pos_ngram123_tagger_old_english(self):
     """The 1-, 2-, 3-gram backoff tagger returns a truthy result for Old English."""
     sample = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'  # pylint: disable=line-too-long
     result = POSTag('old_english').tag_ngram_123_backoff(sample)
     # Only a non-empty (truthy) result is required; individual tags are not checked.
     self.assertTrue(result)
Ejemplo n.º 11
0
import re
import timeit
from cltk.tag.pos import POSTag

# Initialize the Latin POS tagger.
tagger = POSTag('latin')

# Read the raw text to be tagged.
with open('../ov_met_1_raw.txt') as f:
    raw = f.read()

# Tag every token and write each one as a <token> element.
# NOTE: the markup is built by plain string concatenation, so the word and
# tag are not XML-escaped.
# The output file is managed by `with` so it is flushed and closed even if
# tagging or writing raises part-way through.
with open('../ov_met_1_xml.txt', 'w+') as f:
    for (x, y) in tagger.tag_ngram_123_backoff(raw):
        f.write("<token postag='" + str(y) + "' cite=''>" + str(x) + '</token>')
Ejemplo n.º 12
0
 def test_pos_ngram123_tagger_greek(self):
     """The 1-, 2-, 3-gram backoff tagger returns a truthy result for Greek."""
     sample = 'θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος'  # pylint: disable=line-too-long
     result = POSTag('greek').tag_ngram_123_backoff(sample)
     # Only a non-empty (truthy) result is required; individual tags are not checked.
     self.assertTrue(result)
Ejemplo n.º 13
0
 def test_pos_ngram123_tagger_latin(self):
     """The 1-, 2-, 3-gram backoff tagger returns a truthy result for Latin."""
     sample = 'Gallia est omnis divisa in partes tres'  # pylint: disable=line-too-long
     result = POSTag('latin').tag_ngram_123_backoff(sample)
     # Only a non-empty (truthy) result is required; individual tags are not checked.
     self.assertTrue(result)
Ejemplo n.º 14
0
from cltk.tag.pos import POSTag

# Sample Latin verse used as tagger input. The literal holds only the text
# itself: edition line numbers must not appear inside it, because the tagger
# would treat them as tokens.
aen = """arma virumque cano, Troiae qui primus ab oris
Italiam, fato profugus, Laviniaque venit
litora, multum ille et terris iactatus et alto
vi superum saevae memorem Iunonis ob iram;
multa quoque et bello passus, dum conderet urbem,
inferretque deos Latio, genus unde Latinum,
Albanique patres, atque altae moenia Romae."""

# Remove line breaks so the passage is tagged as one running string.
aen = aen.replace('\n', ' ')

# Tag the verse with the 1-, 2-, 3-gram backoff tagger.
tagger = POSTag('latin')
aen_tagged = tagger.tag_ngram_123_backoff(aen)
print(aen_tagged)

# Tag a short prose sentence with the CRF tagger.
cae_tagged = tagger.tag_crf('Gallia est omnis divisa in partes tres')
print(cae_tagged)
Ejemplo n.º 15
0
    else:
        assert False, "unhandled option"

# Nothing to process without an input file name: exit with status 0.
if not filename:
    sys.exit(0)

# The output path is the input path with a fixed suffix appended.
outfilename = filename + ".CLTK-wlt.txt"
# Counter reported in the assertion message below; presumably a line count.
# NOTE(review): it is never updated in the code visible here — confirm.
lc = 0

with open(filename, 'r') as f:
    with open(outfilename, 'w') as of:
        for l in f:
            # Strip surrounding whitespace, then normalise to Unicode NFC
            # (assumes `normalize` is unicodedata.normalize — TODO confirm).
            l = l.strip()
            ln = normalize('NFC', l)
            # Lemmatise and POS-tag the same normalised line, and split it
            # on whitespace for comparison.
            lemmas = cltk_lemmatiser.lemmatize(ln)
            tags = cltk_tagger.tag_ngram_123_backoff(ln)
            words = ln.split()
            # The string literal below is disabled debugging output; it is
            # evaluated as an expression statement and has no effect.
            '''
            print( "words" )
            for w in words:
                print( w )
            print( "lemmas" )
            for l in lemmas:
                print( l )
            print( "tags" )
            for t in tags:
                print( t )
            '''
            # The three sequences must line up one-to-one; a mismatch means
            # the lemmatiser or tagger tokenised the line differently from
            # the whitespace split. NOTE: assert is skipped under `python -O`.
            assert len(words) == len(lemmas) == len(
                tags), "Truncated output? %r %r %r %r" % (
                    len(words), len(lemmas), len(tags), lc)
Ejemplo n.º 16
0
                                       language="greek")

# Restrict the reader to a single file by overwriting its (private) file id
# list. This must happen before any sents()/words() call below.
philippians_reader._fileids = [
    'new-testament__letter-to-the-philippians__grc.json'
]

# print(list(perseus_reader.sents()))

# Take the first sentence from the reader, normalise it and lemmatise it.
sentences = list(philippians_reader.sents())
sentence = cltk_normalize(sentences[0])
lemmatizer = LemmaReplacer('greek')
word_list = lemmatizer.lemmatize(sentence)

# POS-tag the same sentence with the 1-, 2-, 3-gram backoff tagger.
tagger = POSTag('greek')

parts_of_speech = tagger.tag_ngram_123_backoff(sentence)

# This is not a great lemmatizer
# (run here over every word from the reader, with return_raw=True).
standard_list = lemmatizer.lemmatize(list(philippians_reader.words()),
                                     return_raw=True)

lemmatizer2 = BackoffGreekLemmatizer()

# this one seems better
backoff_list = lemmatizer2.lemmatize(list(philippians_reader.words()))

# Find most names
# (named-entity tagging of the first sentence, requested as a list).
names_in_first_sentence = ner.tag_ner('greek',
                                      input_text=sentence,
                                      output_type=list)