Esempio n. 1
0
    def post(self):
        """Tag the submitted ``string`` and return one entry per tagged token.

        Request arguments: ``string`` (required), ``lang`` (required, must be
        one of the keys of ``POS_METHODS``) and ``method`` (optional, defaults
        to ``DEFAULT_POS_METHOD``).

        Returns ``{'tags': [{'word': ..., 'tag': ...}, ...]}`` where a missing
        tag is reported as the string ``'None'``. When ``method`` is not listed
        in ``POS_METHODS[lang]`` a ``{'message': {'method': ...}}`` dict is
        returned instead.
        """
        # NOTE(review): the error dict below is returned without an explicit
        # HTTP status code -- confirm whether a 4xx status is wanted here.
        parser = reqparse.RequestParser()
        self.reqparse = parser
        parser.add_argument('string', required=True)
        parser.add_argument('lang', required=True, choices=POS_METHODS.keys())
        parser.add_argument('method', required=False,
                            default=DEFAULT_POS_METHOD)

        parsed = parser.parse_args()
        text, lang, method = parsed['string'], parsed['lang'], parsed['method']

        if method not in POS_METHODS[lang]:
            return {'message': {'method': method + ' is not a valid choice'}}

        # Request method name -> name of the POSTag method implementing it.
        tagger_method_names = {
            'unigram': 'tag_unigram',
            'bigram': 'tag_bigram',
            'trigram': 'tag_trigram',
            'ngram123': 'tag_ngram_123_backoff',
            'tnt': 'tag_tnt',
        }
        pos_tagger = POSTag(lang)
        attr_name = tagger_method_names.get(method)
        # A method accepted above but unknown here yields no tags at all.
        pairs = getattr(pos_tagger, attr_name)(text) if attr_name is not None else []

        return {'tags': [{'word': word, 'tag': 'None' if tag is None else tag}
                         for word, tag in pairs]}
Esempio n. 2
0
    def tag(self, mode='123'):
        """Gives words marked up with parts-of-speech.

        Tagging is delegated to cltk's ``POSTag`` for the language stored in
        ``self.options['language']``, applied to ``self.data``. Two tagging
        back ends are available, selected with ``mode``.

        Args:
            mode (:obj:`str`) Tagging mode, either '123' (n-gram 1-2-3 backoff
                tagger) or 'tnt' (TnT tagger). Matched case-insensitively.

        Returns:
            :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech

        Raises:
            ValueError: If ``mode`` is neither '123' nor 'tnt'.

        Example:
            >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.')
            >>> print(text.tag())
            [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')]
        """ # noqa
        mode = mode.lower()
        if mode not in ('123', 'tnt'):
            # Reject a bad mode before importing cltk or building a tagger.
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(
                'Invalid part of speech tagging mode specified.'
            )

        from cltk.tag.pos import POSTag
        tagger = POSTag(self.options['language'])
        if mode == '123':
            return tagger.tag_ngram_123_backoff(self.data)
        return tagger.tag_tnt(self.data)
Esempio n. 3
0
def POSTagger(wordList):
    """Tag every word with the Greek TnT tagger and return the selected ones.

    Each item of ``wordList`` is tagged on its own with
    ``POSTag('greek').tag_tnt`` and the tagger output is printed. From the
    resulting (token, tag) pairs, tokens whose tag starts with ``"N"``,
    ``"V"`` or ``"Unk"`` are collected (presumably nouns, verbs and unknown
    words -- confirm against the tag set in use).

    Args:
        wordList: Iterable of strings to tag.

    Returns:
        list: The selected tokens, in input order.
    """
    if opts.Progress:
        print('Going for the POSTagger.')

    tagger = POSTag('greek')
    wanted_prefixes = ("N", "V", "Unk")

    # First pass: tag every word and echo the tagger output.
    listWithTags = []
    for word in wordList:
        taggedItem = tagger.tag_tnt(word)
        listWithTags.append(taggedItem)
        print(taggedItem)

    # Second pass: keep only tokens whose tag has one of the wanted prefixes.
    listWithSelected = []
    for entry in listWithTags:
        for token, tag in entry:
            if tag is None:
                # An untagged token ends processing of this entry; any pairs
                # after it in the same entry are not examined.
                break
            if tag.startswith(wanted_prefixes):
                listWithSelected.append(token)

    return listWithSelected
Esempio n. 4
0
 def test_pos_tnt_tagger_greek(self):
     """The TnT tagger gives a non-empty result for a Greek sentence."""
     sentence = "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"
     result = POSTag("grc").tag_tnt(sentence)
     self.assertTrue(result)
Esempio n. 5
0
 def test_middle_high_german_tnt_pos_tagger(self):
     """TnT tagging of a Middle High German line yields the expected pairs."""
     sentence = "uns ist in alten mæren wunders vil geseit"
     expected_tags = ['PPER', 'VAFIN', 'APPR', 'ADJA',
                      'ADJA', 'NA', 'AVD', 'VVPP']
     # One expected tag per whitespace-separated word of the sentence.
     expected = list(zip(sentence.split(), expected_tags))
     actual = POSTag("middle_high_german").tag_tnt(sentence)
     self.assertEqual(expected, actual)
Esempio n. 6
0
 def tag(self, mode='123'):
     """Return the POS-tagged form of ``self.data``.

     Tagging is done with ``POSTag`` for ``self.language``.

     Args:
         mode: Tagging mode, '123' (n-gram 1-2-3 backoff tagger) or 'tnt'
             (TnT tagger). Matched case-insensitively.

     Returns:
         Whatever the selected ``POSTag`` method returns for ``self.data``.

     Raises:
         ValueError: If ``mode`` is neither '123' nor 'tnt'.
     """
     mode = mode.lower()
     if mode not in ('123', 'tnt'):
         # Reject a bad mode before building a tagger. ValueError is still
         # an Exception, so existing handlers keep working.
         raise ValueError('Invalid part of speech tagging mode specified.')

     tagger = POSTag(self.language)
     if mode == '123':
         return tagger.tag_ngram_123_backoff(self.data)
     return tagger.tag_tnt(self.data)
Esempio n. 7
0
 def test_pos_tnt_middle_high_german(self):
     """The TnT tagger reproduces the expected Middle High German tags."""
     sentence = "uns ist in alten mæren wunders vil geseit"
     expected = [
         ("uns", "PPER"), ("ist", "VAFIN"), ("in", "APPR"), ("alten", "ADJA"),
         ("mæren", "ADJA"), ("wunders", "NA"), ("vil", "AVD"), ("geseit", "VVPP"),
     ]
     actual = POSTag("gmh").tag_tnt(sentence)
     self.assertEqual(expected, actual)
Esempio n. 8
0
    def post(self):
        """Run the requested POS tagger over the posted text.

        Parsed request arguments are ``string`` (required), ``lang``
        (required, restricted to the keys of ``POS_METHODS``) and ``method``
        (optional, ``DEFAULT_POS_METHOD`` when omitted).

        The response is ``{'tags': [...]}`` with one ``{'word', 'tag'}`` dict
        per tagged token; a tag of ``None`` is rendered as the string
        ``'None'``. If ``method`` is not among ``POS_METHODS[lang]`` the
        response is a ``{'message': {'method': ...}}`` dict instead.
        """
        # NOTE(review): the validation failure below carries no explicit HTTP
        # status code -- confirm whether a 4xx status is wanted.
        self.reqparse = reqparse.RequestParser()
        argument_specs = (
            ('string', {'required': True}),
            ('lang', {'required': True, 'choices': POS_METHODS.keys()}),
            ('method', {'required': False, 'default': DEFAULT_POS_METHOD}),
        )
        for arg_name, options in argument_specs:
            self.reqparse.add_argument(arg_name, **options)

        args = self.reqparse.parse_args()
        text = args['string']
        lang = args['lang']
        method = args['method']

        if method not in POS_METHODS[lang]:
            return {'message': {'method': method + ' is not a valid choice'}}

        tagger = POSTag(lang)
        # Request method name -> name of the POSTag method implementing it.
        method_attr = {
            'unigram': 'tag_unigram',
            'bigram': 'tag_bigram',
            'trigram': 'tag_trigram',
            'ngram123': 'tag_ngram_123_backoff',
            'tnt': 'tag_tnt',
        }.get(method)

        # A method accepted above but unknown here produces an empty result.
        tagged = []
        if method_attr is not None:
            tagged = getattr(tagger, method_attr)(text)

        entries = []
        for word, tag in tagged:
            entries.append({'word': word, 'tag': 'None' if tag is None else tag})
        return {'tags': entries}
Esempio n. 9
0
 def test_pos_tnt_tagger_old_norse(self):
     """The TnT tagger gives a non-empty result for an Old Norse sentence."""
     sentence = 'Hlióðs bið ek allar.'
     result = POSTag('old_norse').tag_tnt(sentence)
     print(result)
     self.assertTrue(result)
Esempio n. 10
0
 def test_pos_tnt_tagger_latin(self):
     """The TnT tagger gives a non-empty result for a Latin sentence."""
     sentence = 'Gallia est omnis divisa in partes tres'
     result = POSTag('latin').tag_tnt(sentence)
     self.assertTrue(result)
Esempio n. 11
0
    else:
        assert False, "unhandled option"

# Nothing to do without an input file.
if not filename:
    sys.exit(0)

# The output goes next to the input, with a fixed suffix appended.
outfilename = filename + ".CLTK-wlt.txt"
lc = 0  # number of lines written to the output file

# The data is Unicode (Greek) text, so the encoding is pinned to UTF-8 rather
# than left to the platform default.
with open(filename, 'r', encoding='utf-8') as f, \
        open(outfilename, 'w', encoding='utf-8') as of:
    for line in f:
        # Only lines with exactly three whitespace-separated fields are used;
        # just the first field (the word form) is needed below.
        bits = line.split()
        if len(bits) != 3:
            continue
        w = normalize('NFC', bits[0])

        lemma = cltk_lemmatiser.lemmatize(w)[0]
        # Only the first (token, tag) pair of the tagger output is used.
        # (tag_ngram_123_backoff was previously tried here instead of TnT.)
        tag = cltk_tagger.tag_tnt(w)[0]
        # tags are all caps
        # καὶ [('καὶ', 'C--------')]
        # δι’ [('δι', None), ('’', '---------')]

        # Drop everything from the first '#' in the lemma onwards.
        lemma = lemma.split('#', 1)[0]

        print( w, "\t", lemma, "\t", tag[1], file=of )
        lc += 1
Esempio n. 12
0
 def test_pos_tnt_tagger_greek(self):
     """The TnT tagger gives a non-empty result for a Greek sentence."""
     sentence = 'θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος'
     result = POSTag('greek').tag_tnt(sentence)
     self.assertTrue(result)
Esempio n. 13
0
 def test_pos_tnt_tagger_old_norse(self):
     """Old Norse input tagged with TnT must produce a truthy result."""
     old_norse_tagger = POSTag('old_norse')
     output = old_norse_tagger.tag_tnt('Hlióðs bið ek allar.')
     print(output)
     self.assertTrue(output)
Esempio n. 14
0
 def test_pos_tnt_tagger_latin(self):
     """Latin input tagged with TnT must produce a truthy result."""
     latin_tagger = POSTag('latin')
     output = latin_tagger.tag_tnt('Gallia est omnis divisa in partes tres')
     self.assertTrue(output)
Esempio n. 15
0
j = JVReplacer()

# Parse XML

xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml')
#xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml')
wordElementList = xmldoc.getElementsByTagName('w')

# For every <w> element: read the word form from its 'ana' attribute, then
# store the form in 'n', the lemma in 'lemma', and overwrite 'ana' with the
# POS tag. Any error raised while processing a word propagates unchanged.
for w in wordElementList:
    form = w.attributes['ana'].value
    print(form)
    # Parse the inflected word
    lowered = form.lower()
    lemmaList = lemmatizer.lemmatize(lowered)
    # Only the first lemma is kept; every 'v' in it is replaced by 'u'.
    lemma = lemmaList[0].replace('v', 'u')
    # The tagger receives the lowercased form after JVReplacer.replace.
    posList = tagger.tag_tnt(j.replace(lowered))
    # Tag of the first (token, tag) pair returned by the tagger.
    pos = posList[0][1]
    w.setAttribute('n', form)
    w.setAttribute('lemma', lemma)
    w.setAttribute('ana', pos)

# An earlier variant opened 'output.xml' in text mode inside a `with` block
# and called xmldoc.writexml(f, encoding="utf-8") on the wrapped stream.

# Open the output file in binary mode and wrap it in the UTF-8 StreamWriter
# (item 3 of the tuple returned by codecs.lookup), so text written to `f` is
# encoded as UTF-8.
# NOTE(review): nothing visible here writes to or closes `f` -- presumably
# that happens further down; confirm the file is closed.
f = open('output.xml', 'wb')
f = codecs.lookup("utf-8")[3](f)
Esempio n. 16
0
 def test_pos_tnt_tagger_old_norse(self):
     """The TnT tagger gives a non-empty result for an Old Norse sentence."""
     sentence = "Hlióðs bið ek allar."
     result = POSTag("non").tag_tnt(sentence)
     self.assertTrue(result)
Esempio n. 17
0
    from cltk.tag.pos import POSTag
    lemmatizer = LemmaReplacer('greek')
    tagger = POSTag('greek')
else:
    import spacy
    nlp = spacy.load("en_core_web_trf")  # English

# Accumulators for the pass over the input sentences.
# NOTE(review): neither is updated in the part of the loop visible here --
# presumably they are filled further down; confirm.
result = []
count = 0
for sentence in data:
    sentence = re.sub(
        r"[\.\?·;]\s*$", '', sentence
    )  # remove sentence-ending punctuation; all other punctuation has already been removed
    if language == 'grc':
        # Lemmatize and POS-tag the same sentence independently.
        lemmas = lemmatizer.lemmatize(sentence)
        tagged = tagger.tag_tnt(sentence)
        # Swap each pair's two items (tag first, word second -- assuming tag_tnt
        # yields (word, tag) pairs) and drop the pairs that cltk_ignored rejects.
        tagged = [[w[1], w[0]] for w in tagged if not cltk_ignored(w[1], w[0])]
        a = []
        i = 0
        # Pair lemmas and tagged tokens by position: `i` indexes both lists and
        # the loop stops early when `tagged` is the shorter one.
        # NOTE(review): `tagged` was filtered above while `lemmas` was not, so
        # positions may not line up; the length check below reports such cases.
        for w in lemmas:
            if i >= len(tagged):
                break
            pos = tagged[i][0]
            a.append([
                tagged[i][1], lemmas[i],
                cltk_pos_code_to_pos(pos), f"cltk:{pos}"
            ])  # original, lemma, part of speech, cltk part of speech
            i = i + 1
        # Diagnostic dump when lemmatizer and tagger disagree on token count.
        if len(lemmas) != len(tagged):
            print(lemmas, "\n", tagged, "\n", len(lemmas), len(tagged), "\n",
                  a)