def test_pos_crf_tagger_old_english(self):
    """CRF POS tagging of an Old English sentence should return a non-empty result."""
    ang_tagger = POSTag("ang")
    result = ang_tagger.tag_crf(
        "Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon."
    )
    self.assertTrue(result)
def POSTagger(wordList):
    """Tag each word with the Greek TnT tagger and keep nouns, verbs and unknowns.

    :param wordList: iterable of Greek word strings, tagged one at a time
    :return: list of words whose tag starts with "N", "V" or "Unk"
    """
    if opts.Progress:
        print('Going for the POSTagger.')
    tagger = POSTag('greek')
    listWithTags = []
    listWithSelected = []
    # Create a list first with all the words with tag
    # (tag_tnt returns a list of (word, tag) tuples per input).
    for word in wordList:
        taggedItem = tagger.tag_tnt(word)
        listWithTags.append(taggedItem)
        print(taggedItem)
    # Select from this list only the words you want; an untagged word
    # stops processing of the current entry, as in the original logic.
    for entry in listWithTags:
        for word, tag in entry:
            if tag is None:  # fixed: was `tag == None`; identity test is correct for None
                break
            # The three former elif branches all appended the word; a tuple
            # prefix test expresses that without duplication.
            if tag.startswith(('N', 'V', 'Unk')):
                listWithSelected.append(word)
    return listWithSelected
def test_pos_ngram123_tagger_latin(self):
    """The 1/2/3-gram backoff tagger must produce output for a Latin sentence."""
    lat_tagger = POSTag("lat")
    result = lat_tagger.tag_ngram_123_backoff(
        "Gallia est omnis divisa in partes tres"
    )  # pylint: disable=line-too-long
    self.assertTrue(result)
def test_pos_ngram123_tagger_greek(self):
    """The 1/2/3-gram backoff tagger must produce output for a Greek sentence."""
    grc_tagger = POSTag("grc")
    result = grc_tagger.tag_ngram_123_backoff(
        "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"
    )  # pylint: disable=line-too-long
    self.assertTrue(result)
def test_pos_trigram_greek(self):
    """Trigram tagging of a Greek sentence should yield a non-empty result."""
    grc_tagger = POSTag("grc")
    result = grc_tagger.tag_trigram(
        "θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος"
    )  # pylint: disable=line-too-long
    self.assertTrue(result)
def tag(self, mode='123'):
    """Gives words marked up with parts-of-speech.

    Override's the cltk POS tagger and uses cltk's instead. Has different
    methods for providing a POS tagger, if desired.

    Args:
        mode (:obj:`str`) Tagging mode, either '123', or 'tnt'

    Returns:
        :obj:`list` of :obj:`tuple` 2-tuples with word, part-of-speech

    Example:
        >>> text = AncientGreekText('ἔστι δὲ σύμπαντα ταῦτα τὰ συγγράμματα ἐκείνῃ μάλιστα οὐκ ὠφέλιμα, ὅτι ὡς πρὸς εἰδότας συγγέγραπται.')
        >>> print(text.tag())
        [('ἔστι', 'V3SPIA---'), ('δὲ', 'G--------'), ('σύμπαντα', None), ('ταῦτα', 'A-P---NA-'), ('τὰ', 'L-P---NA-'), ('συγγράμματα', None), ('ἐκείνῃ', 'A-S---FD-'), ('μάλιστα', 'D--------'), ('οὐκ', 'D--------'), ('ὠφέλιμα', None), (',', 'U--------'), ('ὅτι', 'C--------'), ('ὡς', 'C--------'), ('πρὸς', 'R--------'), ('εἰδότας', 'T-PRPAMA-'), ('συγγέγραπται', None), ('.', '---------')]
    """ # noqa
    from cltk.tag.pos import POSTag
    tagger = POSTag(self.options['language'])
    mode = mode.lower()
    # ValueError is more precise than bare Exception and is still caught
    # by any caller handling Exception, so this is backward-compatible.
    if mode not in ('123', 'tnt'):
        raise ValueError(
            'Invalid part of speech tagging mode specified.'
        )
    if mode == '123':
        return tagger.tag_ngram_123_backoff(self.data)
    return tagger.tag_tnt(self.data)
def test_pos_trigram_old_english(self):
    """Trigram tagging of an Old English sentence should yield output."""
    ang_tagger = POSTag('old_english')
    result = ang_tagger.tag_trigram(
        'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
    )
    self.assertTrue(result)
def test_pos_perceptron_tagger_old_english(self):
    """Perceptron tagging of an Old English sentence should yield output."""
    ang_tagger = POSTag('old_english')
    result = ang_tagger.tag_perceptron(
        'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
    )
    self.assertTrue(result)
def test_pos_ngram12_tagger_middle_low_german(self):
    """The 1/2-gram backoff tagger must produce output for Middle Low German."""
    mlg_tagger = POSTag('middle_low_german')
    result = mlg_tagger.tag_ngram_12_backoff(
        'Jck Johannes preister verwarer vnde voirs tender des Juncfrouwen kloisters to Mariendale'
    )
    self.assertTrue(result)
def test_pos_ngram123_tagger_old_english(self):
    """The 1/2/3-gram backoff tagger must produce output for Old English."""
    ang_tagger = POSTag("ang")
    result = ang_tagger.tag_ngram_123_backoff(
        "Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon."
    )  # pylint: disable=line-too-long
    self.assertTrue(result)
def test_middle_high_german_trigram_pos_tagger(self):
    """Trigram tagging of a Middle High German line must match known tags."""
    expected = [('uns', 'PPER'), ('ist', 'VAFIN'), ('in', 'APPR'),
                ('alten', 'ADJA'), ('mæren', 'NA'), ('wunders', 'NA'),
                ('vil', None), ('geseit', None)]
    mhg_tagger = POSTag("middle_high_german")
    actual = mhg_tagger.tag_trigram(
        "uns ist in alten mæren wunders vil geseit")
    self.assertEqual(expected, actual)
def tag(self, mode='123'):
    """Return (word, POS) 2-tuples for this object's text.

    :param mode: tagging mode, either '123' (n-gram backoff) or 'tnt'
    :return: list of (word, tag) tuples
    :raises ValueError: if *mode* is neither '123' nor 'tnt'
    """
    tagger = POSTag(self.language)
    mode = mode.lower()
    # ValueError is more precise than bare Exception and remains
    # backward-compatible with callers catching Exception.
    if mode not in ('123', 'tnt'):
        raise ValueError('Invalid part of speech tagging mode specified.')
    if mode == '123':
        return tagger.tag_ngram_123_backoff(self.data)
    return tagger.tag_tnt(self.data)
def _get_pos_tags(self, tokens):
    """Build a list of POS tags parallel to the given tokens.

    :param tokens: List of tokens to be POS-tagged
    :return: List with POS-tag for each token (first tag character,
        lowercased; falsy tags are passed through unchanged)
    """
    # Import (and define tagger) with other imports?
    from cltk.tag.pos import POSTag
    backoff_tagger = POSTag('latin')
    joined_text = " ".join(tokens)
    tagged_pairs = backoff_tagger.tag_ngram_123_backoff(joined_text)
    return [pair[1][0].lower() if pair[1] else pair[1] for pair in tagged_pairs]
def test_pos_tnt_middle_high_german(self):
    """TnT tagging of a Middle High German line must match the known tags."""
    expected = [
        ("uns", "PPER"),
        ("ist", "VAFIN"),
        ("in", "APPR"),
        ("alten", "ADJA"),
        ("mæren", "ADJA"),
        ("wunders", "NA"),
        ("vil", "AVD"),
        ("geseit", "VVPP"),
    ]
    gmh_tagger = POSTag("gmh")
    actual = gmh_tagger.tag_tnt("uns ist in alten mæren wunders vil geseit")
    self.assertEqual(expected, actual)
def _retrieve_tag(self, text):
    """Tag text with chosen tagger and clean tags.

    Tag format: [('word', 'tag')]
    :param text: string
    :return: list of tuples, with each tuple containing the word and its
        pos tag; implicitly None for an unrecognized tagger name (as before)
    :rtype : list
    """
    # Data format: Perseus Style (see https://github.com/cltk/latin_treebank_perseus)
    # The three supported taggers share the same call-and-clean shape, so
    # dispatch on the method name instead of duplicating each branch.
    if self.tagger in ('tag_ngram_123_backoff', 'tag_tnt', 'tag_crf'):
        tags = getattr(POSTag('latin'), self.tagger)(text.lower())
        return [(tag[0], tag[1]) for tag in tags]
def lemmatizeList(list):
    """Lemmatize Greek words, dropping stopwords and numbers, lowercasing all.

    :param list: iterable of Greek word strings (parameter name shadows the
        builtin but is kept unchanged for interface compatibility)
    :return: list of cleaned, lowercased lemmata
    """
    # Removed the unused `tagger = POSTag('greek')` — it was never read and
    # constructing a tagger loads models needlessly.
    lemmatizer = LemmaReplacer('greek')
    lemmWords = lemmatizer.lemmatize(list)
    # Remove Stopwords and numbers and lowercases all words.
    lemmWords = [w.lower() for w in lemmWords if w not in STOPS_LIST]
    lemmWords = removeNumbers(lemmWords)
    return lemmWords
def _retrieve_tag(self, text: str) -> List[Tuple[str, str]]:
    """Tag text with chosen tagger and clean tags.

    Tag format: ``[('word', 'tag')]``

    :param text: string
    :return: list of tuples, with each tuple containing the word and its
        pos tag; implicitly None for an unrecognized tagger name (as before)
    """
    # Data format: Perseus Style (see https://github.com/cltk/latin_treebank_perseus)
    # All three supported taggers share the same call-and-clean shape, so
    # dispatch on the method name instead of repeating each branch.
    if self.tagger in ("tag_ngram_123_backoff", "tag_tnt", "tag_crf"):
        tags = getattr(POSTag("lat"), self.tagger)(text.lower())
        return [(tag[0], tag[1]) for tag in tags]
def post(self):
    """Handle a POS-tagging request: validate args, tag, and return tag dicts.

    Returns an error message dict when *method* is not valid for *lang*;
    otherwise a dict with a 'tags' list of {'word', 'tag'} entries
    (None tags are rendered as the string 'None').
    """
    self.reqparse = reqparse.RequestParser()
    self.reqparse.add_argument('string', required=True)
    self.reqparse.add_argument('lang', required=True, choices=POS_METHODS.keys())
    self.reqparse.add_argument('method', required=False, default=DEFAULT_POS_METHOD)
    args = self.reqparse.parse_args()
    string = args['string']
    lang = args['lang']
    method = args['method']
    if method not in POS_METHODS[lang]:
        return {'message': {'method': method + ' is not a valid choice'}}
    tagger = POSTag(lang)
    # A dispatch table replaces the former if/elif chain; an unknown method
    # still yields an empty result, as before.
    methods = {
        'unigram': tagger.tag_unigram,
        'bigram': tagger.tag_bigram,
        'trigram': tagger.tag_trigram,
        'ngram123': tagger.tag_ngram_123_backoff,
        'tnt': tagger.tag_tnt,
    }
    tag_func = methods.get(method)
    tagged = tag_func(string) if tag_func else []
    return {'tags': [{'word': word, 'tag': tag}
                     if tag is not None
                     else {'word': word, 'tag': 'None'}
                     for word, tag in tagged]}
def lemmatizeList(self, lines):
    """Normalize and lemmatize Greek text, dropping stopwords and lowercasing.

    :param lines: raw Greek text string
    :return: single space-joined string of cleaned lemmata
    """
    from cltk.corpus.utils.formatter import cltk_normalize
    # Removed the unused `tagger = POSTag('greek')` — it was never read and
    # constructing a tagger loads models needlessly.
    lemmatizer = LemmaReplacer('greek')
    # can help when using certain texts (doc says it, so i does it)
    lines = cltk_normalize(lines)
    lines = lemmatizer.lemmatize(lines)
    # Remove Stopwords and numbers and lowercases all words.
    lines = [w.lower() for w in lines if w not in STOPS_LIST]
    return ' '.join(lines)
def post(self):
    """Handle a POS-tagging request: validate args, tag, and return tag dicts.

    Returns an error message dict when *method* is not valid for *lang*;
    otherwise a dict with a 'tags' list of {'word', 'tag'} entries
    (None tags are rendered as the string 'None').
    """
    self.reqparse = reqparse.RequestParser()
    self.reqparse.add_argument('string', required=True)
    self.reqparse.add_argument('lang', required=True, choices=POS_METHODS.keys())
    self.reqparse.add_argument('method', required=False, default=DEFAULT_POS_METHOD)
    args = self.reqparse.parse_args()
    string = args['string']
    lang = args['lang']
    method = args['method']
    if method not in POS_METHODS[lang]:
        return {'message': {'method': method + ' is not a valid choice'}}
    tagger = POSTag(lang)
    # A dispatch table replaces the former if/elif chain; an unknown method
    # still yields an empty result, as before.
    methods = {
        'unigram': tagger.tag_unigram,
        'bigram': tagger.tag_bigram,
        'trigram': tagger.tag_trigram,
        'ngram123': tagger.tag_ngram_123_backoff,
        'tnt': tagger.tag_tnt,
    }
    tag_func = methods.get(method)
    tagged = tag_func(string) if tag_func else []
    return {
        'tags': [{
            'word': word,
            'tag': tag
        } if tag is not None else {
            'word': word,
            'tag': 'None'
        } for word, tag in tagged]
    }
import re
import timeit
from cltk.tag.pos import POSTag

# Initialize POS tagger.
tagger = POSTag('latin')

# Get text to POS tag; `with` guarantees the file is closed (the original
# left both files open).
with open('../ov_met_1_raw.txt') as f:
    raw = f.read()

# Parse every token.
# Write xml as text strings (i know it's bad, sorry).
with open('../ov_met_1_xml.txt', 'w+') as out:
    for (x, y) in tagger.tag_ngram_123_backoff(raw):
        out.write("<token postag='" + str(y) + "' cite=''>" + str(x) + '</token>')
def getPos(self):
    """Return unigram POS tags for this object's Latin text."""
    latin_tagger = POSTag('latin')
    return latin_tagger.tag_unigram(self.text)
# Load the pickled Latin training sentences if present; otherwise fall back
# to an empty list (the backoff lemmatizer then has no training data).
latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)
la_lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
# Greek Lemmatizer
grc_corpus_importer = CorpusImporter('greek')
grc_corpus_importer.import_corpus('greek_models_cltk')
grc_lemmatizer = LemmaReplacer('greek')
# Initialize lemmatizers once outside of the loop,
# then select based on langauge inside the loop -- get_words_from_file()
tagLat = POSTag('latin')
tagGrk = POSTag('greek')

def lemmatize(word_list, copy):
    # NOTE(review): when `copy` is truthy, only the FIRST word is copied and
    # the function returns immediately — confirm whether the early return
    # was meant to sit outside the loop.
    for word in word_list:
        if copy:
            word.lemmatization = word.text
            return
        if word.language in LATIN_CODES:
            word.lemmatization = \
                remove_digits(la_lemmatizer.lemmatize([word.text])[0][1])
            # [] are needed to turn the string into a list.
            # Otherwise the lemmatizer splits up individual characters and produces garbage.
            # The [0][1] is needed because this new lemmatizer outputs a tuple
        elif word.language in GREEK_CODES:
def test_pos_bigram_latin(self):
    """Bigram tagging of a Latin sentence should yield a non-empty result."""
    lat_tagger = POSTag('latin')
    result = lat_tagger.tag_bigram('Gallia est omnis divisa in partes tres')
    self.assertTrue(result)
def test_pos_ngram123_tagger_old_english(self):
    """The 1/2/3-gram backoff tagger must produce output for Old English."""
    ang_tagger = POSTag('old_english')
    result = ang_tagger.tag_ngram_123_backoff('Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.')  # pylint: disable=line-too-long
    self.assertTrue(result)
def test_pos_crf_tagger_latin(self):
    """CRF tagging of a Latin sentence should yield a non-empty result."""
    lat_tagger = POSTag('latin')
    result = lat_tagger.tag_crf('Gallia est omnis divisa in partes tres')
    self.assertTrue(result)
def get_words_from_file(path, file_dict, new_system):
    """Extract word occurrences from a TEI XML epigraphy file.

    :param path: filesystem path of the TEI file to read
    :param file_dict: dict mapping paths to iip_file records; this function
        adds an entry for *path* when a <region> element is present
    :param new_system: if truthy, extract words element-by-element and
        POS-tag them; otherwise fall back to add_element_to_word_list
    :return: list of iip_word_occurrence objects with empty-text entries removed
    """
    with open(path, "r") as path_file:
        # "..." markers are replaced with spaces before parsing.
        file_string = path_file.read().replace("...", " ").encode("utf-8")
    root = etree.fromstring(file_string)
    words = []
    nsmap = {'tei': "http://www.tei-c.org/ns/1.0"}
    bodies = root.findall('.//' + TEI_NS + 'body')
    textLang = root.find('.//' + TEI_NS + 'textLang')
    textRegion = root.find('.//' + TEI_NS + 'region')
    if textRegion != None:
        file_dict[path] = iip_file(path, textRegion.text)
    mainLang = ""
    if (textLang != None):
        mainLang = textLang.attrib['mainLang']
    # Process edition divs first, then translation divs.
    for edition in (
            root.findall(".//tei:div[@type='edition']", namespaces=nsmap) +
            root.findall(".//tei:div[@type='translation']", namespaces=nsmap)):
        if mainLang.strip() == "":
            mainLang = "unk"
        edition_type = ""
        if 'subtype' in edition.attrib:
            edition_type = edition.attrib['subtype']
        if edition.attrib["type"] == "translation":
            edition_type = "translation"
            # NOTE(review): mainLang accumulates "-transl" once per
            # translation div across loop iterations — confirm intended.
            mainLang += "-transl"
        new_words = []
        if new_system:
            retrieved_words = get_words_from_element(edition)
            # Join the element texts into one string for the taggers.
            combined_words = ""
            for e in retrieved_words:
                combined_words += e.text + " "
            tagged_words = None
            if mainLang in LATIN_CODES:
                tagger = POSTag('latin')
                tagged_words = tagger.tag_crf(combined_words)
            elif mainLang in GREEK_CODES:
                tagger = POSTag('greek')
                tagged_words = tagger.tag_crf(combined_words)
            # Translations are tagged with NLTK's English tagger instead.
            if "-transl" in mainLang:
                tagged_words = nltk.pos_tag(nltk.word_tokenize(combined_words))
            for e in retrieved_words:
                new_words.append(
                    iip_word_occurrence(edition_type, mainLang, e.text, path,
                                        textRegion.text,
                                        e.surrounding_elements))
                new_words[-1].internal_elements = e.internal_elements
                new_words[-1].alternatives = e.alternatives
                new_words[-1].preceding = e.preceding
                new_words[-1].following = e.following
                # Attach the first tag whose word text matches exactly.
                if tagged_words != None:
                    for tagged_word in tagged_words:
                        if tagged_word[0] == e.text:
                            new_words[-1].pos = standardize_pos(tagged_word[1])
            #endloop
        else:
            # Legacy path: a single placeholder occurrence, populated by
            # add_element_to_word_list.
            new_words = [
                iip_word_occurrence(edition_type, mainLang, "", path,
                                    textRegion.text, [])
            ]
            add_element_to_word_list(edition,
                                     new_words, edition, mainLang, path,
                                     textRegion.text, [])
        words += new_words
        #endif
    #endloop
    # Strip IGNORE patterns; collect now-empty words for removal and default
    # a blank language to "unk".
    null_words = []
    for word in words:
        word.text = str(word.text)
        for pattern in IGNORE:
            word.text = word.text.replace(pattern, "")
        if (word.text.strip() == ""):
            null_words.append(word)
        if word.language.strip() == "":
            word.language = "unk"
    words = [x for x in words if x not in null_words]
    return words
def test_pos_tnt_tagger_old_norse(self):
    """TnT tagging of an Old Norse sentence should yield a non-empty result."""
    non_tagger = POSTag('old_norse')
    result = non_tagger.tag_tnt('Hlióðs bið ek allar.')
    print(result)
    self.assertTrue(result)
def test_pos_ngram123_tagger_latin(self):
    """The 1/2/3-gram backoff tagger must produce output for Latin."""
    lat_tagger = POSTag('latin')
    result = lat_tagger.tag_ngram_123_backoff('Gallia est omnis divisa in partes tres')  # pylint: disable=line-too-long
    self.assertTrue(result)
# Import modules # For XML from xml.dom.minidom import parse, parseString import codecs # For CLTK from cltk.stem.latin.j_v import JVReplacer from cltk.stem.lemma import LemmaReplacer from cltk.tag.pos import POSTag # Initialize CLTK lemmatizer = LemmaReplacer('latin') tagger = POSTag('latin') j = JVReplacer() # Parse XML xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/casanatensis.xml') #xmldoc = parse('/home/ilbuonme/siti/paolo.monella/ursus/shorter_casanatensis.xml') wordElementList = xmldoc.getElementsByTagName('w') for w in wordElementList: form = w.attributes['ana'].value print(form) # Parse the inflected word try: lemmaList = lemmatizer.lemmatize(form.lower()) lemma = lemmaList[0].replace('v', 'u')
def test_pos_unigram_latin(self):
    """Unigram tagging of a Latin sentence should yield a non-empty result."""
    lat_tagger = POSTag('latin')
    result = lat_tagger.tag_unigram('Gallia est omnis divisa in partes tres')
    self.assertTrue(result)
def test_pos_tnt_tagger_greek(self):
    """TnT tagging of a Greek sentence should yield a non-empty result."""
    grc_tagger = POSTag('greek')
    result = grc_tagger.tag_tnt('θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος')  # pylint: disable=line-too-long
    self.assertTrue(result)
def tag(form):
    """CRF-tag a Latin string and return the tagging serialized as JSON.

    :param form: text to tag
    :return: JSON string encoding the list of (word, tag) pairs
    """
    tagger = POSTag("latin")
    # Renamed from `list`, which shadowed the builtin of the same name.
    tagged = tagger.tag_crf(form)
    return json.dumps(tagged)
# script to figure out frequency of each type of subjunctive from cltk.tag.pos import POSTag from cltk.tokenize.word import WordTokenizer from os import listdir tagger = POSTag('latin') wt = WordTokenizer('latin') filelist = sorted([f for f in listdir('./ovid_metamorphoses') if f.endswith('txt')]) present, imperfect, perfect, pluperfect = 0, 0, 0, 0 pres_ex, impf_ex, perf_ex, plup_ex = [], [], [], [] def count_subj(filename): global present, imperfect, perfect, pluperfect global pres_ex, impf_ex, perf_ex, plup_ex infile = open(filename) raw = infile.read() infile.close() tokenized = wt.tokenize(raw) tokenized = [t for t in tokenized if not None] for t in tokenized: tagged = tagger.tag_crf(t) if len(tagged) > 1 or len(tagged) == 0 or tagged[0][1] == None:
THIS ONE READS COLUMN FORMAT. THIS PROBABLY AFFECTS THE TAGGER (CONTEXT).

(venv) durian:lemmatiser_new pberck python3 lemmatiser_cltk2.py -f hdt_Books_forFrog.col.nutt
'''
debug = False

def DBG(*strs):
    # Debug helper: prints to stderr only when the module-level flag is set.
    if debug:
        sys.stderr.write("DBG:"+"".join(str(strs))+"\n")

# Abort early if CLTK is not installed; the whole script depends on it.
# NOTE(review): the bare `except:` also swallows KeyboardInterrupt/SystemExit;
# `except ImportError:` would be safer — confirm before changing.
try:
    from cltk.stem.lemma import LemmaReplacer
    cltk_lemmatiser = LemmaReplacer('greek')
    from cltk.tag.pos import POSTag
    cltk_tagger = POSTag('greek')
except:
    print(" No CLTK toolkit found." )
    sys.exit(1)
print( "CLTK ok." )

filename = None
try:
    opts, args = getopt.getopt(sys.argv[1:], "f:", [])
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(1)
for o, a in opts:
    if o in ("-f"):
def test_pos_ngram12_tagger_middle_low_german(self):
    """The 1/2-gram backoff tagger must produce output for Middle Low German."""
    mlg_tagger = POSTag('middle_low_german')
    result = mlg_tagger.tag_ngram_12_backoff('Jck Johannes preister verwarer vnde voirs tender des Juncfrouwen kloisters to Mariendale')
    self.assertTrue(result)
def test_pos_tnt_tagger_latin(self):
    """TnT tagging of a Latin sentence should yield a non-empty result."""
    lat_tagger = POSTag("lat")
    result = lat_tagger.tag_tnt("Gallia est omnis divisa in partes tres")
    self.assertTrue(result)
def test_pos_trigram_old_english(self):
    """Trigram tagging of an Old English sentence must yield output."""
    oe_tagger = POSTag('old_english')
    tagging = oe_tagger.tag_trigram('Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.')
    self.assertTrue(tagging)
def test_pos_ngram123_tagger_greek(self):
    """The 1/2/3-gram backoff tagger must produce output for Greek."""
    grc_tagger = POSTag('greek')
    result = grc_tagger.tag_ngram_123_backoff('θεοὺς μὲν αἰτῶ τῶνδ᾽ ἀπαλλαγὴν πόνων φρουρᾶς ἐτείας μῆκος')  # pylint: disable=line-too-long
    self.assertTrue(result)
def test_pos_perceptron_tagger_old_english(self):
    """Perceptron tagging of an Old English sentence must yield output."""
    oe_tagger = POSTag('old_english')
    tagging = oe_tagger.tag_perceptron('Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.')
    self.assertTrue(tagging)