def test(self):
    """CGN - Splitting PoS tags into features

    Feeds every non-empty line of the module-level ``CLASSES`` string to
    ``cgn.parse_cgn_postag`` in two full passes.
    """
    # Pass 1 (flag False): exceptions are suppressed, so missing features
    # are only reported on stderr and end up in one big list.
    # Pass 2 (flag True): the parser raises an exception this time.
    for raise_exceptions in (False, True):
        for tag in CLASSES.split('\n'):
            if not tag:
                continue
            cgn.parse_cgn_postag(tag, raise_exceptions)
def process(target):
    """Convert the PoS tags of one FoLiA file, or of a whole directory tree.

    If ``target`` is a directory, every entry matched by ``<target>/*`` is
    processed recursively. If it is a regular file whose name ends in
    ``.xml``, the document is loaded, the PoS annotation of every word is
    replaced by the result of ``cgn.parse_cgn_postag`` and, when at least one
    word was changed, the document is saved. Anything else is ignored.

    Words without a PoS annotation are skipped; tags rejected with
    ``cgn.InvalidTagException`` are reported on stderr and left untouched.
    A file that fails to parse as XML is reported on stderr and skipped.

    The output calls are written so that they are valid on both Python 2
    and Python 3 (the original ``print`` statements only compiled on 2).

    :param target: path to a file or directory
    :returns: always ``None``
    """
    print("Processing " + target)
    if os.path.isdir(target):
        print("Descending into directory " + target)
        for entry in glob.glob(os.path.join(target, '*')):
            process(entry)
    elif os.path.isfile(target) and target.endswith('.xml'):
        print("Loading " + target)
        try:
            doc = folia.Document(file=target)
        except lxml.etree.XMLSyntaxError:
            sys.stderr.write(
                "UNABLE TO LOAD " + target + " (XML SYNTAX ERROR!)\n")
            return None
        changed = False
        for word in doc.words():
            try:
                pos = word.annotation(folia.PosAnnotation)
            except folia.NoSuchAnnotation:
                # Nothing to convert for this word.
                continue
            try:
                word.replace(cgn.parse_cgn_postag(pos.cls))
            except cgn.InvalidTagException:
                sys.stderr.write("WARNING: INVALID TAG " + pos.cls + "\n")
            else:
                # Only a successful replacement warrants saving the file.
                changed = True
        if changed:
            print("Saving...")
            doc.save()
def splittags(doc):
    """Replace the PoS annotation of every word with its parsed CGN form.

    For each word in ``doc.words()``: when it carries a
    ``folia.PosAnnotation``, the annotation is replaced by the result of
    ``cgn.parse_cgn_postag(word.pos())``; otherwise a warning is passed to
    ``errout``. A second warning is emitted for every word that lacks a
    ``folia.LemmaAnnotation``.

    The progress line is printed with a call form that is valid on both
    Python 2 and Python 3 (the original ``print`` statement only compiled
    on Python 2).

    :param doc: document object exposing ``words()``
    :returns: the same ``doc`` object, modified in place
    """
    print("\tResolving PoS tags:")
    for word in doc.words():
        if word.hasannotation(folia.PosAnnotation):
            # NOTE(review): an invalid tag is not caught here, so whatever
            # parse_cgn_postag raises propagates to the caller.
            word.replace(cgn.parse_cgn_postag(word.pos()))
        else:
            errout("\t\tWARNING: No PoS tag for " + word.id)
        if not word.hasannotation(folia.LemmaAnnotation):
            errout("\t\tWARNING: No Lemma for " + word.id)
    return doc
def retag(doc, i):
    """Re-annotate every word of ``doc`` with fresh output from a Frog server.

    Each sentence is sent, as space-joined word texts, to a ``FrogClient``
    connected to ``localhost`` on port ``9000 + (i % threads)``, where
    ``threads`` is a module-level global. For the j-th tuple
    ``(word, lemma, morph, pos)`` returned, the j-th word of the sentence
    gets:

    * its PoS annotation replaced by ``cgn.parse_cgn_postag(pos)``,
    * its lemma annotation replaced by one with class ``lemma``,
    * a ``folia.MorphologyLayer`` holding one ``folia.Morpheme`` per
      bracketed segment found in ``morph`` (only if there is at least one).

    NOTE(review): this assumes the server returns exactly one tuple per word
    of the sentence, in order -- confirm against the tokenisation settings.

    :param doc: document object exposing ``sentences()``
    :param i: index used to pick the server port
    :returns: ``None``; ``doc`` is modified in place
    """
    print("\tRetagging:")
    # One match per bracket group, so "[ge][maak][t]" yields ge / maak / t.
    # The previous greedy pattern '\[(.*)\]' swallowed everything between the
    # first '[' and the last ']' into a single bogus morpheme.
    morpheme_pattern = re.compile(r'\[([^\[\]]*)\]')
    frogclient = FrogClient('localhost', 9000 + (i % threads))
    for sentence in doc.sentences():
        words = " ".join(w.text() for w in sentence.words())
        for j, (word, lemma, morph, pos) in enumerate(frogclient.process(words)):
            wordelement = sentence.words(j)
            wordelement.replace(cgn.parse_cgn_postag(pos))
            wordelement.replace(folia.LemmaAnnotation, cls=lemma)
            # Parse the morphological analysis (MBMA) string.
            morphemes = morpheme_pattern.findall(morph)
            if morphemes:
                layer = wordelement.append(folia.MorphologyLayer)
                for morpheme in morphemes:
                    layer.append(folia.Morpheme, cls=morpheme)