Example #1
0
    def test(self):
        """CGN - Splitting PoS tags into features"""
        # Two full passes over every non-empty line of CLASSES.  Going by the
        # original comments: with the flag False the parser suppresses
        # exceptions (only stderr output for missing features, giving one big
        # list); with the flag True it raises an exception instead.
        for raise_exceptions in (False, True):
            for tag in CLASSES.split('\n'):
                if not tag:
                    continue  # skip blank lines in the class list
                cgn.parse_cgn_postag(tag, raise_exceptions)
Example #2
0
    def test(self):
        """CGN - Splitting PoS tags into features"""
        # Every non-empty line of CLASSES is one PoS tag to parse.
        tags = [line for line in CLASSES.split('\n') if line]

        # First pass: per the original comments, exceptions are suppressed
        # here (only stderr output for missing features, so we get one big
        # list of everything that is missing).
        for tag in tags:
            cgn.parse_cgn_postag(tag, False)

        # Second pass: same tags, this time raising an exception.
        for tag in tags:
            cgn.parse_cgn_postag(tag, True)
Example #3
0
def process(target):
    """Rewrite the PoS annotations of FoLiA XML files found at *target*.

    If *target* is a directory, every entry matched by ``<target>/*`` is
    processed recursively.  If it is a regular file whose name ends in
    ``.xml``, it is loaded with ``folia.Document``; for each word that has a
    ``folia.PosAnnotation``, the word's annotation is replaced by the result
    of ``cgn.parse_cgn_postag`` applied to the annotation's class.  The
    document is saved only when at least one word was updated.  Anything
    else is ignored.

    Progress messages go to stdout; load failures and invalid tags are
    reported on stderr and skipped.  Always returns None.

    The output calls are written so that they parse and behave the same on
    both Python 2 and Python 3.
    """
    print("Processing " + target)

    if os.path.isdir(target):
        print("Descending into directory " + target)
        for entry in glob.glob(os.path.join(target, '*')):
            process(entry)
        return None

    if not (os.path.isfile(target) and target.endswith('.xml')):
        return None

    print("Loading " + target)
    try:
        doc = folia.Document(file=target)
    except lxml.etree.XMLSyntaxError:
        sys.stderr.write("UNABLE TO LOAD " + target + " (XML SYNTAX ERROR!)\n")
        return None

    changed = False
    for word in doc.words():
        try:
            pos = word.annotation(folia.PosAnnotation)
        except folia.NoSuchAnnotation:
            # Words without a PoS annotation are left untouched.
            continue
        try:
            parsed = cgn.parse_cgn_postag(pos.cls)
        except cgn.InvalidTagException:
            sys.stderr.write("WARNING: INVALID TAG " + pos.cls + "\n")
            continue
        else:
            word.replace(parsed)
            changed = True

    if changed:
        # Only write the file back when something was actually modified.
        print("Saving...")
        doc.save()
    return None
def splittags(doc):
    """Replace each word's PoS annotation with its parsed CGN form.

    For every word returned by ``doc.words()``:

    * if the word has a ``folia.PosAnnotation``, it is replaced by the
      result of ``cgn.parse_cgn_postag(word.pos())``; otherwise a warning
      naming the word id is passed to ``errout``;
    * if the word has no ``folia.LemmaAnnotation``, a warning naming the
      word id is passed to ``errout`` (the word itself is not changed).

    A header line is printed to stdout first.  The print call is written so
    that it parses and behaves the same on both Python 2 and Python 3.

    Returns the same *doc* object that was passed in.
    """
    print("\tResolving PoS tags:")
    for word in doc.words():
        if word.hasannotation(folia.PosAnnotation):
            word.replace(cgn.parse_cgn_postag(word.pos()))
        else:
            errout("\t\tWARNING: No PoS tag for " + word.id)
        # The lemma check is independent of the PoS check above: a word can
        # trigger both warnings.
        if not word.hasannotation(folia.LemmaAnnotation):
            errout("\t\tWARNING: No Lemma for " + word.id)
    return doc
def retag(doc, i):
    """Re-annotate every sentence of *doc* with output from a Frog server.

    A ``FrogClient`` is opened on ``localhost`` at port
    ``9000 + (i % threads)``, where ``threads`` is a module-level value.
    Each sentence's word texts are joined with single spaces and sent to the
    client.  The j-th tuple that comes back is applied to the j-th word of
    the sentence:

    * the PoS annotation is replaced by ``cgn.parse_cgn_postag(pos)``;
    * the lemma annotation is replaced by one with class ``lemma``;
    * if the morphology string contains bracketed groups, a
      ``folia.MorphologyLayer`` is appended with one ``folia.Morpheme`` per
      group.

    NOTE(review): this relies on the client returning exactly one tuple per
    word, in sentence order -- confirm against the FrogClient in use.

    Returns None; *doc* is modified in place.
    """
    print("\tRetagging:")
    # Raw string: '\[' is not a valid string escape.  Non-greedy group: the
    # previous greedy '(.*)' swallowed everything between the first '[' and
    # the last ']', so "[a][b][c]" yielded the single match "a][b][c" and the
    # per-morpheme loop below could never see more than one item.
    # NOTE(review): assumes the morphology string is a flat sequence of
    # "[...]" groups (no nesting) -- confirm against the Frog output format.
    bracketed = re.compile(r'\[(.*?)\]')
    # `threads` is only read here, so no `global` declaration is needed.
    frogclient = FrogClient('localhost', 9000 + (i % threads))

    for sentence in doc.sentences():
        words = " ".join([w.text() for w in sentence.words()])
        for j, (word, lemma, morph,
                pos) in enumerate(frogclient.process(words)):
            wordelement = sentence.words(j)
            wordelement.replace(cgn.parse_cgn_postag(pos))
            wordelement.replace(folia.LemmaAnnotation, cls=lemma)

            # Parse the morphological analysis into separate morphemes.
            morphemes = bracketed.findall(morph)
            if morphemes:
                layer = wordelement.append(folia.MorphologyLayer)
                for morpheme in morphemes:
                    layer.append(folia.Morpheme, cls=morpheme)