Esempio n. 1
0
class FeatureExtractor(xml.sax.handler.ContentHandler):
    """SAX handler that collects per-document symbol counts into an Arff.

    While the parser streams the input, the handler tracks two kinds of
    enclosing elements:

    * a *label* element, whose ``n`` attribute becomes the current label;
    * a *document* element, which delimits one datum.

    Inside an open document, every ``symbol`` element is counted under a key
    of the form ``<path>=<value>``, where ``<path>`` is the ``_``-joined list
    of the enclosing ``f`` (``name`` attribute) and ``fs`` (``type``
    attribute) elements and ``<value>`` is the symbol's ``value`` attribute.
    When the outermost document element closes, the datum is passed to
    ``Arff.add_datum``; at end of input the collection is written to ``fd``
    via ``Arff.save``.

    ``label`` and ``document`` are selector strings.  The text before the
    first ``[`` is compared with the element name, and the whole selector is
    handed to the module-level ``match`` helper together with the element
    name and attributes.
    NOTE(review): presumably the bracketed part is an attribute predicate --
    confirm against ``match``.
    """

    def __init__(self, label, document, features, fd):
        """Set up the handler.

        Args:
            label: selector string for the element that carries the label.
            document: selector string for the element that delimits a datum.
            features: stored on the instance; not used by this class.
            fd: object passed to ``Arff.save`` when parsing finishes.  It is
                not closed here; the caller keeps ownership.
        """
        # Explicit base call (rather than super()) also works where
        # ContentHandler is an old-style class.
        xml.sax.handler.ContentHandler.__init__(self)
        self.fd = fd
        self.arff = Arff()
        self.label = label
        self.document = document
        self.features = features
        self.cur_label = None   # label of the open label element, if any
        self.cur_doc = None     # datum being built for the open document
        self.cur_feat = []      # names/types of the enclosing f/fs elements
        # Nesting depth of same-named elements inside an open label/document.
        self.docstack = 0
        self.labstack = 0
        self.featstack = 0

    def startElement(self, name, attrs):
        """Open a label/document, push a feature name, or count a symbol."""
        if name == self.label.split('[')[0]:
            if self.cur_label is not None:
                # Same-named element nested inside an already open label.
                self.labstack += 1
            elif match(self.label, name, attrs):
                self.labstack = 1
                self.cur_label = attrs['n']

        if name == self.document.split('[')[0]:
            if self.cur_doc is not None:
                # Same-named element nested inside an already open document.
                self.docstack += 1
            elif match(self.document, name, attrs):
                self.docstack = 1
                self.cur_doc = {'label': self.cur_label}

        if name == 'f':
            self.cur_feat.append(attrs.get('name', '?'))

        if name == 'fs':
            self.cur_feat.append(attrs.get('type', '?'))

        if name == 'symbol' and self.cur_doc is not None:
            key = "%s=%s" % ("_".join(self.cur_feat), attrs['value'])
            # Drop leading placeholders produced by unnamed f/fs elements.
            key = key.lstrip('?_')
            # Increment the counter of this very key; the previous code read
            # the literal 'value' entry, so counts never accumulated.
            self.cur_doc[key] = self.cur_doc.get(key, 0) + 1

    def endElement(self, name):
        """Close a label/document or pop the innermost feature name."""
        if name == self.label.split('[')[0] and self.cur_label is not None:
            self.labstack -= 1
            if self.labstack == 0:
                self.cur_label = None

        if name == self.document.split('[')[0] and self.cur_doc is not None:
            self.docstack -= 1
            if self.docstack == 0:
                # Outermost document element closed: the datum is complete.
                self.arff.add_datum(self.cur_doc)
                self.cur_doc = None

        # Guarded pop keeps an unbalanced end tag from raising.
        if name in ('f', 'fs') and self.cur_feat:
            self.cur_feat.pop()

    def endDocument(self):
        """Write the collected data to ``fd`` once the input is exhausted."""
        self.arff.save(self.fd)
Esempio n. 2
0
    xml.sax.parse(options.input, FeatureExtractor(options.label, options.document, options.features, open(options.output, 'w')))
    sys.exit()

    a = Arff()

    #doc = libxml2.parseFile(options.input)
    doc = etree.parse(open(options.input))
    doc.xinclude()

    for labeled in doc.xpath(options.label):
        label = labeled
        if isinstance(labeled, etree._ElementStringResult):
            labeled = labeled.getparent()
        #labeled.xinclude()
        for text in labeled.xpath(options.document):
            if isinstance(text, etree._ElementStringResult):
                text = text.getparent()
            #print type(text)
            #text.xincludeProcessTree()
            attribs = {"LABEL" : label}
            #print text
            for feature in options.features:
                seq = [str(x) for x in text.xpath(feature)]
                #print seq
                for i in set(seq):
                    attribs[i] = seq.count(i) / float(len(seq))
            a.add_datum(attribs)

    a.save(fd)