Ejemplo n.º 1
0
 def __init__(self, label, document, features, fd):
     """Store the constructor arguments and reset all per-parse state.

     The four arguments are kept unchanged on the instance under the same
     names; a new ``Arff`` object is created alongside them.
     NOTE(review): the meaning of ``label``/``document``/``features`` is not
     visible from this method alone -- confirm against the callers.
     """
     # Output handle and the Arff object created for this instance.
     self.fd = fd
     self.arff = Arff()

     # Configuration supplied by the caller, stored as given.
     self.label, self.document, self.features = label, document, features

     # Nothing is "current" yet.
     self.cur_label = self.cur_doc = None
     self.cur_feat = list()

     # All three counters start at zero.
     self.docstack = self.labstack = self.featstack = 0
Ejemplo n.º 2
0
class FeatureExtractor(xml.sax.handler.ContentHandler):
    """SAX handler that turns ``symbol`` elements into per-document counts.

    While a "document" element is open, every ``symbol`` element increments
    a counter in the current document's dict.  The counter key is built from
    the names of the enclosing ``fs``/``f`` elements joined with ``_``,
    followed by ``=`` and the symbol's ``value`` attribute.  When the
    document element closes, the dict is handed to ``self.arff.add_datum``;
    at the end of the parse ``self.arff.save(self.fd)`` is called.

    ``label`` and ``document`` are path-like strings: the text before the
    first ``[`` is compared with the element name, and the whole string is
    passed to the module-level ``match`` helper (defined elsewhere in this
    file; presumably it evaluates the bracketed predicate -- confirm).
    """

    def __init__(self, label, document, features, fd):
        """Store the configuration and reset the parsing state.

        :param label: path string selecting the element whose ``n``
            attribute becomes the label of the documents nested inside it.
        :param document: path string selecting the element that delimits
            one document (one datum).
        :param features: stored on the instance; not used by this class.
        :param fd: object passed to ``Arff.save`` when parsing finishes.
        """
        # Explicit base call (not super()) so this also works where
        # ContentHandler is an old-style class.
        xml.sax.handler.ContentHandler.__init__(self)
        self.fd = fd
        self.arff = Arff()
        self.label = label
        self.document = document
        self.features = features
        self.cur_label = None   # value of attribute 'n' of the open label element
        self.cur_doc = None     # dict of counts for the open document element
        self.cur_feat = []      # names of the currently open f/fs elements
        self.docstack = 0       # nesting depth of document-tag elements
        self.labstack = 0       # nesting depth of label-tag elements
        self.featstack = 0      # kept for compatibility; unused here

    @staticmethod
    def _tag_of(path):
        """Return the element name of *path*: the text before the first '['."""
        return path.split('[')[0]

    def startElement(self, name, attrs):
        """Update label/document/feature state for an opening element.

        :param name: element name reported by the SAX parser.
        :param attrs: mapping of the element's attributes.
        :raises KeyError: if a matching label element has no ``n`` attribute
            or a counted ``symbol`` element has no ``value`` attribute.
        """
        if name == self._tag_of(self.label):
            if self.cur_label is not None:
                # Same tag nested inside an active label: only track depth.
                self.labstack += 1
            elif match(self.label, name, attrs):
                self.labstack = 1
                self.cur_label = attrs['n']

        if name == self._tag_of(self.document):
            if self.cur_doc is not None:
                # Same tag nested inside an active document: only track depth.
                self.docstack += 1
            elif match(self.document, name, attrs):
                self.docstack = 1
                self.cur_doc = {'label': self.cur_label}

        if name == 'f':
            self.cur_feat.append(attrs.get('name', '?'))

        if name == 'fs':
            self.cur_feat.append(attrs.get('type', '?'))

        if name == 'symbol' and self.cur_doc is not None:
            key = "%s=%s" % ("_".join(self.cur_feat), attrs['value'])
            # Drop leading '?' placeholders (and their '_' separators) that
            # come from unnamed enclosing f/fs elements.
            key = key.lstrip('?_')
            # Increment the count stored under this very key.  The previous
            # code read the unrelated key 'value', so counts never exceeded 1.
            self.cur_doc[key] = self.cur_doc.get(key, 0) + 1

    def endElement(self, name):
        """Unwind the state changed by the matching ``startElement``.

        Closing the outermost label element clears the current label;
        closing the outermost document element passes its dict to
        ``self.arff.add_datum`` and clears it.

        :param name: element name reported by the SAX parser.
        """
        if name == self._tag_of(self.label) and self.cur_label is not None:
            self.labstack -= 1
            if self.labstack == 0:
                self.cur_label = None

        if name == self._tag_of(self.document) and self.cur_doc is not None:
            self.docstack -= 1
            if self.docstack == 0:
                self.arff.add_datum(self.cur_doc)
                self.cur_doc = None

        if name in ('f', 'fs'):
            # Slice deletion is a no-op on an empty list, like the original
            # ``cur_feat[0:-1]`` was.
            del self.cur_feat[-1:]

    def endDocument(self):
        """Save the collected data through ``self.arff.save(self.fd)``."""
        self.arff.save(self.fd)
Ejemplo n.º 3
0
    parser.add_option('-i', '--input', dest='input')
    parser.add_option('-o', '--output', dest='output')
    parser.add_option('-f', '--features', dest='features', default=[], action='append')
    parser.add_option('-l', '--label', dest='label', help='xpath')
    parser.add_option('-d', '--document', dest='document', help='xpath')
    options, remainder = parser.parse_args()

    if options.output:
        fd = open(options.output, 'w')
    else:
        fd = sys.stdout

    xml.sax.parse(options.input, FeatureExtractor(options.label, options.document, options.features, open(options.output, 'w')))
    sys.exit()

    a = Arff()

    #doc = libxml2.parseFile(options.input)
    doc = etree.parse(open(options.input))
    doc.xinclude()

    for labeled in doc.xpath(options.label):
        label = labeled
        if isinstance(labeled, etree._ElementStringResult):
            labeled = labeled.getparent()
        #labeled.xinclude()
        for text in labeled.xpath(options.document):
            if isinstance(text, etree._ElementStringResult):
                text = text.getparent()
            #print type(text)
            #text.xincludeProcessTree()