class FeatureExtractor(xml.sax.handler.ContentHandler):
    """SAX handler that turns ``symbol`` elements into per-document counts.

    While a "document" element is open, every ``symbol`` element seen adds
    one to a counter keyed by ``"<feature path>=<value>"``, where the feature
    path is the ``_``-joined chain of enclosing ``fs`` (``type`` attribute)
    and ``f`` (``name`` attribute) elements.  When the document element
    closes, the collected dict is handed to ``Arff.add_datum``; at the end of
    the XML stream the ``Arff`` object is saved to ``fd``.

    ``label`` and ``document`` are selector strings.  Only the part before
    the first ``[`` is compared with the element name here; the full
    selector is passed to the module-level ``match`` helper (defined outside
    this class -- presumably it evaluates the bracketed predicate; verify
    against its definition).
    """

    def __init__(self, label, document, features, fd):
        """Store the selectors and reset all parsing state.

        label    -- selector for elements that carry the class label
                    (the label value is read from their ``n`` attribute).
        document -- selector for elements that delimit one datum.
        features -- stored as ``self.features``; not consulted by this
                    handler.
        fd       -- object passed to ``Arff.save`` in ``endDocument``.
        """
        # Explicit base call (rather than super()) so this also works where
        # ContentHandler is an old-style class.
        xml.sax.handler.ContentHandler.__init__(self)
        self.fd = fd
        self.arff = Arff()
        self.label = label
        self.document = document
        self.features = features
        self.cur_label = None   # label of the enclosing label element, if any
        self.cur_doc = None     # feature dict of the open document, if any
        self.cur_feat = []      # names of the currently open fs/f elements
        # Nesting depth of same-named elements inside an open label/document,
        # so that only the matching close tag ends it.
        self.docstack = 0
        self.labstack = 0
        self.featstack = 0

    @staticmethod
    def _element_name(selector):
        """Return the part of *selector* before the first ``[``."""
        return selector.split('[')[0]

    def startElement(self, name, attrs):
        """Track label/document nesting and count ``symbol`` occurrences."""
        if name == self._element_name(self.label):
            if self.cur_label is not None:
                # Same-named element nested inside an open label element.
                self.labstack += 1
            elif match(self.label, name, attrs):
                self.labstack = 1
                self.cur_label = attrs['n']

        if name == self._element_name(self.document):
            if self.cur_doc is not None:
                # Same-named element nested inside an open document.
                self.docstack += 1
            elif match(self.document, name, attrs):
                self.docstack = 1
                self.cur_doc = {'label': self.cur_label}

        if name == 'f':
            self.cur_feat.append(attrs.get('name', '?'))
        if name == 'fs':
            self.cur_feat.append(attrs.get('type', '?'))

        if name == 'symbol' and self.cur_doc is not None:
            key = "%s=%s" % ("_".join(self.cur_feat), attrs['value'])
            # Drop leading '?' placeholders (and their '_' separators) left
            # by fs/f elements that had no type/name attribute.
            key = key.lstrip('?_')
            # Increment this key's own counter.  The previous code looked up
            # the literal key 'value', so every count was stuck at 1.
            self.cur_doc[key] = self.cur_doc.get(key, 0) + 1

    def endElement(self, name):
        """Close label/document scopes and pop the feature path."""
        if name == self._element_name(self.label):
            if self.cur_label is not None:
                self.labstack -= 1
            if self.labstack == 0 and self.cur_label is not None:
                self.cur_label = None

        if name == self._element_name(self.document):
            if self.cur_doc is not None:
                self.docstack -= 1
            if self.docstack == 0 and self.cur_doc is not None:
                # Outermost document element closed: emit the datum.
                self.arff.add_datum(self.cur_doc)
                self.cur_doc = None

        if name in ('f', 'fs'):
            # Slice instead of pop() so an unbalanced close on an empty
            # path is a no-op rather than an IndexError.
            self.cur_feat = self.cur_feat[:-1]

    def endDocument(self):
        """Write the accumulated ARFF data to the output object."""
        self.arff.save(self.fd)
# Stream the input through the SAX-based extractor.  The handler writes the
# ARFF output from endDocument(), i.e. before xml.sax.parse() returns, so the
# file can be closed as soon as parsing finishes.  The ``with`` block makes
# sure it is flushed and closed even if parsing raises.
with open(options.output, 'w') as output_fd:
    xml.sax.parse(
        options.input,
        FeatureExtractor(options.label, options.document, options.features, output_fd),
    )
sys.exit()

# NOTE(review): everything below is unreachable because of the sys.exit()
# above.  It is the earlier lxml/XPath-based implementation, kept for
# reference: it records, per document, the relative frequency of each value
# returned by the feature XPath expressions.  ``fd`` is not defined in this
# part of the file -- confirm it exists before re-enabling this path.
a = Arff()
doc = etree.parse(open(options.input))
doc.xinclude()
for labeled in doc.xpath(options.label):
    label = labeled
    # An XPath that selects a string result: step back to its owning element.
    if isinstance(labeled, etree._ElementStringResult):
        labeled = labeled.getparent()
    for text in labeled.xpath(options.document):
        if isinstance(text, etree._ElementStringResult):
            text = text.getparent()
        attribs = {"LABEL" : label}
        for feature in options.features:
            seq = [str(x) for x in text.xpath(feature)]
            # seq is non-empty whenever this loop body runs, so the
            # division is safe.
            for i in set(seq):
                attribs[i] = seq.count(i) / float(len(seq))
        a.add_datum(attribs)
a.save(fd)