def process_document(self, session, doc):
    """Parse the raw data of ``doc`` and return it as an LxmlRecord.

    The raw data obtained from ``doc.get_raw(session)`` is wrapped in a
    StringIO stream and parsed by lxml using ``self.parser``. The
    resulting record has ``byteCount`` set to the length of the raw data,
    and ``self._copyData`` is called with the document and the record
    before the record is returned.
    """
    raw = doc.get_raw(session)
    stream = StringIO.StringIO(raw)
    tree = etree.parse(stream, self.parser)
    record = LxmlRecord(tree)
    record.byteCount = len(raw)
    self._copyData(doc, record)
    return record
def process_document(self, session, doc):
    """Parse the raw data of ``doc`` and return it as an LxmlRecord.

    The raw data obtained from ``doc.get_raw(session)`` is wrapped in a
    StringIO stream and parsed by lxml using ``self.parser``. If that
    attempt raises AssertionError, the data is decoded from UTF-8 and
    parsed a second time.

    The returned record has ``byteCount`` set to the length of the raw
    data as it was received, i.e. measured before the fallback decoding,
    and ``self._copyData`` is called with the document and the record
    before the record is returned.
    """
    # Input must be string or stream
    data = doc.get_raw(session)
    # Measure up front: once the fallback below has decoded the data,
    # len() would count characters rather than the bytes of the raw input.
    byte_count = len(data)
    try:
        et = etree.parse(StringIO.StringIO(data), self.parser)
    except AssertionError:
        # Retry with the data decoded from UTF-8
        decoded = data.decode('utf8')
        et = etree.parse(StringIO.StringIO(decoded), self.parser)
    rec = LxmlRecord(et)
    rec.byteCount = byte_count
    self._copyData(doc, rec)
    return rec
def _processResult(self, session, data):
    """Parse XML to create and return dict of metadata items.

    Parse XML output from external program. Process parsed XML using
    self.sources. Populate and return a dictionary of metadata items.

    For every ``key`` in ``self.sources`` the entry's ``'source'`` value is
    unpacked into ``(xpath, process, preprocess)`` and applied to the
    parsed record. The value stored under ``key`` depends on how many
    items the processing step yields:

    * more than one: a list of the result keys, ordered by sorting on
      ``(proxLoc, key)``, each key repeated once per entry in its
      ``'proxLoc'`` value;
    * exactly one: that single key;
    * none: the entry's ``'default'``, unless it is None, in which case
      ``key`` is left out of the returned dictionary.

    If the XML cannot be parsed, the raw data is logged at debug level
    (when the session has a logger) and the XMLSyntaxError is re-raised.
    """
    # Measure up front: once the fallback below has decoded the data,
    # len() would count characters rather than the bytes of the raw input.
    byte_count = len(data)
    try:
        try:
            et = etree.fromstring(data)
        except AssertionError:
            # Retry with the data decoded from UTF-8. The retry is nested
            # so that a syntax error raised here is logged as well.
            et = etree.fromstring(data.decode('utf8'))
    except etree.XMLSyntaxError:
        if session.logger is not None:
            # log debug level
            session.logger.log_lvl(session, 10, data)
        raise
    record = LxmlRecord(et)
    record.byteCount = byte_count
    mddict = {}
    for key, src in self.sources.iteritems():
        (xpath, process, preprocess) = src['source']
        if preprocess is not None:
            # NOTE(review): this rebinds ``record``, so sources handled
            # later in the loop receive the preprocessed record rather
            # than the freshly parsed one - confirm this is intended.
            record = preprocess.process(session, record)
        if xpath is not None:
            rawlist = xpath.process_record(session, record)
            processed = process.process(session, rawlist)
        else:
            # No XPath configured: process the whole record
            processed = process.process(session, record)
        if len(processed) > 1:
            mddict[key] = []
            ordered = sorted([(val['proxLoc'], k)
                              for k, val in processed.iteritems()])
            for pl, k in ordered:
                # One copy of the key per entry in its 'proxLoc' value
                for x in pl:
                    mddict[key].append(k)
        elif len(processed) == 1:
            mddict[key] = processed.keys()[0]
        elif src['default'] is not None:
            mddict[key] = src['default']
    return mddict