def process_document(self, session, doc):
    """Parse the raw content of ``doc`` and return it as an LxmlRecord.

    The raw data is obtained with ``doc.get_raw(session)``, wrapped in a
    ``StringIO`` stream and parsed by lxml using ``self.parser``. The
    resulting record has ``byteCount`` set to the length of the raw data,
    and ``self._copyData(doc, record)`` is called on it before it is
    returned.
    """
    raw = doc.get_raw(session)
    stream = StringIO.StringIO(raw)
    tree = etree.parse(stream, self.parser)
    record = LxmlRecord(tree)
    record.byteCount = len(raw)
    self._copyData(doc, record)
    return record
def process_document(self, session, doc):
    """Parse the raw content of ``doc`` and return it as an LxmlRecord.

    The raw data from ``doc.get_raw(session)`` is parsed through a
    ``StringIO`` stream using ``self.parser``. If that parse raises
    ``AssertionError`` the data is decoded as UTF-8 and parsed again.
    ``self._copyData(doc, record)`` is called on the new record before it
    is returned.
    """
    # input must be string or stream
    raw = doc.get_raw(session)
    try:
        tree = etree.parse(StringIO.StringIO(raw), self.parser)
    except AssertionError:
        # Retry once with the data decoded from UTF-8.
        raw = raw.decode('utf8')
        tree = etree.parse(StringIO.StringIO(raw), self.parser)
    record = LxmlRecord(tree)
    # NOTE(review): on the fallback path this is the length of the decoded
    # text rather than of the original byte string - confirm that is wanted.
    record.byteCount = len(raw)
    self._copyData(doc, record)
    return record
def process_xpath(self, session, q, map=None):
    """Evaluate ``q`` as an XPath, falling back to SPARQL on failure.

    ``q`` is first handed to ``LxmlRecord.process_xpath``. If lxml rejects
    it with ``etree.XPathEvalError`` the same query is passed to
    ``self.process_sparql`` instead, and that result is returned.

    ``map`` is forwarded unchanged to whichever evaluator runs. It
    defaults to a fresh empty dict on every call, so state can never leak
    between calls through a shared default object.
    """
    if map is None:
        map = {}
    if self.dom is None:
        # No DOM yet; presumably get_xml() populates self.dom - verify.
        self.get_xml(session)
    try:
        return LxmlRecord.process_xpath(self, session, q, map)
    except etree.XPathEvalError:
        return self.process_sparql(session, q, map)
def getRecord(baseUrl, metadataPrefix, identifier):
    """Fetch one record from an OAI-PMH server with a GetRecord request.

    Returns a ``(header, record, None)`` tuple: the header built from the
    first record's ``oai:header`` element, an LxmlRecord wrapping the
    first child of its ``oai:metadata`` element, and ``None`` in the
    "about" position.

    If the response cannot be parsed, the request URL and the raw response
    are written to stderr and the parse error is re-raised. ``IndexError``
    is raised when the response has no record header or no metadata child.
    """
    query = urllib.urlencode({
        'verb': "GetRecord",
        'metadataPrefix': metadataPrefix,
        'identifier': identifier
    })
    url = "{0}?{1}".format(baseUrl, query)
    data = fetch_data(url)
    try:
        tree = etree.fromstring(data)
    except:
        # Dump what was asked for and what came back, then let the
        # original exception propagate unchanged.
        for text in (url, data):
            sys.stderr.write(text + '\n')
        sys.stderr.flush()
        raise
    oai_ns = {'oai': NS_OAIPMH}
    header_el = tree.xpath('//oai:record[1]/oai:header',
                           namespaces=oai_ns)[0]
    header = headerFromLxmlElement(header_el)
    metadata_el = tree.xpath('//oai:record[1]/oai:metadata/*',
                             namespaces=oai_ns)[0]
    serialized = etree.tostring(metadata_el)
    record = LxmlRecord(metadata_el,
                        xml=serialized,
                        docId=identifier,
                        byteCount=len(serialized))
    return (header, record, None)
def _get_test_records(self):
    """Yield five small LxmlRecords numbered 0 to 4.

    Each record has a ``<title>`` and a ``<content>`` element containing
    its number, and the same number as its ``docId``.
    """
    template = ('<record>'
                '<title>Title {0}</title>'
                '<content>Record {0} content.</content>'
                '</record>')
    for number in range(5):
        tree = etree.XML(template.format(number))
        yield LxmlRecord(tree, docId=number)
def _processResult(self, session, data):
    """Turn the XML in ``data`` into a dictionary of metadata items.

    ``data`` is parsed into an LxmlRecord (retrying once with the data
    decoded from UTF-8 if the parser raises ``AssertionError``). Each
    entry of ``self.sources`` is then applied to the record and its
    outcome stored under that entry's key:

    * more than one processed term: a list of the terms, ordered by their
      ``proxLoc`` value, each repeated once per item in its ``proxLoc``;
    * exactly one processed term: that term;
    * no processed terms: the source's ``default``, when it is not None.

    An ``etree.XMLSyntaxError`` from the first parse attempt is logged at
    level 10 (if the session has a logger) and re-raised.
    """
    try:
        tree = etree.fromstring(data)
    except AssertionError:
        data = data.decode('utf8')
        tree = etree.fromstring(data)
    except etree.XMLSyntaxError:
        if session.logger is not None:
            # log debug level
            session.logger.log_lvl(session, 10, data)
        raise
    record = LxmlRecord(tree)
    record.byteCount = len(data)
    metadata = {}
    for key, src in self.sources.iteritems():
        xpath, process, preprocess = src['source']
        if preprocess is not None:
            # The preprocessed record replaces the working record, so it
            # is also what any later source sees.
            record = preprocess.process(session, record)
        if xpath is None:
            processed = process.process(session, record)
        else:
            extracted = xpath.process_record(session, record)
            processed = process.process(session, extracted)
        found = len(processed)
        if found > 1:
            terms = metadata[key] = []
            by_location = sorted([(info['proxLoc'], term)
                                  for term, info in processed.iteritems()])
            for locations, term in by_location:
                # One copy of the term per recorded location.
                terms.extend(term for _ in locations)
        elif found == 1:
            metadata[key] = processed.keys()[0]
        elif src['default'] is not None:
            metadata[key] = src['default']
    return metadata
def listRecords(baseUrl, metadataPrefix, set=None, from_=None, until=None,
                cursor=0, batch_size=10):
    """Return records matching the given parameters from an OAI-PMH server.

    Issues a ListRecords request and follows resumption tokens until
    ``batch_size`` records have been collected or the server has no more.

    Args:
        baseUrl: base URL of the OAI-PMH server.
        metadataPrefix: metadata format to request.
        set: optional set to restrict the request to.
        from_: optional lower bound; sent as ``str(from_)``.
        until: optional upper bound; sent as ``str(until)``.
        cursor: number of matching records to skip before collecting.
        batch_size: number of records at which collection stops.

    Returns:
        A list of ``(header, record, None)`` tuples, where ``record`` is an
        LxmlRecord wrapping the first child of the record's metadata
        element.

    Raises:
        IndexError: if a record lacks a header or a metadata child element.
        Any parse error from lxml is re-raised after the request URL and
        raw response have been written to stderr.
    """
    import sys

    args = {'verb': "ListRecords", 'metadataPrefix': metadataPrefix}
    if set is not None:
        args['set'] = set
    if from_ is not None:
        args['from'] = str(from_)
    if until is not None:
        args['until'] = str(until)
    url = "{0}?{1}".format(baseUrl, urllib.urlencode(args))
    oai_ns = {'oai': NS_OAIPMH}
    records = []
    seen = 0
    data = fetch_data(url)
    while data is not None:
        try:
            tree = etree.fromstring(data)
        except:
            # Dump the request and raw response for diagnosis, then let
            # the original exception propagate unchanged.
            sys.stderr.write(url + '\n')
            sys.stderr.write(data + '\n')
            sys.stderr.flush()
            raise
        for recEl in tree.xpath('//oai:record', namespaces=oai_ns):
            if seen < cursor:
                seen += 1
                continue
            # Paths are relative to recEl: a leading '//' would search the
            # whole document and always pick the first record's elements.
            hEl = recEl.xpath('oai:header', namespaces=oai_ns)[0]
            header = headerFromLxmlElement(hEl)
            mdEl = recEl.xpath('oai:metadata/*', namespaces=oai_ns)[0]
            recString = etree.tostring(mdEl)
            rec = LxmlRecord(mdEl,
                             xml=recString,
                             docId=header.identifier(),
                             byteCount=len(recString))
            records.append((header, rec, None))
            seen += 1
            if len(records) >= batch_size:
                return records
        resTok = tree.xpath('string(//oai:resumptionToken)',
                            namespaces=oai_ns)
        if not resTok:
            break
        # resumptionToken is an exclusive argument in OAI-PMH: the
        # follow-up request carries only the verb and the token.
        data = fetch_data(baseUrl +
                          '?verb=ListRecords&resumptionToken=' +
                          cgi_encode(resTok))
    return records
def _get_testDataAndExpected(self):
    """Yield ``(record, expected)`` pairs for a namespaced document.

    ``record`` is an LxmlRecord built from the sample XML and ``expected``
    is a list of compiled regular expressions.
    """
    # Namespaced example
    # Whitespace inside both literals is significant; keep them verbatim.
    source = """ <div xmlns="http.cheshire3.org/schemas/tests"> <hr/> <p> Some text. </p> <hr/> </div>"""
    pattern = """^<c3:component.*?> <p> Some text. </p> </c3:component>$"""
    record = LxmlRecord(etree.XML(source))
    expected = [re.compile(pattern, re.LOCALE)]
    yield (record, expected)
def _get_testData(self):
    """Yield a 1-tuple holding an LxmlRecord of a sample ``<menu>``.

    The menu contains four ``<meal>`` elements, each with a different
    combination of ``<egg/>``, ``<bacon/>``, ``<sausage/>`` and
    ``<spam/>`` children.
    """
    # Whitespace inside the literal is kept exactly as in the original.
    menu_xml = """ <menu> <meal> <egg/> <bacon/> </meal> <meal> <egg/> <sausage/> <bacon/> </meal> <meal> <egg/> <spam/> </meal> <meal> <egg/> <bacon/> <spam/> </meal> </menu> """
    record = LxmlRecord(etree.XML(menu_xml))
    yield (record, )
def _submit_userLxml(self, id, userNode):
    """Wrap ``userNode`` in an LxmlRecord and store it under ``id``.

    The record is stored in ``userStore`` and the store is committed,
    both using the ``session`` from the enclosing scope.
    """
    record = LxmlRecord(userNode)
    record.id = id
    userStore.store_record(session, record)
    userStore.commit_storing(session)
def _get_test_recs(self):
    """Yield one LxmlRecord per item from ``self._get_test_data()``.

    Each item becomes the text of a ``<data>`` element. The record carries
    that element, its serialized XML, and the length of that serialization
    as ``byteCount``.
    """
    for text in self._get_test_data():
        element = etree.Element('data')
        element.text = text
        serialized = etree.tostring(element)
        yield LxmlRecord(element,
                         xml=serialized,
                         byteCount=len(serialized))
def _get_testDataAndExpected(self):
    """Yield ``(record, expected)`` pairs for four sample documents.

    ``record`` is an LxmlRecord built from the sample XML and ``expected``
    is a list holding one compiled regular expression. Records and
    patterns are built lazily, one pair per iteration.
    """
    # Whitespace inside every literal is significant; keep them verbatim.
    cases = (
        # Simple example
        (""" <div> <hr/> <p> Some text. </p> <hr/> </div>""",
         """^<c3:?component.*?><hr></hr> <p> Some text. </p> <hr></hr></c3:?component>$"""),
        # Simple example with tail text
        (""" <div> <hr/> <p> Some text. </p> With tail text. <hr/> </div>""",
         """^<c3:?component.*?><hr></hr> <p> Some text. </p> With tail text. <hr></hr></c3:?component>$"""),
        # Example where endNode is not a sibling of startNode
        # Tail text should be excluded now
        (""" <div> <hr/> <p> Some text. <hr/> </p> With tail text. </div>""",
         """^<c3:?component.*?><hr></hr> <p> Some text. <hr></hr></p></c3:?component>$"""),
        # Example where endNode is sibling of startNode ancestor
        (""" <div> <p>Some text. <hr/> </p> With tail text <hr/> </div>""",
         """^<c3:?component.*?><p><hr></hr> </p> With tail text <hr></hr></c3:?component>$"""),
    )
    for source, pattern in cases:
        record = LxmlRecord(etree.XML(source))
        expected = [re.compile(pattern, re.LOCALE)]
        yield (record, expected)