def process_document(self, session, doc):
    """Re-serialise a raw MARC document as a MARCXML ``StringDocument``.

    The raw payload is read from ``doc`` via ``get_raw(session)``, parsed
    with ``MARC`` and converted with ``toMARCXML()``.  The new document
    carries this object's ``id``, and inherits ``processHistory``,
    ``parent`` and ``filename`` from ``doc``; its MIME type is set to
    ``text/xml``.
    """
    marc_record = MARC(doc.get_raw(session))
    marcxml = marc_record.toMARCXML()
    return StringDocument(
        marcxml,
        self.id,
        doc.processHistory,
        mimeType='text/xml',
        parent=doc.parent,
        filename=doc.filename
    )
class MarcRecord(Record):
    """For dealing with Library MARC Records.

    Holds a parsed ``MARC`` object in ``self.marc`` and answers simple
    ``fldNNN`` / ``fldNNN/<subfield>`` lookups against ``self.marc.fields``
    through :meth:`process_xpath`.  DOM and SAX views are not available.
    """

    # NOTE(review): MarcRecord appears to be defined a second time further
    # down this file; at import time that later definition rebinds the name.

    # Text/byte string types on both Python 2 and Python 3.  Used to
    # recognise field entries stored as one plain string (e.g. 008)
    # rather than as an (ind1, ind2, subfields) sequence.
    _STRING_TYPES = (bytes, type(u''))

    def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
        """Parse ``data`` as MARC and record identifier and size estimates.

        :param data: raw MARC data, or an object exposing
            ``get_raw(session)`` that returns it.
        :param xml: accepted for signature compatibility; not used here.
        :param docId: stored as ``self.id``.
        :param wordCount: word count to store; estimated from the string
            form of the MARC object when falsy.
        :param byteCount: byte count to store; the length of the string
            form of the MARC object when falsy.
        """
        # The previous body called ``doc.get_raw(session)``, but neither
        # ``doc`` nor ``session`` exists in this scope, so construction
        # always raised NameError.  Use the ``data`` argument instead.
        if hasattr(data, 'get_raw'):
            # NOTE(review): no session is available in the constructor, so
            # None is passed -- confirm get_raw() tolerates that.
            txt = data.get_raw(None)
        else:
            txt = data
        self.marc = MARC(txt)
        self.id = docId

        # Estimate number of words: whitespace-separated tokens of the
        # display form, minus two tokens per display line.
        display = str(self.marc)
        if not wordCount:
            wordCount = len(display.split()) - (len(display.split('\n')) * 2)
        self.wordCount = wordCount
        self.byteCount = byteCount if byteCount else len(display)

        self.decoder = MARC8_to_Unicode()
        self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

    @staticmethod
    def _steps_from_raw_xpath(xpath):
        """Split a raw ``fldNNN/sub`` string into ``(axis, name)`` steps.

        Returns an empty list when ``xpath`` is not string-like or holds no
        path components.  Only plain slash-separated names are understood.
        """
        try:
            names = [part for part in xpath.strip().split('/') if part]
        except AttributeError:
            return []
        return [('child', name) for name in names]

    def _collect_values(self, fld, data, subfield):
        """Pick the requested raw values out of the entries of one field."""
        if fld in (0, 1):
            # Fields 0 and 1 are returned whole, whatever the subfield.
            return data

        vals = []
        for d in data:
            if not subfield:
                if isinstance(d, self._STRING_TYPES):
                    # Entry is a single string (as the 008 slicing below
                    # implies for that field): return it whole.  Indexing
                    # d[2] as a subfield list used to raise here.
                    vals.append(d)
                else:
                    vals.append(' '.join([x[1] for x in d[2]]))
            elif subfield == 'ind1':
                vals.append(d[0])
            elif subfield == 'ind2':
                vals.append(d[1])
            elif fld == 8:
                # Named slices of the fixed-length 008 string.
                if subfield == 'lang':
                    vals.append(d[35:38])
                elif subfield == 'date':
                    vals.append(d[:6])
                elif subfield == 'pubStatus':
                    vals.append(d[6])
                elif subfield == 'date1':
                    vals.append(d[7:11])
                elif subfield == 'date2':
                    vals.append(d[11:15])
                elif subfield == 'pubPlace':
                    vals.append(d[15:18])
            else:
                for x in d[2]:
                    try:
                        if x[0] == subfield:
                            vals.append(x[1])
                    except (IndexError, KeyError, TypeError):
                        # Malformed subfield entry: skip it (best effort).
                        continue
        return vals

    def _to_text(self, value):
        """Decode one raw value, falling back step by step.

        Order: UTF-8 decode, then ``self.decoder.translate`` followed by
        NFC normalisation, then replacing characters matched by
        ``self.asciiRe`` with ``'?'``.
        """
        try:
            return value.decode('utf-8')
        except (UnicodeError, AttributeError):
            pass
        try:
            convtd = self.decoder.translate(value)
            return unicodedata.normalize('NFC', convtd)
        except Exception:
            # The decoder's failure modes are not visible here; keep the
            # best-effort fallback but let KeyboardInterrupt/SystemExit
            # propagate (a bare ``except`` swallowed them).
            return self.asciiRe.sub('?', value)

    def process_xpath(self, session, xpath, maps={}):
        """Return the decoded values addressed by a ``fldNNN[/sub]`` path.

        :param session: not used by this implementation.
        :param xpath: either a list whose second item is a sequence of
            steps (each step holding its name at index 1), or a raw
            ``'fldNNN/sub'`` string.
        :param maps: not used by this implementation (never mutated).
        :returns: list of values; empty when the path is unusable or the
            field is absent.

        Recognised subfield names: ``ind1``/``ind2``, and for field 8
        ``lang``, ``date``, ``pubStatus``, ``date1``, ``date2`` and
        ``pubPlace``; any other name is matched against subfield codes.
        """
        if isinstance(xpath, list):
            xp = xpath[1]
        else:
            # Raw XPath.  The former verifier call is disabled, which left
            # ``c`` undefined (NameError); parse the simple form locally.
            xp = self._steps_from_raw_xpath(xpath)
            if not xp:
                return []

        # format: fldNNN/a
        try:
            fld = int(xp[0][1][3:])
        except (ValueError, IndexError):
            # not a NNN not an int, or no usable first step
            return []

        fields = self.marc.fields
        if fld not in fields:
            return []
        data = fields[fld]

        subfield = xp[1][1] if len(xp) > 1 else ""

        return [self._to_text(v)
                for v in self._collect_values(fld, data, subfield)]

    def get_dom(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_sax(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_xml(self, session):
        """Return the record serialised by ``self.marc.toMARCXML()``."""
        return self.marc.toMARCXML()

    def fetch_vector(self, session, index, summary=False):
        """Delegate to ``index.indexStore.fetch_vector`` for this record."""
        return index.indexStore.fetch_vector(session, index, self, summary)
class MarcRecord(Record):
    """For dealing with Library MARC Records.

    Holds a parsed ``MARC`` object in ``self.marc`` and answers simple
    ``fldNNN`` / ``fldNNN/<subfield>`` lookups against ``self.marc.fields``
    through :meth:`process_xpath`.  DOM and SAX views are not available.
    """

    # NOTE(review): an earlier definition of MarcRecord appears above in
    # this file; this later one is the binding that remains after import.

    # Text/byte string types on both Python 2 and Python 3.  Used to
    # recognise field entries stored as one plain string (e.g. 008)
    # rather than as an (ind1, ind2, subfields) sequence.
    _STRING_TYPES = (bytes, type(u''))

    def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
        """Parse ``data`` as MARC and record identifier and size estimates.

        :param data: raw MARC data, or an object exposing
            ``get_raw(session)`` that returns it.
        :param xml: accepted for signature compatibility; not used here.
        :param docId: stored as ``self.id``.
        :param wordCount: word count to store; estimated from the string
            form of the MARC object when falsy.
        :param byteCount: byte count to store; the length of the string
            form of the MARC object when falsy.
        """
        # The previous body called ``doc.get_raw(session)``, but neither
        # ``doc`` nor ``session`` exists in this scope, so construction
        # always raised NameError.  Use the ``data`` argument instead.
        if hasattr(data, 'get_raw'):
            # NOTE(review): no session is available in the constructor, so
            # None is passed -- confirm get_raw() tolerates that.
            txt = data.get_raw(None)
        else:
            txt = data
        self.marc = MARC(txt)
        self.id = docId

        # Estimate number of words: whitespace-separated tokens of the
        # display form, minus two tokens per display line.
        display = str(self.marc)
        if not wordCount:
            wordCount = len(display.split()) - (len(display.split('\n')) * 2)
        self.wordCount = wordCount
        self.byteCount = byteCount if byteCount else len(display)

        self.decoder = MARC8_to_Unicode()
        self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

    @staticmethod
    def _steps_from_raw_xpath(xpath):
        """Split a raw ``fldNNN/sub`` string into ``(axis, name)`` steps.

        Returns an empty list when ``xpath`` is not string-like or holds no
        path components.  Only plain slash-separated names are understood.
        """
        try:
            names = [part for part in xpath.strip().split('/') if part]
        except AttributeError:
            return []
        return [('child', name) for name in names]

    def _collect_values(self, fld, data, subfield):
        """Pick the requested raw values out of the entries of one field."""
        if fld in (0, 1):
            # Fields 0 and 1 are returned whole, whatever the subfield.
            return data

        vals = []
        for d in data:
            if not subfield:
                if isinstance(d, self._STRING_TYPES):
                    # Entry is a single string (as the 008 slicing below
                    # implies for that field): return it whole.  Indexing
                    # d[2] as a subfield list used to raise here.
                    vals.append(d)
                else:
                    vals.append(' '.join([x[1] for x in d[2]]))
            elif subfield == 'ind1':
                vals.append(d[0])
            elif subfield == 'ind2':
                vals.append(d[1])
            elif fld == 8:
                # Named slices of the fixed-length 008 string.
                if subfield == 'lang':
                    vals.append(d[35:38])
                elif subfield == 'date':
                    vals.append(d[:6])
                elif subfield == 'pubStatus':
                    vals.append(d[6])
                elif subfield == 'date1':
                    vals.append(d[7:11])
                elif subfield == 'date2':
                    vals.append(d[11:15])
                elif subfield == 'pubPlace':
                    vals.append(d[15:18])
            else:
                for x in d[2]:
                    try:
                        if x[0] == subfield:
                            vals.append(x[1])
                    except (IndexError, KeyError, TypeError):
                        # Malformed subfield entry: skip it (best effort).
                        continue
        return vals

    def _to_text(self, value):
        """Decode one raw value, falling back step by step.

        Order: UTF-8 decode, then ``self.decoder.translate`` followed by
        NFC normalisation, then replacing characters matched by
        ``self.asciiRe`` with ``'?'``.
        """
        try:
            return value.decode('utf-8')
        except (UnicodeError, AttributeError):
            pass
        try:
            convtd = self.decoder.translate(value)
            return unicodedata.normalize('NFC', convtd)
        except Exception:
            # The decoder's failure modes are not visible here; keep the
            # best-effort fallback but let KeyboardInterrupt/SystemExit
            # propagate (a bare ``except`` swallowed them).
            return self.asciiRe.sub('?', value)

    def process_xpath(self, session, xpath, maps={}):
        """Return the decoded values addressed by a ``fldNNN[/sub]`` path.

        :param session: not used by this implementation.
        :param xpath: either a list whose second item is a sequence of
            steps (each step holding its name at index 1), or a raw
            ``'fldNNN/sub'`` string.
        :param maps: not used by this implementation (never mutated).
        :returns: list of values; empty when the path is unusable or the
            field is absent.

        Recognised subfield names: ``ind1``/``ind2``, and for field 8
        ``lang``, ``date``, ``pubStatus``, ``date1``, ``date2`` and
        ``pubPlace``; any other name is matched against subfield codes.
        """
        if isinstance(xpath, list):
            xp = xpath[1]
        else:
            # Raw XPath.  The former verifier call is disabled, which left
            # ``c`` undefined (NameError); parse the simple form locally.
            xp = self._steps_from_raw_xpath(xpath)
            if not xp:
                return []

        # format: fldNNN/a
        try:
            fld = int(xp[0][1][3:])
        except (ValueError, IndexError):
            # not a NNN not an int, or no usable first step
            return []

        fields = self.marc.fields
        if fld not in fields:
            return []
        data = fields[fld]

        subfield = xp[1][1] if len(xp) > 1 else ""

        return [self._to_text(v)
                for v in self._collect_values(fld, data, subfield)]

    def get_dom(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_sax(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_xml(self, session):
        """Return the record serialised by ``self.marc.toMARCXML()``."""
        return self.marc.toMARCXML()

    def fetch_vector(self, session, index, summary=False):
        """Delegate to ``index.indexStore.fetch_vector`` for this record."""
        return index.indexStore.fetch_vector(session, index, self, summary)