def process_document(self, session, doc):
    """Transform a raw MARC document into an SGML StringDocument.

    Reads the raw record bytes from ``doc``, parses them with MARC,
    and wraps the SGML rendering in a new StringDocument that carries
    over the source document's history, parent and filename.
    """
    raw = doc.get_raw(session)
    record = MARC(raw)
    sgml = record.toSGML()
    return StringDocument(sgml,
                          self.id,
                          doc.processHistory,
                          mimeType='text/sgml',
                          parent=doc.parent,
                          filename=doc.filename)
def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
    """Initialise a MARC-backed record from raw transmission data.

    data -- raw MARC record data to parse
    xml -- unused here; kept for signature compatibility with callers
    docId -- identifier to assign to this record
    wordCount -- pre-computed word count; estimated from the display
                 form when 0
    byteCount -- pre-computed byte count; length of the display form
                 when 0
    """
    # Bug fix: the original read `doc.get_raw(session)`, but neither
    # `doc` nor `session` is in scope here -- the raw record is the
    # `data` argument.
    self.marc = MARC(data)
    self.id = docId
    # Estimate number of words from the human-readable rendering,
    # discounting roughly two tokens of tag/indicator overhead per line.
    display = str(self.marc)
    if not wordCount:
        wordCount = len(display.split()) - (len(display.split('\n')) * 2)
    self.wordCount = wordCount
    if byteCount:
        self.byteCount = byteCount
    else:
        self.byteCount = len(display)
    self.decoder = MARC8_to_Unicode()
    # Control characters / high-bit bytes that survive all decode
    # attempts; these get replaced with '?' as a last resort.
    self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')
def process_record(self, session, rec):
    """Convert a MARCXML DOM record into a raw-MARC StringDocument.

    Walks the controlfield and datafield elements of the record's DOM,
    rebuilds the fields dict expected by MARC, and serialises the
    result with get_MARC().
    """
    fields = {}
    tree = rec.get_dom(session)
    # Control fields: tag -> list of raw text values.
    try:
        walker = tree.getiterator("controlfield")
    except AttributeError:
        # lxml 1.3 or later
        walker = tree.iter("controlfield")
    for element in walker:
        tag = self._process_tagName(element.get('tag'))
        contents = element.text
        fields.setdefault(tag, []).append(contents)
    # Data fields: tag -> list of (ind1, ind2, [(code, text), ...]).
    try:
        walker = tree.getiterator("datafield")
    except AttributeError:
        # lxml 1.3 or later
        walker = tree.iter("datafield")
    for element in walker:
        tag = self._process_tagName(element.get('tag'))
        try:
            children = element.getiterator('subfield')
        except AttributeError:
            # lxml 1.3 or later.
            # Bug fix: the original assigned to `walker` here, leaving
            # `children` undefined and raising NameError on new lxml.
            children = element.iter('subfield')
        subelements = [(c.get('code'), c.text) for c in children]
        contents = (element.get('ind1'), element.get('ind2'), subelements)
        fields.setdefault(tag, []).append(contents)
    # Pseudo-field 0 carries selected leader bytes (positions 5-9 and
    # 17-19) so downstream code can reach record status/type info.
    leader = tree.xpath('//leader')[0]
    l = leader.text
    fields[0] = [''.join([l[5:10], l[17:20]])]
    marcObject = MARC()
    marcObject.fields = fields
    return StringDocument(marcObject.get_MARC())
class MarcRecord(Record):
    """For dealing with Library MARC Records."""

    def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
        """Initialise from raw MARC transmission data.

        data -- raw MARC record data to parse
        xml -- unused; kept for signature compatibility
        docId -- identifier to assign to this record
        wordCount/byteCount -- pre-computed sizes; estimated when 0
        """
        # Bug fix: the original read `doc.get_raw(session)` with `doc`
        # and `session` undefined; the raw record is the `data` arg.
        self.marc = MARC(data)
        self.id = docId
        # Estimate number of words from the display rendering,
        # discounting ~2 tokens of tag/indicator overhead per line.
        display = str(self.marc)
        if not wordCount:
            wordCount = len(display.split()) - (len(display.split('\n')) * 2)
        self.wordCount = wordCount
        if byteCount:
            self.byteCount = byteCount
        else:
            self.byteCount = len(display)
        self.decoder = MARC8_to_Unicode()
        # Control chars / high-bit bytes that survive every decode
        # attempt; replaced with '?' as a last resort.
        self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

    def process_xpath(self, session, xpath, maps={}):
        """Evaluate a pseudo-XPath of the form fldNNN[/subfield].

        Returns a list of unicode values extracted from the matching
        MARC field, or [] when the path is invalid or absent.
        """
        if not isinstance(xpath, list):
            # Raw XPath string: verify and normalise it first.
            c = utils.verifyXPaths([xpath])
            if not c or not c[0][1]:
                return []
            else:
                xpath = c[0]
        xp = xpath[1]
        # format: fldNNN/a
        try:
            fld = int(xp[0][1][3:])
        except ValueError:
            # not a NNN -> not an int
            return []
        if fld in self.marc.fields:
            data = self.marc.fields[fld]
        else:
            return []
        if len(xp) > 1:
            subfield = xp[1][1]
        else:
            subfield = ""
        vals = []
        if fld in [0, 1]:
            # Leader-derived pseudo-fields are returned verbatim.
            vals = data
        else:
            for d in data:
                if not subfield:
                    vals.append(' '.join([x[1] for x in d[2]]))
                elif subfield == 'ind1':
                    vals.append(d[0])
                elif subfield == 'ind2':
                    vals.append(d[1])
                elif fld == 8:
                    # 008 fixed-length field: named positional slices.
                    if not subfield:
                        vals.append(d)
                    elif subfield == 'lang':
                        vals.append(d[35:38])
                    elif subfield == 'date':
                        vals.append(d[:6])
                    elif subfield == 'pubStatus':
                        vals.append(d[6])
                    elif subfield == 'date1':
                        vals.append(d[7:11])
                    elif subfield == 'date2':
                        vals.append(d[11:15])
                    elif subfield == 'pubPlace':
                        vals.append(d[15:18])
                else:
                    for x in d[2]:
                        try:
                            if x[0] == subfield:
                                vals.append(x[1])
                        except Exception:
                            # broken subfield tuple; skip it
                            pass
        # Best-effort transcoding: UTF-8 first, then MARC-8, then
        # replace unconvertable bytes outright.
        nvals = []
        for v in vals:
            try:
                nvals.append(v.decode('utf-8'))
            except Exception:
                try:
                    convtd = self.decoder.translate(v)
                    nvals.append(unicodedata.normalize('NFC', convtd))
                except Exception:
                    # strip out any totally broken characters
                    v = self.asciiRe.sub('?', v)
                    nvals.append(v)
        return nvals

    def get_dom(self, session):
        # DOM access is not meaningful for a binary MARC record.
        raise NotImplementedError

    def get_sax(self, session):
        # SAX access is not meaningful for a binary MARC record.
        raise NotImplementedError

    def get_xml(self, session):
        return self.marc.toMARCXML()

    def fetch_vector(self, session, index, summary=False):
        return index.indexStore.fetch_vector(session, index, self, summary)
class MarcRecord(Record):
    """For dealing with Library MARC Records."""

    def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
        """Initialise from raw MARC transmission data.

        data -- raw MARC record data to parse
        xml -- unused; kept for signature compatibility
        docId -- identifier to assign to this record
        wordCount/byteCount -- pre-computed sizes; estimated when 0
        """
        # Bug fix: the original read `doc.get_raw(session)` with `doc`
        # and `session` undefined; the raw record is the `data` arg.
        self.marc = MARC(data)
        self.id = docId
        # Estimate number of words from the display rendering,
        # discounting ~2 tokens of tag/indicator overhead per line.
        display = str(self.marc)
        if not wordCount:
            wordCount = len(display.split()) - (len(display.split('\n')) * 2)
        self.wordCount = wordCount
        if byteCount:
            self.byteCount = byteCount
        else:
            self.byteCount = len(display)
        self.decoder = MARC8_to_Unicode()
        # Control chars / high-bit bytes that survive every decode
        # attempt; replaced with '?' as a last resort.
        self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

    def process_xpath(self, session, xpath, maps={}):
        """Evaluate a pseudo-XPath of the form fldNNN[/subfield].

        Returns a list of unicode values extracted from the matching
        MARC field, or [] when the path is invalid or absent.
        """
        if not isinstance(xpath, list):
            # Raw XPath string: verify and normalise it first.
            c = utils.verifyXPaths([xpath])
            if not c or not c[0][1]:
                return []
            else:
                xpath = c[0]
        xp = xpath[1]
        # format: fldNNN/a
        try:
            fld = int(xp[0][1][3:])
        except ValueError:
            # not a NNN -> not an int
            return []
        if fld in self.marc.fields:
            data = self.marc.fields[fld]
        else:
            return []
        if len(xp) > 1:
            subfield = xp[1][1]
        else:
            subfield = ""
        vals = []
        if fld in [0, 1]:
            # Leader-derived pseudo-fields are returned verbatim.
            vals = data
        else:
            for d in data:
                if not subfield:
                    vals.append(' '.join([x[1] for x in d[2]]))
                elif subfield == 'ind1':
                    vals.append(d[0])
                elif subfield == 'ind2':
                    vals.append(d[1])
                elif fld == 8:
                    # 008 fixed-length field: named positional slices.
                    if not subfield:
                        vals.append(d)
                    elif subfield == 'lang':
                        vals.append(d[35:38])
                    elif subfield == 'date':
                        vals.append(d[:6])
                    elif subfield == 'pubStatus':
                        vals.append(d[6])
                    elif subfield == 'date1':
                        vals.append(d[7:11])
                    elif subfield == 'date2':
                        vals.append(d[11:15])
                    elif subfield == 'pubPlace':
                        vals.append(d[15:18])
                else:
                    for x in d[2]:
                        try:
                            if x[0] == subfield:
                                vals.append(x[1])
                        except Exception:
                            # broken subfield tuple; skip it
                            pass
        # Best-effort transcoding: UTF-8 first, then MARC-8, then
        # replace unconvertable bytes outright.
        nvals = []
        for v in vals:
            try:
                nvals.append(v.decode('utf-8'))
            except Exception:
                try:
                    convtd = self.decoder.translate(v)
                    nvals.append(unicodedata.normalize('NFC', convtd))
                except Exception:
                    # strip out any totally broken characters
                    v = self.asciiRe.sub('?', v)
                    nvals.append(v)
        return nvals

    def get_dom(self, session):
        # DOM access is not meaningful for a binary MARC record.
        raise NotImplementedError

    def get_sax(self, session):
        # SAX access is not meaningful for a binary MARC record.
        raise NotImplementedError

    def get_xml(self, session):
        return self.marc.toMARCXML()

    def fetch_vector(self, session, index, summary=False):
        return index.indexStore.fetch_vector(session, index, self, summary)