예제 #1
0
 def process_document(self, session, doc):
     """Convert a raw MARC document into an SGML ``StringDocument``.

     The raw data is read from *doc*, parsed with ``MARC`` and serialized
     via ``toSGML()``.  The new document is created with this object's
     ``id`` and carries over the source document's process history,
     parent and filename.
     """
     marc_record = MARC(doc.get_raw(session))
     sgml = marc_record.toSGML()
     return StringDocument(
         sgml,
         self.id,
         doc.processHistory,
         mimeType='text/sgml',
         parent=doc.parent,
         filename=doc.filename,
     )
예제 #2
0
 def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
     """Build a MARC-backed record from raw MARC data.

     Args:
         data: Raw MARC data, handed to ``MARC`` for parsing.
         xml: Not used by this constructor.
         docId: Identifier stored as ``self.id``.
         wordCount: Word count to record; when falsy it is estimated from
             the string form of the parsed record.
         byteCount: Byte count to record; when falsy the length of the
             string form of the parsed record is used.
     """
     # ``data`` is the only MARC payload in scope: there is no ``doc`` or
     # ``session`` available in a constructor, so parse ``data`` directly.
     self.marc = MARC(data)
     self.id = docId
     display = str(self.marc)
     if not wordCount:
         # Estimate: whitespace-separated tokens, minus two per line
         # (presumably discounting per-line tag/indicator tokens).
         wordCount = len(display.split()) - (len(display.split('\n')) * 2)
     self.wordCount = wordCount
     self.byteCount = byteCount if byteCount else len(display)
     self.decoder = MARC8_to_Unicode()
     # Matches characters replaced with '?' when a value cannot be decoded.
     self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')
    def process_record(self, session, rec):
        """Rebuild a MARC document from a record's XML tree.

        Collects the text of every ``controlfield`` element and the
        indicators/subfields of every ``datafield`` element, keyed by the
        tag returned from ``self._process_tagName``, then serializes them
        through a ``MARC`` object.

        Args:
            session: Passed through to ``rec.get_dom``.
            rec: Record whose DOM holds ``leader``, ``controlfield`` and
                ``datafield`` elements.

        Returns:
            A ``StringDocument`` wrapping the output of ``MARC.get_MARC()``.

        Raises:
            IndexError: If the ``//leader`` lookup returns no element.
        """
        def descendants(node, name):
            # Prefer the current ``iter`` API; ``getiterator`` is the
            # deprecated spelling, kept only as a fallback for old trees.
            try:
                return node.iter(name)
            except AttributeError:
                return node.getiterator(name)

        fields = {}
        tree = rec.get_dom(session)

        for element in descendants(tree, "controlfield"):
            tag = self._process_tagName(element.get('tag'))
            fields.setdefault(tag, []).append(element.text)

        for element in descendants(tree, "datafield"):
            tag = self._process_tagName(element.get('tag'))
            # The fallback path previously rebound ``walker`` instead of the
            # subfield iterator, leaving the subfield list unbound or stale.
            subelements = [(c.get('code'), c.text)
                           for c in descendants(element, 'subfield')]
            contents = (element.get('ind1'), element.get('ind2'), subelements)
            fields.setdefault(tag, []).append(contents)

        leader_text = tree.xpath('//leader')[0].text
        # Field 0 stores leader characters 5-9 and 17-19 joined together.
        fields[0] = [''.join([leader_text[5:10], leader_text[17:20]])]
        marcObject = MARC()
        marcObject.fields = fields
        return StringDocument(marcObject.get_MARC())
예제 #4
0
    def process_record(self, session, rec):
        """Rebuild a MARC document from a record's XML tree.

        Collects the text of every ``controlfield`` element and the
        indicators/subfields of every ``datafield`` element, keyed by the
        tag returned from ``self._process_tagName``, then serializes them
        through a ``MARC`` object.

        Args:
            session: Passed through to ``rec.get_dom``.
            rec: Record whose DOM holds ``leader``, ``controlfield`` and
                ``datafield`` elements.

        Returns:
            A ``StringDocument`` wrapping the output of ``MARC.get_MARC()``.

        Raises:
            IndexError: If the ``//leader`` lookup returns no element.
        """
        def descendants(node, name):
            # Prefer the current ``iter`` API; ``getiterator`` is the
            # deprecated spelling, kept only as a fallback for old trees.
            try:
                return node.iter(name)
            except AttributeError:
                return node.getiterator(name)

        fields = {}
        tree = rec.get_dom(session)

        for element in descendants(tree, "controlfield"):
            tag = self._process_tagName(element.get('tag'))
            fields.setdefault(tag, []).append(element.text)

        for element in descendants(tree, "datafield"):
            tag = self._process_tagName(element.get('tag'))
            # The fallback path previously rebound ``walker`` instead of the
            # subfield iterator, leaving the subfield list unbound or stale.
            subelements = [(c.get('code'), c.text)
                           for c in descendants(element, 'subfield')]
            contents = (element.get('ind1'), element.get('ind2'), subelements)
            fields.setdefault(tag, []).append(contents)

        leader_text = tree.xpath('//leader')[0].text
        # Field 0 stores leader characters 5-9 and 17-19 joined together.
        fields[0] = [''.join([leader_text[5:10], leader_text[17:20]])]
        marcObject = MARC()
        marcObject.fields = fields
        return StringDocument(marcObject.get_MARC())
예제 #5
0
 def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
     """Build a MARC-backed record from raw MARC data.

     Args:
         data: Raw MARC data, handed to ``MARC`` for parsing.
         xml: Not used by this constructor.
         docId: Identifier stored as ``self.id``.
         wordCount: Word count to record; when falsy it is estimated from
             the string form of the parsed record.
         byteCount: Byte count to record; when falsy the length of the
             string form of the parsed record is used.
     """
     # ``data`` is the only MARC payload in scope: there is no ``doc`` or
     # ``session`` available in a constructor, so parse ``data`` directly.
     self.marc = MARC(data)
     self.id = docId
     display = str(self.marc)
     if not wordCount:
         # Estimate: whitespace-separated tokens, minus two per line
         # (presumably discounting per-line tag/indicator tokens).
         wordCount = len(display.split()) - (len(display.split('\n')) * 2)
     self.wordCount = wordCount
     self.byteCount = byteCount if byteCount else len(display)
     self.decoder = MARC8_to_Unicode()
     # Matches characters replaced with '?' when a value cannot be decoded.
     self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')
예제 #6
0
 def process_document(self, session, doc):
     """Return *doc*'s MARC content re-serialized as an SGML document.

     The raw data from *doc* is parsed by ``MARC`` and its ``toSGML()``
     output is wrapped in a ``StringDocument`` built with this object's
     ``id`` and the source document's process history, parent and
     filename.
     """
     raw_marc = doc.get_raw(session)
     parsed = MARC(raw_marc)
     return StringDocument(
         parsed.toSGML(),
         self.id,
         doc.processHistory,
         mimeType='text/sgml',
         parent=doc.parent,
         filename=doc.filename,
     )
예제 #7
0
class MarcRecord(Record):
    """Record wrapper for library MARC data.

    Parses raw MARC into a ``MARC`` object and answers simple
    ``fldNNN/subfield`` lookups against that object's ``fields`` mapping.
    """

    # Named slices of the 008 fixed-length control field, keyed by the
    # pseudo-subfield name accepted by ``process_xpath``.
    _FIELD_008_PARTS = {
        'lang': slice(35, 38),
        'date': slice(None, 6),
        'pubStatus': 6,
        'date1': slice(7, 11),
        'date2': slice(11, 15),
        'pubPlace': slice(15, 18),
    }

    def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
        """Build a MARC-backed record from raw MARC data.

        Args:
            data: Raw MARC data, handed to ``MARC`` for parsing.
            xml: Not used by this constructor.
            docId: Identifier stored as ``self.id``.
            wordCount: Word count to record; when falsy it is estimated
                from the string form of the parsed record.
            byteCount: Byte count to record; when falsy the length of the
                string form of the parsed record is used.
        """
        # ``data`` is the only MARC payload in scope: there is no ``doc``
        # or ``session`` in a constructor, so parse ``data`` directly.
        self.marc = MARC(data)
        self.id = docId
        display = str(self.marc)
        if not wordCount:
            # Estimate: whitespace-separated tokens, minus two per line
            # (presumably discounting per-line tag/indicator tokens).
            wordCount = len(display.split()) - (len(display.split('\n')) * 2)
        self.wordCount = wordCount
        self.byteCount = byteCount if byteCount else len(display)
        self.decoder = MARC8_to_Unicode()
        # Matches characters replaced with '?' when decoding fails.
        self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

    def process_xpath(self, session, xpath, maps={}):
        """Return the values selected by a ``fldNNN[/subfield]`` path.

        Args:
            session: Not used by this method.
            xpath: Either a pre-parsed path (a list whose item at index 1
                is a sequence of steps, each holding its name at index 1)
                or a raw string such as ``"fld245/a"``.
            maps: Not used by this method.

        Returns:
            A list of decoded values.  The list is empty when the field
            number cannot be parsed, the field is absent, or a raw path
            cannot be split into steps.
        """
        if isinstance(xpath, list):
            xp = xpath[1]
        else:
            # Raw path: split "fldNNN/a" into steps shaped like the
            # pre-parsed form (step name at index 1).
            try:
                names = [name for name in xpath.split('/') if name]
            except AttributeError:
                return []
            if not names:
                return []
            xp = [(None, name) for name in names]

        # format:  fldNNN/a
        try:
            fld = int(xp[0][1][3:])
        except ValueError:
            # Whatever follows the three-character prefix is not a number.
            return []
        if fld not in self.marc.fields:
            return []
        data = self.marc.fields[fld]
        subfield = xp[1][1] if len(xp) > 1 else ""

        vals = []
        if fld in [0, 1]:
            vals = data
        elif fld == 8:
            # 008 values are plain strings, so this must be handled before
            # the indicator/subfield logic, which indexes into tuples.
            for d in data:
                if not subfield:
                    vals.append(d)
                elif subfield in self._FIELD_008_PARTS:
                    vals.append(d[self._FIELD_008_PARTS[subfield]])
        else:
            for d in data:
                if not subfield:
                    vals.append(' '.join([x[1] for x in d[2]]))
                elif subfield == 'ind1':
                    vals.append(d[0])
                elif subfield == 'ind2':
                    vals.append(d[1])
                else:
                    for x in d[2]:
                        try:
                            if x[0] == subfield:
                                vals.append(x[1])
                        except Exception:
                            # Malformed subfield entry; skip it.
                            continue
        return [self._decode_value(v) for v in vals]

    def _decode_value(self, v):
        """Decode one raw value, falling back from UTF-8 to MARC-8 to '?'."""
        try:
            return v.decode('utf-8')
        except Exception:
            try:
                convtd = self.decoder.translate(v)
                return unicodedata.normalize('NFC', convtd)
            except Exception:
                # Last resort: mask every character the regex rejects.
                return self.asciiRe.sub('?', v)

    def get_dom(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_sax(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_xml(self, session):
        """Return the record serialized by ``MARC.toMARCXML()``."""
        return self.marc.toMARCXML()

    def fetch_vector(self, session, index, summary=False):
        """Delegate to ``index.indexStore.fetch_vector`` for this record."""
        return index.indexStore.fetch_vector(session, index, self, summary)
예제 #8
0
class MarcRecord(Record):
    """Record wrapper for library MARC data.

    Parses raw MARC into a ``MARC`` object and answers simple
    ``fldNNN/subfield`` lookups against that object's ``fields`` mapping.
    """

    # Named slices of the 008 fixed-length control field, keyed by the
    # pseudo-subfield name accepted by ``process_xpath``.
    _FIELD_008_PARTS = {
        'lang': slice(35, 38),
        'date': slice(None, 6),
        'pubStatus': 6,
        'date1': slice(7, 11),
        'date2': slice(11, 15),
        'pubPlace': slice(15, 18),
    }

    def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
        """Build a MARC-backed record from raw MARC data.

        Args:
            data: Raw MARC data, handed to ``MARC`` for parsing.
            xml: Not used by this constructor.
            docId: Identifier stored as ``self.id``.
            wordCount: Word count to record; when falsy it is estimated
                from the string form of the parsed record.
            byteCount: Byte count to record; when falsy the length of the
                string form of the parsed record is used.
        """
        # ``data`` is the only MARC payload in scope: there is no ``doc``
        # or ``session`` in a constructor, so parse ``data`` directly.
        self.marc = MARC(data)
        self.id = docId
        display = str(self.marc)
        if not wordCount:
            # Estimate: whitespace-separated tokens, minus two per line
            # (presumably discounting per-line tag/indicator tokens).
            wordCount = len(display.split()) - (len(display.split('\n')) * 2)
        self.wordCount = wordCount
        self.byteCount = byteCount if byteCount else len(display)
        self.decoder = MARC8_to_Unicode()
        # Matches characters replaced with '?' when decoding fails.
        self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

    def process_xpath(self, session, xpath, maps={}):
        """Return the values selected by a ``fldNNN[/subfield]`` path.

        Args:
            session: Not used by this method.
            xpath: Either a pre-parsed path (a list whose item at index 1
                is a sequence of steps, each holding its name at index 1)
                or a raw string such as ``"fld245/a"``.
            maps: Not used by this method.

        Returns:
            A list of decoded values.  The list is empty when the field
            number cannot be parsed, the field is absent, or a raw path
            cannot be split into steps.
        """
        if isinstance(xpath, list):
            xp = xpath[1]
        else:
            # Raw path: split "fldNNN/a" into steps shaped like the
            # pre-parsed form (step name at index 1).
            try:
                names = [name for name in xpath.split('/') if name]
            except AttributeError:
                return []
            if not names:
                return []
            xp = [(None, name) for name in names]

        # format:  fldNNN/a
        try:
            fld = int(xp[0][1][3:])
        except ValueError:
            # Whatever follows the three-character prefix is not a number.
            return []
        if fld not in self.marc.fields:
            return []
        data = self.marc.fields[fld]
        subfield = xp[1][1] if len(xp) > 1 else ""

        vals = []
        if fld in [0, 1]:
            vals = data
        elif fld == 8:
            # 008 values are plain strings, so this must be handled before
            # the indicator/subfield logic, which indexes into tuples.
            for d in data:
                if not subfield:
                    vals.append(d)
                elif subfield in self._FIELD_008_PARTS:
                    vals.append(d[self._FIELD_008_PARTS[subfield]])
        else:
            for d in data:
                if not subfield:
                    vals.append(' '.join([x[1] for x in d[2]]))
                elif subfield == 'ind1':
                    vals.append(d[0])
                elif subfield == 'ind2':
                    vals.append(d[1])
                else:
                    for x in d[2]:
                        try:
                            if x[0] == subfield:
                                vals.append(x[1])
                        except Exception:
                            # Malformed subfield entry; skip it.
                            continue
        return [self._decode_value(v) for v in vals]

    def _decode_value(self, v):
        """Decode one raw value, falling back from UTF-8 to MARC-8 to '?'."""
        try:
            return v.decode('utf-8')
        except Exception:
            try:
                convtd = self.decoder.translate(v)
                return unicodedata.normalize('NFC', convtd)
            except Exception:
                # Last resort: mask every character the regex rejects.
                return self.asciiRe.sub('?', v)

    def get_dom(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_sax(self, session):
        """Not supported for MARC records."""
        raise NotImplementedError

    def get_xml(self, session):
        """Return the record serialized by ``MARC.toMARCXML()``."""
        return self.marc.toMARCXML()

    def fetch_vector(self, session, index, summary=False):
        """Delegate to ``index.indexStore.fetch_vector`` for this record."""
        return index.indexStore.fetch_vector(session, index, self, summary)