def find_documents(self, session, cache=0):
    """Generate Documents from an SRW search, paging through all results.

    cache=0: yield each Document lazily (generator behaviour).
    cache=2: collect all Documents in self.documents.
    Any other cache value raises NotImplementedError.
    """
    docs = []
    curr = 1
    while True:
        # Request the next page, starting at the current record offset
        self.stream.startRecord = curr
        resp = self.binding.RPC(
            self.binding.url,
            "searchRetrieveRequest",
            self.stream,
            requestclass=SRW.types.SearchRetrieveRequest,
            replytype=SRW.types.SearchRetrieveResponse.typecode,
            readerclass=reader)
        total = resp.numberOfRecords
        curr += len(resp.records)
        for d in resp.records:
            doc = StringDocument(d.recordData, mimeType='text/xml')
            # Preserve the schema the server returned the record in
            doc.recordSchema = d.recordSchema
            if cache == 0:
                yield doc
            elif cache == 2:
                docs.append(doc)
            else:
                raise NotImplementedError
        if curr > total:
            if cache == 0:
                # Python 2 idiom: terminate the generator
                raise StopIteration
            else:
                break
    self.documents = docs
def _getRecord(self):
    """Generate a Document per cached OAI identifier via GetRecord.

    For each id, issues an OAI-PMH GetRecord request, parses the
    response DOM and yields the first element child of <metadata>
    serialized back to XML.
    """
    for oaiid in self.idcache:
        s = "%sverb=GetRecord&%s" % (
            self.server,
            urllib.urlencode({'metadataPrefix': self.metadataPrefix,
                              'identifier': oaiid}))
        resp = self._fetchStream(s)
        data = resp.read()
        doc = StringDocument(data, self.id, mimeType='text/xml')
        rec = BSParser.process_document(None, doc)
        dom = rec.get_dom(session)
        # Find the first element child (the OAI-PMH root element)
        for top in dom.childNodes:
            if top.nodeType == elementType:
                break
        for c in top.childNodes:
            if (c.nodeType == elementType and
                    c.localName == 'GetRecord'):
                for c2 in c.childNodes:
                    if (c2.nodeType == elementType and
                            c2.localName == 'record'):
                        for c3 in c2.childNodes:
                            if (c3.nodeType == elementType and
                                    c3.localName == 'metadata'):
                                for c4 in c3.childNodes:
                                    if (c4.nodeType == elementType):
                                        data = c4.toxml()
                                        yield StringDocument(
                                            data, self.id,
                                            mimeType='text/xml')
                                # Only the first metadata element is used
                                break
                        break
                break
    # Python 2 idiom: terminate the generator
    raise StopIteration
def save_collocates(self, collocates, id):
    """Pickle *collocates*, store them under *id*, and return *id*."""
    pickled = Pickle.dumps(collocates)
    document = StringDocument(pickled)
    document.id = id
    store = self.collStore
    store.store_document(self.session, document)
    store.commit_storing(self.session)
    return id
def process_document(self, session, doc):
    """Run the AprioriTFPapp Java program over the document's data.

    Writes the raw data to a temporary file, invokes the TFP app from
    self.filePath, and parses 'itemset (freq)' lines from its output.
    Returns a StringDocument of (freq, itemset) tuples sorted by
    descending frequency, or of the raw output if nothing matched.
    """
    # Write out our temp file.  Reuse the fd returned by mkstemp
    # instead of leaking it and opening the path a second time.
    (fd, infn) = tempfile.mkstemp(".tfp")
    fh = os.fdopen(fd, 'w')
    fh.write(doc.get_raw(session))
    fh.close()
    # Go to TFP directory and run
    o = os.getcwd()
    os.chdir(self.filePath)
    try:
        results = commands.getoutput(
            "%s -Xms%sm -Xmx%sm AprioriTFPapp -F../%s -S%s -C%s" % (
                self.java, self.memory, self.memory, infn,
                self.support, self.confidence))
    finally:
        # Always restore the cwd and clean up the temp input file
        os.chdir(o)
        os.remove(infn)
    # Process results
    matches = []
    for l in results.split('\n'):
        m = freqRe.search(l)
        if m:
            # 'itemset' rather than 'set': don't shadow the builtin
            (itemset, freq) = m.groups()
            matches.append((int(freq), itemset))
    if not matches:
        # No frequent itemsets for some reason; return raw output
        return StringDocument(results)
    matches.sort(reverse=True)
    return StringDocument(matches)
def process_document(self, session, doc):
    """Run the external 'apriori' program over transaction data.

    Returns a StringDocument of [matches, rules]: matches is a list of
    (freq, itemset) and rules a list of (confidence, support,
    antecedent-ids, consequent-ids), both sorted descending.  If no
    itemsets were found, returns [raw-output, []] instead.
    """
    # Write out our temp input file.  Reuse the mkstemp fd instead of
    # leaking it and opening the path a second time.
    (infd, infn) = tempfile.mkstemp(".arm")
    fh = os.fdopen(infd, 'w')
    fh.write(doc.get_raw(session))
    fh.close()
    if self.absSupport:
        # Convert an absolute support count into a percentage of the
        # number of transactions (lines) in the input
        t = len(doc.get_raw(session).split('\n'))
        self.support = (float(self.absSupport) / float(t)) * 100
    # Output file: apriori only needs the name, so close the fd now
    (outfd, outfn) = tempfile.mkstemp(".txt")
    os.close(outfd)
    if self.confidence > 0:
        cmd = "apriori %s %s %f %s" % (infn, outfn, self.support / 100,
                                       self.confidence / 100)
    else:
        cmd = "apriori %s %s %s" % (infn, outfn, self.support / 100)
    results = commands.getoutput(cmd)
    inh = file(outfn)
    fis = self.fisre
    rule = self.rulere
    singleItems = self.singleItems
    matches = []
    rules = []
    for line in inh:
        # Frequent-set lines look like: N N N (N)
        # Rule lines look like:         N N ==> N (f, N)
        m = fis.match(line)
        if m:
            # 'itemset' rather than 'set': don't shadow the builtin
            (itemset, freq) = m.groups()
            if singleItems or itemset.find(' ') > -1:
                matches.append((int(freq), itemset))
        elif self.confidence > 0:
            m = rule.match(line)
            if m:
                (ante, conc, conf, supp) = m.groups()
                al = map(int, ante.split(' '))
                cl = map(int, conc.split(' '))
                rules.append((float(conf), int(supp), al, cl))
    inh.close()
    # Delete temp files!
    os.remove(outfn)
    os.remove(infn)
    if not matches:
        # No frequent itemsets for some reason; return raw output
        return StringDocument([results, []])
    matches.sort(reverse=True)
    rules.sort(reverse=True)
    # (Removed the dead os.getcwd()/os.chdir(o) pair: the chdir into
    # self.filePath was already commented out, so cwd never changed.)
    return StringDocument([matches, rules])
def fetch_document(self, session, id):
    """Fetch stored data for *id* and return it wrapped as a Document.

    Applies any configured out-going preparser or workflow; raises
    ObjectDeletedException / ObjectDoesNotExistException as needed.
    """
    handler = self.permissionHandlers.get(
        'info:srw/operation/2/retrieve', None)
    if handler:
        if not session.user:
            raise PermissionException(
                "Authenticated user required to retrieve an object from %s"
                % self.id)
        if not handler.hasPermission(session, session.user):
            raise PermissionException(
                "Permission required to retrieve an object from %s"
                % self.id)
    data = self.fetch_data(session, id)
    if data:
        document = StringDocument(data)
        if self.outPreParser is not None:
            document = self.outPreParser.process_document(session, document)
        elif self.outWorkflow is not None:
            document = self.outWorkflow.process(session, document)
        document.id = id
        document.documentStore = self.id
        document.parent = ('document', self.id, id)
        return document
    # DeletedObject placeholders are falsy, hence checked after `data`
    if isinstance(data, DeletedObject):
        raise ObjectDeletedException(data)
    raise ObjectDoesNotExistException(id)
def create_document(self, session, doc=None):
    """Create (and store) a new Document in this store.

    A new identifier is always generated; when no Document is supplied
    an empty placeholder is stored instead.
    """
    p = self.permissionHandlers.get('info:srw/operation/1/create', None)
    if p:
        if not session.user:
            msg = ("Authenticated user required to create an object in "
                   "%s" % self.id)
            raise PermissionException(msg)
        okay = p.hasPermission(session, session.user)
        if not okay:
            msg = "Permission required to create an object in %s" % self.id
            raise PermissionException(msg)
    id = self.generate_id(session)
    if (doc is None):
        # Create a placeholder
        doc = StringDocument("")
    else:
        doc.id = id
    # NOTE(review): the placeholder branch leaves doc.id unset here —
    # presumably store_document falls back to the generated id; confirm.
    doc.documentStore = self.id
    try:
        self.store_document(session, doc)
    except ObjectAlreadyExistsException:
        # Back out id change so the generated id can be reused
        if type(id) == long:
            self.currentId -= 1
        raise
    except:
        raise
    return doc
def process_record(self, session, record):
    """Fetch the parent record referenced by a component record.

    Reads the parent pointer from the component, resolves the Database
    and RecordStore, and returns the parent's XML as a Document whose
    id is the parent identifier.
    """
    # Parent pointer: non-namespaced form first, then namespaced
    try:
        ref = record.process_xpath(session, '/c3component/@parent')[0]
    except IndexError:
        ns = {'c3': "http://www.cheshire3.org/schemas/component/"}
        ref = record.process_xpath(session, '/c3:component/@c3:parent',
                                   maps=ns)[0]
    storeId, parentId = ref.split('/', 1)
    # Resolve the Database via our parent object or the session
    if isinstance(self.parent, Database):
        db = self.parent
    elif isinstance(self.parent, Server) and session.database:
        db = self.parent.get_object(session, session.database)
    elif (session.server and isinstance(session.server, Server) and
          session.database):
        db = session.server.get_object(session, session.database)
    elif not session.server:
        raise ValueError("No session.server")
    else:
        raise ValueError("No session.database")
    store = db.get_object(session, storeId)
    parentRec = store.fetch_record(session, parentId)
    # Return a new Document carrying the parent's data and identifier
    xml = parentRec.get_xml(session)
    result = StringDocument(xml, self.id, byteCount=len(xml),
                            byteOffset=0)
    result.id = parentId
    return result
def find_documents(self, session, cache=0):
    """Generate Documents from an SRW search, paging through all results.

    cache=0: yield each Document lazily (generator behaviour).
    cache=2: collect all Documents in self.documents.
    Any other cache value raises NotImplementedError.
    """
    docs = []
    curr = 1
    while True:
        # Request the next page, starting at the current record offset
        self.stream.startRecord = curr
        resp = self.binding.RPC(
            self.binding.url,
            "searchRetrieveRequest",
            self.stream,
            requestclass=SRW.types.SearchRetrieveRequest,
            replytype=SRW.types.SearchRetrieveResponse.typecode,
            readerclass=reader)
        total = resp.numberOfRecords
        curr += len(resp.records)
        for d in resp.records:
            doc = StringDocument(d.recordData, mimeType='text/xml')
            # Preserve the schema the server returned the record in
            doc.recordSchema = d.recordSchema
            if cache == 0:
                yield doc
            elif cache == 2:
                docs.append(doc)
            else:
                raise NotImplementedError
        if curr > total:
            if cache == 0:
                # Python 2 idiom: terminate the generator
                raise StopIteration
            else:
                break
    self.documents = docs
def create_document(self, session, doc=None):
    """Create (and store) a new Document in this store.

    A new identifier is always generated; when no Document is supplied
    an empty placeholder is stored instead.
    """
    p = self.permissionHandlers.get('info:srw/operation/1/create', None)
    if p:
        if not session.user:
            msg = ("Authenticated user required to create an object in "
                   "%s" % self.id)
            raise PermissionException(msg)
        okay = p.hasPermission(session, session.user)
        if not okay:
            msg = "Permission required to create an object in %s" % self.id
            raise PermissionException(msg)
    id = self.generate_id(session)
    if (doc is None):
        # Create a placeholder
        doc = StringDocument("")
    else:
        doc.id = id
    # NOTE(review): the placeholder branch leaves doc.id unset here —
    # presumably store_document falls back to the generated id; confirm.
    doc.documentStore = self.id
    try:
        self.store_document(session, doc)
    except ObjectAlreadyExistsException:
        # Back out id change so the generated id can be reused
        if type(id) == long:
            self.currentId -= 1
        raise
    except:
        raise
    return doc
def find_documents(self, session, cache=0):
    """Split the stream on MARC end-of-record markers into Documents.

    Reads in 1536-byte chunks, cutting on "\\x1D" (MARC record
    terminator).  cache=0: yield each Document; cache=1: record
    (offset, length) pairs in self.locations; cache=2: collect
    Documents in self.documents.
    """
    docs = []
    locs = []
    data = self.stream.read(1536)
    myTell = 0
    while data:
        rt = data.find("\x1D")
        # Emit every complete record currently in the buffer
        while (rt > -1):
            txt = data[:rt + 1]
            tlen = len(txt)
            if cache == 0:
                yield StringDocument(txt, mimeType="application/marc")
            elif cache == 1:
                locs.append((myTell, tlen))
            elif cache == 2:
                docs.append(
                    StringDocument(txt, mimeType="application/marc"))
            data = data[rt + 1:]
            myTell += tlen
            rt = data.find("\x1D")
        # Refill the buffer; if nothing new arrived we are at EOF
        dlen = len(data)
        data += self.stream.read(1536)
        if (len(data) == dlen):
            # Junk at end of file
            data = ""
    self.stream.close()
    self.locations = locs
    self.documents = docs
    # Only one of locs/docs is populated, depending on cache mode
    self.length = max(len(locs), len(docs))
def fetch_document(self, session, id):
    """Return the stored object identified by *id* as a Document.

    Out-going preparser/workflow processing is applied when configured.
    Raises ObjectDeletedException for deleted objects and
    ObjectDoesNotExistException for unknown identifiers.
    """
    perm = self.permissionHandlers.get(
        'info:srw/operation/2/retrieve', None)
    if perm:
        if not session.user:
            raise PermissionException(
                "Authenticated user required to retrieve an object from %s"
                % self.id)
        allowed = perm.hasPermission(session, session.user)
        if not allowed:
            raise PermissionException(
                "Permission required to retrieve an object from %s"
                % self.id)
    data = self.fetch_data(session, id)
    if not data:
        # Deleted placeholders evaluate false but carry deletion info
        if isinstance(data, DeletedObject):
            raise ObjectDeletedException(data)
        raise ObjectDoesNotExistException(id)
    doc = StringDocument(data)
    if self.outPreParser is not None:
        doc = self.outPreParser.process_document(session, doc)
    elif self.outWorkflow is not None:
        doc = self.outWorkflow.process(session, doc)
    doc.id = id
    doc.documentStore = self.id
    doc.parent = ('document', self.id, id)
    return doc
def directoryDocumentStoreIter(store):
    """Yield a Document for each item in a directory-backed store."""
    session = Session()
    for identifier, raw in directoryStoreIter(store):
        document = StringDocument(raw)
        document.id = identifier
        normalized = store._normalizeIdentifier(session, identifier)
        document.filename = store._getFilePath(session, normalized)
        yield document
def process_record(self, session, record):
    u"""Apply Workflow to the Record, return the resulting Document."""
    outcome = self.workflow.process(session, record)
    if isinstance(outcome, basestring):
        return StringDocument(outcome)
    if isinstance(outcome, Record):
        return StringDocument(outcome.get_xml(session))
    # Already a Document (or equivalent); pass it through unchanged
    return outcome
def process_record(self, session, record):
    u"""Apply Workflow to the Record, return the resulting Document."""
    result = self.workflow.process(session, record)
    if isinstance(result, basestring):
        result = StringDocument(result)
    elif isinstance(result, Record):
        # Serialize Records to XML before wrapping
        result = StringDocument(result.get_xml(session))
    return result
def find_documents(self, session, cache=0):
    """Treat the whole stream as a single Document; yield or cache it."""
    if cache == 0:
        yield StringDocument(self.stream.read(),
                             filename=self.streamLocation)
    elif cache == 2:
        document = StringDocument(self.stream.read(),
                                  filename=self.streamLocation)
        self.documents = [document]
def accumulate(self, session, stream, format, tagName=None,
               codec=None, factory=None):
    """Append the stream's XML serialization to the accumulated data."""
    # Wrap the record XML as a Document to normalize access to the raw data
    document = StringDocument(stream.get_xml(session))
    self.data.append(document.get_raw(session))
def _parse_upload(self, data, interface='admin'):
    """Parse uploaded XML data into a Record.

    On parse failure, retries after normalizing line breaks (so error
    positions line up); if that also fails, returns an HTML error
    report (a string) instead of a Record — callers must handle both
    return types.
    """
    if (type(data) == unicode):
        try:
            data = data.encode('utf-8')
        except:
            try:
                data = data.encode('utf-16')
            except:
                pass  # hope for the best!
    doc = StringDocument(data)
    del data
    doc = ppFlow.process(session, doc)
    try:
        rec = docParser.process_document(session, doc)
    except:
        newlineRe = re.compile('(\s\s+)')
        doc.text = newlineRe.sub('\n\g<1>', doc.get_raw(session))
        # repeat parse with correct line numbers
        try:
            rec = docParser.process_document(session, doc)
        except:
            self.htmlTitle.append('Error')
            e = sys.exc_info()
            self.logger.log('*** %s: %s' % (repr(e[0]), e[1]))
            # try and highlight error in specified place: the parser's
            # message carries ':line:col:' or 'line N, column N'
            lines = doc.get_raw(session).split('\n')
            positionRe = re.compile(':(\d+):(\d+):')
            mo = positionRe.search(str(e[1]))
            if (mo is None):
                positionRe = re.compile('line (\d+), column (\d+)')
                mo = positionRe.search(str(e[1]))
            line, posn = lines[int(mo.group(1)) - 1], int(mo.group(2))
            try:
                # Leading whitespace of the offending line, used to
                # position the caret marker below
                startspace = newlineRe.match(line).group(0)
            except:
                if interface == 'admin':
                    link = '<a href="files.html">Back to file page</a>'
                else:
                    link = '<a href="edit.html">Back to edit/create menu</a>'
                return '''<div id="single"><p class="error">An error occured while parsing your file. Please check the file is a valid ead file and try again.</p><p>%s</p></div>''' % link
            else:
                if interface == 'admin':
                    link = '<a href="files.html">Back to file page</a>'
                else:
                    link = '<a href="edit.html">Back to edit/create menu</a>'
                return '''\
<div id="single"><p class="error">An error occured while parsing your file. Please check the file at the suggested location and try again.</p>
<code>%s: %s</code>
<pre>
%s
<span class="error">%s</span>
</pre>
<p>%s</p></div>
''' % (html_encode(repr(e[0])), e[1],
       html_encode(line[:posn + 20]) + '...',
       startspace + str('-' * (posn - len(startspace))) + '^',
       link)
    del doc
    return rec
def find_documents(self, session, cache=0):
    """Step through the term stream, wrapping each term as a Document."""
    if cache == 0:
        for term in self.stream:
            yield StringDocument(term)
        # Python 2 idiom: terminate the generator
        raise StopIteration
    elif cache == 2:
        self.documents = [StringDocument(term) for term in self.stream]
def find_documents(self, session, cache=0):
    """Search the stream and yield/collect each hit as an XML Document."""
    hits = self.stream.search(self.query)
    collected = []
    for hit in hits:
        xml = self._toXml(hit)
        if cache == 0:
            yield StringDocument(xml)
        elif cache == 2:
            collected.append(StringDocument(xml))
        else:
            raise NotImplementedError
    self.documents = collected
def _processFile(self, session, item):
    """Classify *item* by guessed mimetype and open it appropriately.

    Returns ('stream', DocumentStream) for XML/container types,
    ('document', StringDocument) for plain files, or None when the
    name does not match self.filterRe.
    """
    name = self._fetchName(item)
    if self.filterRe:
        m = self.filterRe.search(name)
        if not m:
            return None
    mimetype = mimetypes.guess_type(name, 0)
    if (mimetype[0] in [
        'text/sgml', 'text/xml', 'application/sgml', 'application/xml'
    ]):
        if mimetype[1] == 'gzip':
            raise NotImplementedError(
                'XML files compressed using gzip are not yet supported. You could try using zip.'
            )
        trip = ('stream', XmlDocumentStream, 'xml')
    elif (mimetype[0] == 'application/x-tar'):
        # Pick the tar variant by compression encoding
        if mimetype[1] == 'gzip':
            trip = ('stream', TarDocumentStream, 'tar.gz')
        elif mimetype[1] == 'bzip2':
            trip = ('stream', TarDocumentStream, 'tar.bz2')
        else:
            trip = ('stream', TarDocumentStream, 'tar')
    elif (mimetype[0] == 'application/zip'):
        trip = ('stream', ZipDocumentStream, 'zip')
    elif (mimetype[0] == 'application/marc'):
        trip = ('stream', MarcDocumentStream, 'marc')
    else:
        # Unknown type: still treat as an XML stream when a tagName
        # has been configured, otherwise as an opaque document
        if self.tagName is not None:
            trip = ('stream', XmlDocumentStream, 'xml')
        else:
            trip = ('document', None, mimetype[0])
    s = self._fetchStream(item)
    if trip[0] == 'stream':
        cls = trip[1]
        nstream = cls(session, s, format=trip[2],
                      tagName=self.tagName, codec=self.codec,
                      factory=self.factory)
        # copy streamLocation in to copy to document
        nstream.streamLocation = item
        return ('stream', nstream)
    elif trip[0] == 'document':
        data = s.read()
        s.close()
        doc = StringDocument(data, mimeType=trip[2], filename=name)
        if mimetype[1]:
            doc.compression = mimetype[1]
        return ('document', doc)
def find_documents(self, session, cache=0):
    """Yield or cache a Document per feed entry.

    When the factory's 'linkedItem' setting is on, fetch and use the
    entry's linked content instead of the entry itself.
    """
    collected = []
    useLink = self.factory.get_setting(session, 'linkedItem', 0)
    for entry in self.stream.entries:
        if useLink == 0:
            content = self._toXml(entry)
        else:
            content = self._fetchStream(entry.link).read()
        if cache == 0:
            yield StringDocument(content)
        elif cache == 2:
            collected.append(StringDocument(content))
        else:
            raise NotImplementedError
    self.documents = collected
def process_document(self, session, doc):
    """Renumber sparse vector attributes into a compact contiguous space.

    Takes (labels, vectors) from the document, remaps every attribute
    id used by any vector into [offset, offset + nattrs), pickles the
    id mapping to disk so results can be un-renumbered later, and
    returns a StringDocument of (labels, newvectors, nattrs).
    """
    (labels, vectors) = doc.get_raw(session)
    # Collect every attribute id used by any vector
    # ('merged' rather than 'all': don't shadow the builtin)
    merged = {}
    for v in vectors:
        merged.update(v)
    keys = merged.keys()
    keys.sort()
    nattrs = len(keys)
    # Remap vectors into the reduced attribute space
    renumbers = range(self.offset, nattrs + self.offset)
    renumberhash = dict(zip(keys, renumbers))
    newvectors = []
    for vec in vectors:
        new = {}
        for (k, v) in vec.items():
            new[renumberhash[k]] = v
        newvectors.append(new)
    # Persist the mapping for later un-renumbering.
    pick = cPickle.dumps(renumberhash)
    filename = self.get_path(session, 'modelPath', None)
    if not filename:
        dfp = self.get_path(session, 'defaultPath')
        filename = os.path.join(dfp, self.id + "_ATTRHASH.pickle")
    # BUG FIX: open in binary mode — pickle data is not text, and
    # text mode corrupts it on platforms with newline translation.
    f = open(filename, 'wb')
    f.write(pick)
    f.close()
    return StringDocument((labels, newvectors, nattrs))
def setUp(self):
    """Prepare a unicode test document and run it through the preparser."""
    PreParserTestCase.setUp(self)
    self.testUc = self._get_testUnicode()
    if not self.testUc:
        return
    self.inDoc = StringDocument(self.testUc)
    self.outDoc = self.testObj.process_document(self.session, self.inDoc)
def find_documents(self, session, cache=0):
    """Construct SRU URL, fetch, parse; page through all results.

    Responses are wrapped in a SOAP envelope so the ZSI-generated SRW
    types can parse them.  cache=0 yields each record as a Document;
    cache=2 collects them in self.documents.
    """
    start = 1
    docs = []
    while True:
        self.args['startRecord'] = start
        params = urllib.urlencode(self.args)
        req = urllib2.Request(url="%s%s" % (self.server, params))
        f = urllib2.urlopen(req)
        data = f.read()
        f.close()
        # subst out xmldecl: the envelope must hold the only declaration
        data = self.xmlver.sub("", data)
        soapy = '<SOAP:Envelope xmlns:SOAP="http://schemas.xmlsoap.org/soap/envelope/"><SOAP:Body>%s</SOAP:Body></SOAP:Envelope>' % data
        ps = ZSI.ParsedSoap(soapy, readerclass=reader)
        resp = ps.Parse(SRW.types.SearchRetrieveResponse)
        self.total = resp.numberOfRecords
        for d in resp.records:
            doc = StringDocument(d.recordData, mimeType='text/xml')
            if cache == 0:
                yield doc
            elif cache == 2:
                docs.append(doc)
            else:
                raise NotImplementedError
        start += len(resp.records)
        if start > self.total:
            if cache == 0:
                # Python 2 idiom: terminate the generator
                raise StopIteration
            else:
                break
    self.documents = docs
def process_document(self, session, doc):
    """Normalize near-XML text so that it can be parsed as XML.

    Collapses line breaks and control-character references, strips the
    DOCTYPE, substitutes configured entities, escapes lone ampersands,
    lower-cases element names, fixes attributes and empty tags, and
    removes processing instructions.
    """
    txt = doc.get_raw(session)
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')
    # Character references for control chars (tab..CR) become spaces
    for x in range(9, 14):
        txt = txt.replace('&#%d;' % (x), ' ')
    txt = self.doctype_re.sub('', txt)
    for e in self.entities.keys():
        txt = txt.replace("&%s;" % (e), self.entities[e])
    # Escape ampersands that don't start a recognized entity
    txt = self.amp_re.sub(self._loneAmpersand, txt)
    # NOTE(review): this replace is a no-op as written — possibly a
    # mangled transformation (e.g. repairing '&amp;<'); confirm against
    # the original source.
    txt = txt.replace('&<', '&<')
    txt = self.attr_re.sub(self._attributeFix, txt)
    txt = self.elem_re.sub(self._lowerElement, txt)
    for t in self.emptyTags:
        # Rewrite '<tag ...>' / '<tag/>' forms via _emptyElement
        empty_re = re.compile('<(%s( [^>/]+)?)[\s/]*>' % t)
        txt = empty_re.sub(self._emptyElement, txt)
    # strip processing instructions.
    txt = self.pi_re.sub('', txt)
    return StringDocument(txt, self.id, doc.processHistory,
                          mimeType=doc.mimeType, parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc):
    """Reduce an HTML page to a minimal document of title plus body text.

    Strips script/style/comment blocks, keeps the <title> element and
    the body text with all tags removed and whitespace collapsed.
    """
    data = self.script.sub('', doc.get_raw(session))
    data = self.style.sub('', data)
    data = self.comment.sub('', data)
    tm = self.title.search(data)
    if tm:
        title = data[tm.start():tm.end()]
    else:
        title = ""
    m = self.body.search(data)
    if m:
        body = data[m.start():m.end()]
    else:
        # No <body> found: fall back to the whole page
        body = data
    text = self.tagstrip.sub(' ', body)
    # NOTE(review): the following four replaces are no-ops as written;
    # they were presumably entity decodes such as '&lt;' -> '<' and
    # '&nbsp;' -> ' ' before the source text was re-encoded.  Confirm
    # against the original source.
    text = text.replace('<', '<')
    text = text.replace('>', '>')
    text = text.replace(" ", ' ')
    text = text.replace(" ", ' ')
    # Collapse all runs of whitespace to single spaces
    l = text.split()
    text = ' '.join(l)
    data = "<html><head>%s</head><body>%s</body></html>" % (title, text)
    return StringDocument(data, self.id, doc.processHistory,
                          mimeType=doc.mimeType, parent=doc.parent,
                          filename=doc.filename)
def process_document(self, session, doc): txt = doc.get_raw(session) # Replace entities that can be represented with simple chars for (fromEnt, toEnt) in self.inane.iteritems(): txt = txt.replace("&%s;" % fromEnt, toEnt) # Fix some common mistakes for (fromEnt, toEnt) in self.preEntities.iteritems(): txt = txt.replace("&%s;" % fromEnt, "&%s;" % toEnt) # Fix straight forward entites for (s, enty) in enumerate(self.entities): txt = txt.replace("&%s;" % enty, "&#%s;" % (160 + s)) # Fix additional random entities for (fent, totxt) in self.otherEntities.iteritems(): txt = txt.replace("&%s;" % fent, "&%s;" % totxt) # Add missing # in &123; def hashed(mo): return '&#%s;' % mo.group(1) txt = self.numericalEntRe.sub(hashed, txt) # Fix made up fraction entities. (?) def fraction(mo): return '%s⁄%s' % (mo.group(1), mo.group(2)) txt = self.fractionRe.sub(fraction, txt) # Kill remaining invalid character entities txt = self.invalidRe.sub('', txt) return StringDocument(txt, self.id, doc.processHistory, mimeType=doc.mimeType, parent=doc.parent, filename=doc.filename)
def process_record(self, session, rec):
    """Serialize an article Record to XML with genia-tokenized text.

    Walks <body>, copying <p>/<s>/<headline>/<lead> elements, adding
    sequential eid attributes and, for each text node, paired
    <txt> (escaped original) and <toks> (genia tokens) elements.
    """
    doc = []
    # BUG FIX: the original passed two arguments to list.append()
    # ('<article...>\n', '' % (...)), which raises TypeError at the
    # first call; format the header string and append it once.
    doc.append('<article id="%s" date="%s">\n' % (
        rec.process_xpath(session, '/article/@id')[0],
        rec.process_xpath(session, '/article/@date')[0]))
    head = rec.process_xpath(session, '/article/head')[0]
    headstr = etree.tounicode(head)
    doc.append(headstr.encode('utf-8'))
    doc.append("\n<body>\n")
    body = rec.process_xpath(session, '/article/body')[0]
    # walk tree looking for <s> tags, and duplicate out any non s tag
    eid = 0
    for sub in body:
        if sub.tag == "p":
            bits = ['<p eid="%s"' % eid]
            eid += 1
            for (name, val) in sub.items():
                bits.append('%s="%s"' % (name, val))
            bits.append(">")
            doc.append(' '.join(bits))
            for s in sub:
                # sentences
                bits = ['<s eid="%s"' % eid]
                eid += 1
                for (name, val) in s.items():
                    bits.append('%s="%s"' % (name, val))
                bits.append(">")
                doc.append(' '.join(bits))
                t = s.text
                if t:
                    # (removed a pointless try/except that only re-raised)
                    toks = self.geniafy(t)
                    ttxt = ''.join(toks)
                    val = '<txt>%s</txt><toks>%s</toks>' % (escape(t), ttxt)
                    doc.append(val.encode('utf8'))
                doc.append("</s>")
            doc.append("</p>\n")
        elif sub.tag in ["headline", "lead"]:
            # tag headline and lead too
            doc.append('<%s>' % sub.tag)
            t = sub.text
            if t:
                toks = self.geniafy(t)
                ttxt = ''.join(toks)
                val = '<txt>%s</txt><toks>%s</toks>' % (escape(t), ttxt)
                doc.append(val.encode('utf8'))
            doc.append('</%s>' % sub.tag)
        else:
            # just useless <br/> tags
            pass
    doc.append("\n</body>\n</article>\n")
    return StringDocument(''.join(doc))
def _listIdentifiers(self):
    """Issue an OAI-PMH ListIdentifiers request and accumulate ids.

    Appends each identifier to self.ids, stores any resumptionToken in
    self.token, and records completeListSize in self.total when the
    server supplies it.
    """
    s = "%sverb=ListIdentifiers&" % (self.server)
    s += urllib.urlencode(self.params)
    resp = self._fetchStream(s)
    data = resp.read()
    # Now use existing infrastructure to parse
    doc = StringDocument(data, self.id, mimeType='text/xml')
    rec = BSParser.process_document(None, doc)
    dom = rec.get_dom(session)
    # Find the first element child (the OAI-PMH root element)
    for top in dom.childNodes:
        if (top.nodeType == elementType):
            break
    for c in top.childNodes:
        if (c.nodeType == elementType and
                c.localName == 'ListIdentifiers'):
            for c2 in c.childNodes:
                if (c2.nodeType == elementType and
                        c2.localName == 'header'):
                    for c3 in c2.childNodes:
                        if (c3.nodeType == elementType and
                                c3.localName == 'identifier'):
                            self.ids.append(getFirstData(c3))
                elif (c2.nodeType == elementType and
                      c2.localName == 'resumptionToken'):
                    t = getFirstData(c2)
                    if (t):
                        self.token = t
                    try:
                        # BUG FIX: DOM elements expose getAttribute(),
                        # not getAttr(); the old call always raised
                        # AttributeError so self.total was never set.
                        self.total = c2.getAttribute('completeListSize')
                    except:
                        pass
def test_process_document_returnProcessHistory(self):
    "Check that returned Record has parser in history."
    for data in self._get_data():
        record = self.testObj.process_document(self.session,
                                               StringDocument(data))
        history = record.processHistory
        self.assertEqual(len(history), 1)
        self.assertEqual(history[0], self.testObj.id)
def process_document(self, session, doc):
    """Return a new Document whose content is the base64 of the input."""
    encoded = binascii.b2a_base64(doc.get_raw(session))
    return StringDocument(encoded, self.id, doc.processHistory,
                          parent=doc.parent, filename=doc.filename)
def process_record(self, session, rec):
    """Build a <record> document from copied and tagged record elements.

    Elements matched by self.copyElems are copied verbatim; elements
    matched by self.tagElems have their text content run through
    self.tag() and re-wrapped in the original element (or <text> when
    the match starts with a text event).
    """
    doc = []
    for c in self.copyElems:
        res = rec.process_xpath(session, c[0], c[1])
        for match in res:
            doc.append(rec.get_xml(session, match))
    for t in self.tagElems:
        res = rec.process_xpath(session, t[0], t[1])
        for match in res:
            # Process all text nodes together ('3' events carry text)
            totag = []
            for event in match:
                if event[0] == '3':
                    totag.append(event[1:])
            tagtxt = ''.join(totag)
            tagged = self.tag(session, tagtxt)
            tagged = ''.join(tagged)
            if match[0][0] != '3':
                (name, attrhash) = rec._convert_elem(match[0])
                attrs = []
                for a in attrhash:
                    # BUG FIX: was attribs[a] — an undefined name that
                    # raised NameError whenever the element had
                    # attributes; the attribute dict is attrhash.
                    attrs.append('%s="%s"' % (a, attrhash[a]))
                attribtxt = ' '.join(attrs)
                if (attribtxt):
                    attribtxt = " " + attribtxt
                txt = "<%s%s>%s</%s>" % (name, attribtxt, tagged, name)
            else:
                txt = "<text>%s</text>" % (tagged)
            doc.append(txt)
    doctxt = "<record>%s</record>" % '\n'.join(doc)
    return StringDocument(doctxt, self.id, rec.processHistory, 'text/xml')
def find_documents(self, session, cache=0):
    """Read the whole iRODS stream as one Document with iRODS metadata.

    cache=0: yield the single Document; cache=2: store it in
    self.documents.  The stream is closed and the connection dropped
    in both cases.
    """
    # read in single file
    doc = StringDocument(self.stream.read(),
                         filename=self.stream.getName())
    # attach any iRODS metadata — must be fetched before closing
    umd = self.stream.getUserMetadata()
    self.stream.close()
    self.cxn.disconnect()
    md = {}
    for x in umd:
        # (name, value, units) triples; coerce value to a Python type
        md[x[0]] = icatValToPy(x[1], x[2])
    if len(md):
        doc.metadata['iRODS'] = md
    if cache == 0:
        yield doc
    elif cache == 2:
        self.documents = [doc]
def process_record(self, session, rec):
    """Serialize a GraphRecord's graph; format comes from 'format' setting."""
    if not isinstance(rec, GraphRecord):
        raise NotImplementedError("Can only transform GraphRecords")
    fmt = self.get_setting(session, 'format', 'xml')
    return StringDocument(rec.graph.serialize(format=fmt))
def process_document(self, session, doc):
    """Embed the Document's content as a file element in its METS wrapper."""
    global METS_NAMESPACES
    mets = self._get_metsWrapper(doc)
    objid = mets.get("OBJID")
    # The first fileGrp inside the fileSec receives the new file element
    fileGrp = mets.xpath('/mets:mets/mets:fileSec/mets:fileGrp[1]',
                         namespaces=METS_NAMESPACES)[0]
    label = mets.attrib.get("LABEL", "file0001")
    file_ = self._get_metsFile('/'.join([objid, label]),
                               doc.get_raw(session),
                               doc.byteCount,
                               doc.mimeType)
    fileGrp.append(file_)
    # Update last modification date
    mets.attrib['LASTMODDATE'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
    serialized = etree.tostring(mets, pretty_print=True)
    return StringDocument(serialized, self.id, doc.processHistory,
                          self.outMimeType, parent=doc.parent,
                          filename=doc.filename,
                          byteCount=len(serialized), byteOffset=0)
def process_document(self, session, doc):
    """Tag the document text as XML and wrap it in a <text> element."""
    raw = doc.get_raw(session)
    taggedLines = self.tag(session, raw, xml=1)
    wrapped = "<text>%s</text>" % '\n'.join(taggedLines)
    return StringDocument(wrapped, self.id, doc.processHistory,
                          'text/xml', doc.parent)
def process_record(self, session, rec):
    """SAX-transform the Record into a Document.

    Falls back to re-parsing the Record's XML through the SaxParser
    when the Record cannot saxify itself.
    """
    handler = self.permissionHandlers.get(
        'info:srw/operation/2/transform', None)
    if handler:
        if not session.user:
            raise PermissionException(
                "Authenticated user required to transform using %s"
                % self.id)
        if not handler.hasPermission(session, session.user):
            raise PermissionException(
                "Permission required to transform using %s" % self.id)
    self.initState()
    try:
        rec.saxify(session, self)
    except AttributeError:
        # Record lacks saxify support; rebuild a SAX record from XML
        saxp = session.server.get_object(session, 'SaxParser')
        saxRec = saxp.process_document(
            session, StringDocument(rec.get_xml(session)))
        saxRec.saxify(session, self)
    return StringDocument(self.top, self.id, rec.processHistory,
                          parent=rec.parent)
def unpack_record(self, session, req):
    """Extract the record from an SRW request as a Document, or None."""
    if not req.record:
        return None
    packing = req.record.recordPacking
    if packing == "string":
        # Strip any XML declaration before wrapping
        declre = re.compile('<\?xml(.*?)\?>')
        doc = StringDocument(declre.sub('', req.record.recordData))
    elif packing == "url":
        raise NotImplementedError
    elif packing == "xml":
        # Should be a DOM node, not string repr?
        doc = StringDocument(req.record.recordData)
    else:
        raise Diagnostic1()
    doc._schema = req.record.recordSchema
    return doc
def _processFile(self, session, item):
    """Classify *item* by guessed mimetype and open it appropriately.

    Returns ('stream', DocumentStream) for XML/container types,
    ('document', StringDocument) for plain files, or None when the
    name does not match self.filterRe.
    """
    name = self._fetchName(item)
    if self.filterRe:
        m = self.filterRe.search(name)
        if not m:
            return None
    mimetype = mimetypes.guess_type(name, 0)
    if (mimetype[0] in ['text/sgml', 'text/xml',
                        'application/sgml', 'application/xml']):
        if mimetype[1] == 'gzip':
            raise NotImplementedError('XML files compressed using gzip are not yet supported. You could try using zip.')
        trip = ('stream', XmlDocumentStream, 'xml')
    elif (mimetype[0] == 'application/x-tar'):
        # Pick the tar variant by compression encoding
        if mimetype[1] == 'gzip':
            trip = ('stream', TarDocumentStream, 'tar.gz')
        elif mimetype[1] == 'bzip2':
            trip = ('stream', TarDocumentStream, 'tar.bz2')
        else:
            trip = ('stream', TarDocumentStream, 'tar')
    elif (mimetype[0] == 'application/zip'):
        trip = ('stream', ZipDocumentStream, 'zip')
    elif (mimetype[0] == 'application/marc'):
        trip = ('stream', MarcDocumentStream, 'marc')
    else:
        # Unknown type: still treat as an XML stream when a tagName
        # has been configured, otherwise as an opaque document
        if self.tagName is not None:
            trip = ('stream', XmlDocumentStream, 'xml')
        else:
            trip = ('document', None, mimetype[0])
    s = self._fetchStream(item)
    if trip[0] == 'stream':
        cls = trip[1]
        nstream = cls(session, s, format=trip[2], tagName=self.tagName,
                      codec=self.codec, factory=self.factory)
        # copy streamLocation in to copy to document
        nstream.streamLocation = item
        return ('stream', nstream)
    elif trip[0] == 'document':
        data = s.read()
        s.close()
        doc = StringDocument(data, mimeType=trip[2], filename=name)
        if mimetype[1]:
            doc.compression = mimetype[1]
        return ('document', doc)
def save_concordance(self, clines, id, wordWindow):
    """Store concordance lines, split into documents of at most maxSize.

    Each stored chunk is prefixed with [total-line-count, wordWindow]
    and saved under '<id>_<n>'.  Returns *id*.
    """
    global maxSize
    total = len(clines)
    if total > maxSize:
        seq = 1
        for start in range(0, total, maxSize):
            # ('chunk' rather than 'slice': don't shadow the builtin)
            chunk = clines[start:start + maxSize]
            chunk.insert(0, [total, wordWindow])
            document = StringDocument(Pickle.dumps(chunk))
            document.id = '%s_%d' % (id, seq)
            seq += 1
            self.concStore.store_document(self.session, document)
    else:
        clines.insert(0, [total, wordWindow])
        document = StringDocument(Pickle.dumps(clines))
        document.id = '%s_1' % id
        self.concStore.store_document(self.session, document)
    self.concStore.commit_storing(self.session)
    return id
def process_record(self, session, record):
    """Resolve a component's parent record and return it as a Document.

    The component's parent pointer ('store/identifier') is read from
    the record, the owning Database and RecordStore are located, and
    the parent record's XML is wrapped in a Document whose id is the
    parent identifier.
    """
    # Try the plain xpath first, fall back to the namespaced variant
    try:
        pointer = record.process_xpath(session,
                                       '/c3component/@parent')[0]
    except IndexError:
        pointer = record.process_xpath(
            session,
            '/c3:component/@c3:parent',
            maps={'c3': "http://www.cheshire3.org/schemas/component/"}
        )[0]
    storeName, targetId = pointer.split('/', 1)
    # Locate the Database: directly, via a parent Server, or via session
    if isinstance(self.parent, Database):
        database = self.parent
    elif isinstance(self.parent, Server) and session.database:
        database = self.parent.get_object(session, session.database)
    elif (session.server and isinstance(session.server, Server) and
          session.database):
        database = session.server.get_object(session, session.database)
    elif not session.server:
        raise ValueError("No session.server")
    else:
        raise ValueError("No session.database")
    recordStore = database.get_object(session, storeName)
    parent = recordStore.fetch_record(session, targetId)
    serialized = parent.get_xml(session)
    wrapped = StringDocument(serialized, self.id,
                             byteCount=len(serialized), byteOffset=0)
    wrapped.id = targetId
    return wrapped
def mercurialDocumentStoreIter(store):
    """Yield a Document per stored item, with filesystem metadata attached."""
    session = Session()
    for identifier, raw in directoryStoreIter(store):
        document = StringDocument(raw)
        document.id = identifier
        normalized = store._normalizeIdentifier(session, identifier)
        document.filename = store._getFilePath(session, normalized)
        document.documentStore = store.id
        # Assign byteCount and last-modified time from the file itself
        info = os.stat(document.filename)
        document.byteCount = info.st_size
        document.metadata['lastModified'] = info.st_mtime
        yield document
def _process_data(self, session, id, data, preParser=None):
    """Wrap raw data in a Document, applying any configured processing.

    Split from fetch_record for use by iterators.  An explicit
    *preParser* wins over the store's outPreParser, which wins over
    its outWorkflow.
    """
    doc = StringDocument(data)
    if preParser is not None:
        doc = preParser.process_document(session, doc)
    elif self.outPreParser is not None:
        doc = self.outPreParser.process_document(session, doc)
    elif self.outWorkflow is not None:
        doc = self.outWorkflow.process(session, doc)
    # Ensure basic required info
    doc.id = id
    doc.documentStore = self.id
    return doc
def find_documents(self, session, cache=0):
    """Yield a single Document holding [classes, vectors]."""
    document = StringDocument([self.classes, self.vectors])
    # Carry the attribute count through for downstream consumers
    document.totalAttributes = self.totalAttributes
    yield document
def next(self):
    """Return the next stored (id, data) pair wrapped as a Document."""
    pair = BdbIter.next(self)
    document = StringDocument(pair[1])
    document.id = pair[0]
    return document
def accumulate(self, session, stream, format, tagName=None,
               codec=None, factory=None):
    """Collect the record's XML into self.data."""
    # Route the XML through a Document so raw access is uniform
    wrapped = StringDocument(stream.get_xml(session))
    self.data.append(wrapped.get_raw(session))
def process_document(self, session, doc):
    """Convert raw frequent-set / rule matches into scored rule objects.

    The input document carries (matches, armrules).  Each match becomes
    a FrequentSet with per-term index frequencies and several
    interestingness scores (support pctg, entropy, gini,
    log-likelihood, 'surprise').  ARM rules have their term ids
    un-renumbered and resolved to terms.  Returns a StringDocument
    whose text is [rules, nrules], with term statistics attached as
    attributes.
    """
    # take in Doc with match list, return doc with rule object list
    (matches, armrules) = doc.get_raw(session)
    out = StringDocument([])
    # Initial setup
    termHash = {}      # termid -> term string (cache)
    termFreqHash = {}  # termid -> document frequency in the index
    termRuleFreq = {}  # termid -> number of rules it appears in
    rules = []
    ruleLengths = {}
    if self.recordStore:
        totalDocs = self.recordStore.get_dbSize(session)
    else:
        # get default recordStore from session's database
        db = session.server.get_object(session, session.database)
        recStore = db.get_path(session, 'recordStore', None)
        if recStore:
            totalDocs = recStore.get_dbSize(session)
    if totalDocs == 0:
        # avoid e_divzero
        totalDocs = 1
    totalDocs = float(totalDocs)
    # step through rules and turn into objects, do math, do global stats
    for m in matches:
        r = FrequentSet(session, m, out, self.unrenumber)
        freqs = []
        for t in r.termids:
            try:
                # Cached: just bump the per-rule counter
                termFreq = termFreqHash[t]
                termRuleFreq[t] += 1
            except:
                # First sighting: fetch and cache term + its frequency
                termRuleFreq[t] = 1
                term = self.index.fetch_termById(session, t)
                termHash[t] = term
                termFreq = self.index.fetch_term(session, term,
                                                 summary=True)[1]
                termFreqHash[t] = termFreq
            freqs.append(termFreq)
        r.freqs = freqs
        if self.calcRankings:
            if self.calcRuleLengths:
                try:
                    ruleLengths[(len(r.termids))] += 1
                except:
                    ruleLengths[(len(r.termids))] = 1
            # some basic stats needed
            avgs = []
            entropy = []
            gini = []
            ftd = float(totalDocs)
            for t in freqs:
                # bit = term's proportion of the collection
                bit = float(t) / ftd
                avgs.append(bit)
                entropy.append((0 - bit) * math.log(bit, 2))
                gini.append(bit ** 2)
            # Expected co-occurrence under independence, and derived stats
            r.pctg = reduce(operator.mul, avgs)
            r.avg = r.pctg * float(totalDocs)
            r.opctg = (float(r.freq) / ftd)
            r.entropy = reduce(operator.add, entropy)
            r.gini = 1.0 - reduce(operator.add, gini)
            # This is log-likelihood. Better than just support
            ei = float(totalDocs * (r.avg + r.freq)) / (totalDocs * 2.0)
            g2 = 2 * ((r.avg * math.log(r.avg / ei, 10)) +
                      (r.freq * math.log(r.freq / ei, 10)))
            if r.freq < r.avg:
                # Negative association: flip the sign
                g2 = 0 - g2
            r.ll = g2
            # Dunno what this is but it works quite well (for some things)
            r.surprise = (totalDocs / r.avg) * r.freq
            # r.surprise2 = (1.0/r.pctg) * r.freq
        rules.append(r)
    if self.sortBy:
        rules.sort(key=self.sortFuncs[self.sortBy], reverse=True)
    nrules = []
    if armrules:
        # unrenumber arm found rules
        # conf, supp, [antes], [concs]
        for r in armrules:
            d = StringDocument([r[2], r[3]])
            if self.unrenumber:
                d = self.unrenumber.process_document(session, d)
            antes = []
            concs = []
            renmbrd = d.get_raw(session)
            for a in renmbrd[0]:
                antes.append(termHash[a])
            for c in renmbrd[1]:
                concs.append(termHash[c])
            nrules.append([r[0], r[1], antes, concs])
    out.text = [rules, nrules]
    out.termHash = termHash
    out.termRuleFreq = termRuleFreq
    out.ruleLengths = ruleLengths
    # XXX this is even nastier, but useful
    out.sortFuncs = self.sortFuncs
    return out