def couch_rev_check_recs_old(docs, src):
    """
    Insert revisions for all records into structure using CouchDB bulk interface.
    Uses key ranges to narrow bulk query to the source being ingested.

    Deprecated: has a performance issue
    """
    uri = join(COUCH_DATABASE, '_all_docs')
    start = quote(COUCH_ID_BUILDER(src, ''))
    end = quote(COUCH_ID_BUILDER(src, 'Z'*100)) # FIXME. Is this correct?
    uri += '?startkey=%s&endkey=%s' % (start, end)
    # REVU: this fetches all docs from the db again for each doc bulk, which
    # kills performance and can cause memory issues with big collections.
    # So, to set revisions for each batch of 100 docs among 10000, you will
    # fetch all 10000 docs per batch (i.e. 100 times).
    #
    # The new version is implemented in couch_rev_check_recs2, see details
    resp, cont = H.request(join(COUCH_DATABASE, '_all_docs'), 'GET', headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        rows = json.loads(cont)["rows"]
        #revs = { r["id"]: r["value"]["rev"] for r in rows } # 2.7 specific
        revs = {}
        for r in rows:
            revs[r["id"]] = r["value"]["rev"]
        for doc in docs:
            id = doc['_id']
            if id in revs:
                doc['_rev'] = revs[id]
    else:
        logger.warn('Unable to retrieve document revisions via bulk interface: ' + repr(resp))
        logger.warn('Request old: ' + uri)
def save_document(document):
    """
    Saves the document in the couchdb.

    Arguments:
        document - document to save

    Returns:
        If saving succeeded: the value returned by akara.
        If saving failed: writes a bunch of error logs and returns False.
    """
    logging.info("Updating document in database")
    h = httplib2.Http()
    # httplib2's attribute is force_exception_to_status_code; the *_as_* spelling has no effect
    h.force_exception_to_status_code = True
    url = join(conf['AKARA_SERVER'], conf['UPDATE_DOCUMENT_URL'], document[u'id'])
    logging.debug("Calling url: " + url)
    doc = json.dumps(document[u'value'])
    resp, content = h.request(url, 'POST', body=doc)
    if str(resp.status).startswith('2'):
        return content
    else:
        logging.error("Couldn't update document [id=%s]" % (document[u'id']))
        logging.error(" … with data: %s" % (pp.pformat(document)))
        logging.error(" … with raw data: %s" % (doc,))
        return False
def couch_rev_check_recs(docs):
    """
    Insert revisions for all records into structure using CouchDB bulk interface.
    Uses key ranges to narrow bulk query to the source being ingested.

    Performance-improved version of couch_rev_check_recs_old, but it uses
    another input format:

    Input: {doc["_id"]: doc, ...}
    """
    if not docs:
        return
    uri = join(COUCH_DATABASE, '_all_docs')
    docs_ids = sorted(docs)
    start = docs_ids[0]
    end = docs_ids[-1]
    # uri += "?" + urlencode({"startkey": start, "endkey": end})
    uri += '?startkey="%s"&endkey="%s"' % (quote_plus(start), quote_plus(end))
    response, content = H.request(uri, 'GET', headers=COUCH_AUTH_HEADER)
    if str(response.status).startswith('2'):
        rows = json.loads(content)["rows"]
        for r in rows:
            if r["id"] in docs:
                docs[r["id"]]["_rev"] = r["value"]["rev"]
    else:
        logger.warn('Unable to retrieve document revisions via bulk interface: ' + repr(response))
        logger.warn('Request: ' + uri)
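# Hedged usage sketch (not part of the original module): the dict-keyed input
# format that couch_rev_check_recs expects, versus the plain list handled by
# couch_rev_check_recs_old. The sample ids and titles are hypothetical.
def _example_rev_check_usage():
    enriched = [
        {"_id": "clev-abc123", "title": "Record A"},   # hypothetical docs
        {"_id": "clev-def456", "title": "Record B"},
    ]
    by_id = dict((d["_id"], d) for d in enriched)      # {doc["_id"]: doc, ...}
    couch_rev_check_recs(by_id)                        # fills in "_rev" where CouchDB already has the doc
    return by_id.values()                              # docs now carry revisions, ready for _bulk_docs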
def target(environ):
    wiki_id = shift_path_info(environ)
    full_incoming_request = request_uri(environ)
    if wiki_id not in TARGET_WIKIS:
        raise BadTargetError(fronturl=request_uri(environ), target=wiki_id)
    original_page = join(TARGET_WIKIS[wiki_id].rstrip('/') + '/', environ['PATH_INFO'].lstrip('/'))
    #relative_to_wrapped = relativize(, full_incoming_request)
    wrapped_wiki_base = full_incoming_request[:-len(environ['PATH_INFO'])]
    return wiki_id, TARGET_WIKIS[wiki_id], TARGET_WIKI_OPENERS.get(wiki_id), original_page, wrapped_wiki_base
def update_resource(self, path=None):
    '''
    Update a resource based on WSGI environment or a uri path
    '''
    if path:
        docid = path
        if is_absolute(path):
            docid = relativize(path, self.remotedb)
    else:
        docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'
    if logger: logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))
    body = self.environ['wsgi.input'].read()
    # If the document already exists, we need to determine its current rev and add it to the
    # input body, skipping the process if rev is provided in the PUT request body
    body_js = json.loads(body)
    rev = body_js.get('_rev', None)
    if not rev:
        # Need to GET the rev
        resp, content = self.h.request(join(self.remotedb, docid), "GET")
        if str(resp.status).startswith('2'):
            rev = json.loads(content).get('_rev', None)
            logger.debug('update_resource: found existing rev = ' + repr(rev))
    if rev:
        body_js['_rev'] = rev
        body = json.dumps(body_js)
    headers = {'content-type': self.environ['CONTENT_TYPE']}
    resp, content = self.h.request(join(self.remotedb, docid), "PUT", body=body, headers=headers)
    if logger: logger.debug('resp ' + repr((content[:100], resp)))
    self.prep_slave_response(resp)
    if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
        if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
        return '' #No resource could be retrieved
    return content
def get_rulesheet(self):
    rsheet = self.data['zen:metadata']['zen:rulesheet']
    if rsheet == '.':
        #The rulesheet is in a standalone attachment to this doc
        rev = self.data['_rev']
        self.rulesheet = join(self.slave_uri, u'attachment?rev=' + rev)
    else:
        #self.rulesheet = UNSPECIFIED
        self.rulesheet = rsheet
    if self.space:
        self.space.logger.debug('resource_type.get_rulesheet slave_uri, rulesheet: ' + repr((self.slave_uri, self.rulesheet)))
    return self.rulesheet
def listrecords(limit=100):
    import httplib2
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(COUCH_DATABASE, '_design', VIEW_APP, '_view', VIEW_NAME)
    url += '?limit=' + str(limit)
    logger.debug(url)
    resp, content = h.request(url, "GET", headers=COUCH_AUTH_HEADER)
    logger.debug("Content: " + content)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't get documents via: " + repr(resp))
def couch_rev_check_recs(docs, src):
    '''
    Insert revisions for all records into structure using CouchDB bulk interface.
    Uses key ranges to narrow bulk query to the source being ingested.
    '''
    uri = join(COUCH_DATABASE, '_all_docs')
    start = quote(COUCH_ID_BUILDER(src, ''))
    end = quote(COUCH_ID_BUILDER(src, 'Z'*100)) # FIXME. Is this correct?
    uri += '?startkey=%s&endkey=%s' % (start, end)
    # Note: the narrowed key-range uri built above is not actually used;
    # the request below fetches the full _all_docs listing.
    resp, cont = H.request(join(COUCH_DATABASE, '_all_docs'), 'GET', headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        rows = json.loads(cont)["rows"]
        #revs = { r["id"]: r["value"]["rev"] for r in rows } # 2.7 specific
        revs = {}
        for r in rows:
            revs[r["id"]] = r["value"]["rev"]
        for doc in docs:
            id = doc['_id']
            if id in revs:
                doc['_rev'] = revs[id]
    else:
        logger.debug('Unable to retrieve document revisions via bulk interface: ' + repr(resp))
def target(environ):
    wiki_id = shift_path_info(environ)
    full_incoming_request = request_uri(environ)
    if wiki_id not in TARGET_WIKIS:
        raise BadTargetError(fronturl=request_uri(environ), target=wiki_id)
    original_page = join(TARGET_WIKIS[wiki_id].rstrip('/') + '/', environ['PATH_INFO'].lstrip('/'))
    #relative_to_wrapped = relativize(, full_incoming_request)
    if len(environ['PATH_INFO']) > 0:
        wrapped_wiki_base = full_incoming_request[:-len(environ['PATH_INFO'])]
    else:
        wrapped_wiki_base = full_incoming_request
    return wiki_id, TARGET_WIKIS[wiki_id], TARGET_WIKI_OPENERS.get(wiki_id), original_page, wrapped_wiki_base
def delete_resource(self, path=None):
    '''
    Delete a resource based on WSGI environment or a uri path
    '''
    if path:
        docid = path
        if is_absolute(path):
            docid = relativize(path, self.remotedb)
    else:
        docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'
    if logger: logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))
    resp, content = self.h.request(join(self.remotedb, docid), "DELETE") #, headers=headers)
    if logger: logger.debug('resp ' + repr((content[:100], resp)))
    self.prep_slave_response(resp)
    if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
        if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
        return '' #No resource could be retrieved
    return content
def __init__(self, space, docid, data, rtype=None):
    '''
    '''
    self.docid = docid
    self.space = space
    self.slave_uri = join(space.remotedb, docid)
    self.data = data
    self.rulesheet = None
    if logger: logger.debug('GRIPPO: ' + repr(rtype))
    if isinstance(rtype, basestring) and rtype != RESOURCE_TYPE_TYPE:
        self.type = space.resource_factory(rtype)
    else:
        self.type = rtype
    return
def resource_factory(self, path=None):
    '''
    Look up and retrieve a new resource based on WSGI environment or a uri path
    '''
    if path:
        docid = path
        if is_absolute(path):
            docid = relativize(path, self.remotedb)
    else:
        docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'
    #resp, content = self.h.request(slave_uri + ';history', "GET", headers=auth_headers)
    if logger: logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))
    resp, content = self.h.request(join(self.remotedb, urllib.quote_plus(docid)))
    if logger: logger.debug('resp ' + repr((content[:100], resp)))
    self.prep_slave_response(resp)
    if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
        if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
        return '' #No resource could be retrieved
    data = json.loads(content)
    return resource.factory(self, docid, data)
def update_document(body, ctype):
    from StringIO import StringIO
    io = StringIO(body)
    parsed_doc = json.load(io)
    document_id = parsed_doc[u"id"]
    document = body
    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document, headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s." % (document, document_id))
def setup_request(self, environ):
    '''
    Prepare to service a forwarded call from Zen central

    environ - the WSGI environ of the original invocation
    '''
    #Prepare the WSGI start_response function, which covers response headers and status
    self.resp_status = None
    self.resp_headers = None
    self.exc_info = None
    self.environ = environ
    #FIXME: Use akara to get the right cache location
    self.h = httplib2.Http('/tmp/.cache')
    self.h.force_exception_to_status_code = True
    #Set up utility environ variable for rulesheets
    self.environ['zen.RESOURCE_URI'] = join(self.ZEN_BASEURI, environ['PATH_INFO'].lstrip('/').split('/')[0])
    self.environ['couchdb.RESOURCE_URI'] = self.remotedb
    return
def update_document(body, ctype):
    logger.debug(body)
    from StringIO import StringIO
    io = StringIO(body)
    parsed_doc = json.load(io)
    document_id = parsed_doc[u"id"]
    document = body
    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document, headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s." % (document, document_id))
def find_peer_service(peer_id, environ=None):
    '''
    DEPRECATED! Use discover_service() and work with the resulting query_template property

    Find a peer service endpoint, by ID, mounted on this same Akara instance

    Must be called from a running Akara service, and it is highly recommended to call
    at the top of service functions, or at least before the request environ has been manipulated
    '''
    if not in_akara():
        raise RuntimeError('find_peer_service is meant to be called from within Akara process space')
    from akara.registry import _current_registry
    from akara import request
    if environ:
        serverbase = guess_self_uri(environ)
    else:
        serverbase = getattr(global_config, 'server_root')
    for (path, s) in _current_registry._registered_services.iteritems():
        if s.ident == peer_id:
            return join(serverbase, '..', path)
    return None
def find_peer_service(peer_id, environ=None):
    '''
    DEPRECATED! Use discover_service() and work with the resulting query_template property

    Find a peer service endpoint, by ID, mounted on this same Akara instance

    Must be called from a running Akara service, and it is highly recommended to call
    at the top of service functions, or at least before the request environ has been manipulated
    '''
    if not in_akara():
        raise RuntimeError(
            'find_peer_service is meant to be called from within Akara process space'
        )
    from akara.registry import _current_registry
    from akara import request
    if environ:
        serverbase = guess_self_uri(environ)
    else:
        serverbase = getattr(global_config, 'server_root')
    for (path, s) in _current_registry._registered_services.iteritems():
        if s.ident == peer_id:
            return join(serverbase, '..', path)
    return None
def get_documents():
    """
    Downloads a set of documents from couchdb via akara. If there is an error
    with downloading the documents, the script exits.

    Arguments:
        None

    Returns:
        The response content on success; exits the script on failure.
    """
    logging.info('Getting documents from akara.')
    h = httplib2.Http()
    h.force_exception_to_status_code = True
    url = join(conf['AKARA_SERVER'], conf['GET_DOCUMENTS_URL']) + "?limit=%s" % conf['GET_DOCUMENTS_LIMIT']
    logging.debug('Using akara url: ' + url)
    resp, content = h.request(url, 'GET')
    if str(resp.status).startswith('2'):
        return content
    else:
        logging.error("Couldn't get documents using: " + url)
        logging.error("Emergency exit…")
        exit(1)
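# Hedged sketch (not part of the original scripts): one plausible way the
# get_documents/save_document pair could be driven together. The assumption
# that get_documents returns a CouchDB-style view result ({"rows": [{"id": ...,
# "value": {...}}, ...]}) is inferred from the fields save_document reads.
def _example_fetch_and_save():
    raw = get_documents()
    for row in json.loads(raw).get("rows", []):
        if not save_document(row):                 # save_document reads row[u'id'] and row[u'value']
            logging.error("Giving up on document %s" % row.get(u'id'))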
def akara_calendar(highlight=None):
    '''
    Return a calendar in HTML

    Generates a calendar along the lines of:

        <   January, 2007   >
        Mo Tu We Th Fr Sa Su
         1  2  3  4  5  6  7
         8  9 10 11 12 13 14
        15 16 17 18 19 20 21
        22 23 24 25 26 27 28
        29 30 31

    Marks present date and those that have entries with archive links

    Defines the following classes (for use in CSS customization):
    - akaraCalCalendar - calendar table (note: month/year header e.g. January 2007 is in table/th)
    - akaraCalCalendarWeekHeaders - week header (Su, Mo, Tu, ...)
    - akaraCalCalendarEmpty - filler cell (e.g. days after Jan 31)
    - akaraCalCalendarLive - day for which there is an entry (also has links to that day's archives)

    And the following IDs:
    - akaraCalCalendarToday - today's calendar day
    - akaraCalCalendarSpecificDay - specific day being rendered (if any)

    Some ideas (e.g. CSS styling of the table) from pycalendar.py by Will Guaraldi

    Sample request:
    curl http://localhost:8880/akara.calendar
    curl http://localhost:8880/akara.calendar/2008/12
    curl http://localhost:8880/akara.calendar/2008/12?highlight=2008-12-03
    '''
    baseuri = request.environ['SCRIPT_NAME'] + '/'
    today = date.today()
    year = shift_path_info(request.environ)
    month = shift_path_info(request.environ)
    if highlight:
        #Fun axiom: date(*map(int, date.today().isoformat().split('-')))
        highlight = date(*map(int, highlight.split('-')))
    if year and month:
        #Use specified year & month
        year, month = int(year), int(month)
        if (year, month) == (today.year, today.month):
            present_day = today.day
        else:
            present_day = None
    else:
        #XXX We might want to return Bad Request if they specified year but not day
        #Use present year & month
        year, month = today.year, today.month
        present_day = today.day
    #logger.debug("year: " + repr(year))
    dayheaders = ''.join(
        ['<td>%s</td>' % dh for dh in calendar.weekheader(3).split()])
    monthcal = calendar.monthcalendar(year, month)
    c = []
    for wk in monthcal:
        c.append('<tr>\n')
        for d in wk:
            d_int = int(d)
            attrs = ''
            if d_int < 1:
                d = ' '
                fulldate = date.max #never to be found in archives
                attrs += ' class="akaraCalCalendarEmpty"'
            else:
                fulldate = date(year, month, d_int)
            # "today" trumps "specific day"
            if d_int == present_day:
                attrs += ' id="akaraCalCalendarToday"'
            elif highlight and d_int == highlight.day:
                attrs += ' id="akaraCalCalendarSpecificDay"'
            #if fulldate in archives:
            #    attrs += ' class="akaraCalCalendarLive"'
            #d = '<a href="%s%i/%i/%s/">%s</a>'%(self.weblog_base_url, year, month, d, d)
            #d = '%s'%(d)
            c.append('\t<td%s>%s</td>\n' % (attrs, d))
        c.append('\n</tr>\n')
    monthname = calendar.month_name[month]
    prevmonth = date(year, month, 1) + relativedelta(months=-1)
    nextmonth = date(year, month, 1) + relativedelta(months=+1)
    #Yes, even checking if prevmonth > today, so if someone surfs
    #3 months into the future, there will be no month nav links
    if prevmonth > today:
        prevmonth = ''
    else:
        #prevmonth = '<th><a href="%s%i/%i/"><<</a></th>'%(self.weblog_base_url, prevmonth.year, prevmonth.month)
        prevmonth = '<th><a href="%s"><<</a></th>' % (join(
            baseuri, str(prevmonth.year), str(prevmonth.month)))
    if nextmonth > today:
        nextmonth = ''
    else:
        nextmonth = '<th><a href="%s">>></a></th>' % (join(
            baseuri, str(nextmonth.year), str(nextmonth.month)))
    month = ''.join(c)
    cal = CAL_TEMPLATE.safe_substitute(locals())
    return cal
def enrich(body, ctype):
    '''
    Establishes a pipeline of services identified by an ordered list of URIs provided
    in two request headers, one for collections and one for records
    '''
    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = "%s-%s" % (source_name, collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
    }
    enriched_coll_text = pipe(COLL, ctype, coll_enrichments, 'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)
    if COUCH_DATABASE:
        docuri = couch_rev_check(join(COUCH_DATABASE, cid))
        resp, cont = H.request(docuri, 'PUT', body=enriched_coll_text, headers=CT_JSON)
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: " + repr((resp, cont)))

    # Then the records
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['original_record'] = record.copy()

        # Add collection information
        record[u'collection'] = {
            '@id': at_id,
            'title': enriched_collection.get('title', "")
        }

        # Set id to value of the first handle, disambiguated w/ source. Not sure if
        # one is guaranteed or on what scale it's unique
        rid = "%s-%s" % (source_name, record[u'handle'][0].strip())
        record[u'_id'] = rid

        enriched_record = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')

        if COUCH_DATABASE:
            docuri = couch_rev_check(join(COUCH_DATABASE, rid))
            resp, cont = H.request(docuri, 'PUT', body=enriched_record, headers=CT_JSON)
            if not str(resp.status).startswith('2'):
                logger.debug("Error storing record in Couch: " + repr((resp, cont)))
                continue

    return json.dumps({})
form_vars["savetext"] = open(temp_fpath, "r").read() url = absolutize(page, base) data = urllib.urlencode(form_vars) request = urllib2.Request(url, data, req_headers) try: logger.debug('Prior to urllib2.opener') with closing(opener.open(request)) as resp: logger.debug('Return from urllib2.opener') doc = htmlparse(resp) raise_embedded_error(doc) logger.debug('HTML parse complete post urllib2.opener') except urllib2.URLError, e: raise UnexpectedResponseError(url=url, code=e.code, error=str(e)) wrapped_url = join(wrapped_wiki_base, page) msg = 'Page updated OK: %s (%s)' % (url, wrapped_url) #response.add_header("Content-Length", str(len(msg))) moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page headers = [ ("Content-Type", "text/plain"), ("Content-Location", wrapped_url), (moin.ORIG_BASE_HEADER, moin_base_info), (moin.WIKI_RELATIVE_HEADER, relativize(wrapped_url, wrapped_wiki_base)), ] start_response(status_response(httplib.CREATED), headers) return [msg] # POST handler
form_vars["savetext"] = open(temp_fpath, "r").read() url = absolutize(page, base) data = urllib.urlencode(form_vars) request = urllib2.Request(url, data, req_headers) try: logger.debug('Prior to urllib2.opener') with closing(opener.open(request)) as resp: logger.debug('Return from urllib2.opener') doc = htmlparse(resp) raise_embedded_error(doc) logger.debug('HTML parse complete post urllib2.opener') except urllib2.URLError,e: raise UnexpectedResponseError(url=url,code=e.code,error=str(e)) wrapped_url = join(wrapped_wiki_base, page) msg = 'Page updated OK: %s (%s)'%(url, wrapped_url) #response.add_header("Content-Length", str(len(msg))) moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page headers = [ ("Content-Type", "text/plain"), ("Content-Location", wrapped_url), (moin.ORIG_BASE_HEADER, moin_base_info), (moin.WIKI_RELATIVE_HEADER, relativize(wrapped_url, wrapped_wiki_base)), ] start_response(status_response(httplib.CREATED), headers) return [msg] # POST handler @dispatcher.method("POST")
def enrich(body, ctype):
    '''
    Establishes a pipeline of services identified by an ordered list of URIs provided
    in two request headers, one for collections and one for records
    '''
    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = COUCH_ID_BUILDER(source_name, collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
        "ingestType": "collection"
    }
    # Set collection title field from collection_name if no sets
    if not coll_enrichments[0]:
        COLL['title'] = collection_name
    set_ingested_date(COLL)

    enriched_coll_text = pipe(COLL, ctype, coll_enrichments, 'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)
    # FIXME. Integrate collection storage into bulk call below
    if COUCH_DATABASE:
        docuri = join(COUCH_DATABASE, cid)
        couch_rev_check_coll(docuri, enriched_collection)
        resp, cont = H.request(docuri, 'PUT', body=json.dumps(enriched_collection),
                               headers=dict(CT_JSON.items() + COUCH_AUTH_HEADER.items()))
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: " + repr((resp, cont)))

    # Then the records
    docs = []
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()

        # Add collection information
        record[u'collection'] = {
            '@id': at_id,
            'name': enriched_collection.get('title', "")
        }
        if 'description' in enriched_collection:
            record[u'collection']['description'] = enriched_collection.get('description', "")

        record[u'ingestType'] = 'item'
        set_ingested_date(record)

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        docs.append(doc)

    couch_rev_check_recs(docs, source_name)
    couch_docs_text = json.dumps({"docs": docs})

    if COUCH_DATABASE:
        resp, content = H.request(join(COUCH_DATABASE, '_bulk_docs'), 'POST', body=couch_docs_text,
                                  headers=dict(CT_JSON.items() + COUCH_AUTH_HEADER.items()))
        logger.debug("Couch bulk update response: " + content)
        if not str(resp.status).startswith('2'):
            logger.debug('HTTP error posting to CouchDB: ' + repr((resp, content)))

    return json.dumps({'docs': docs})
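# Hedged client-side sketch (not part of the original service): how a caller
# might invoke the enrich endpoint above, supplying the Source, Collection,
# Pipeline-Coll and Pipeline-Rec headers that the handler reads via
# copy_headers_to_dict. The endpoint URL, enrichment-service URIs, and sample
# record are placeholders, not documented values.
def _example_enrich_call():
    import json
    import httplib2
    h = httplib2.Http()
    headers = {
        'Content-Type': 'application/json',
        'Source': 'example_source',                                # hypothetical source name
        'Collection': 'example_set',                               # hypothetical collection name
        'Pipeline-Coll': 'http://localhost:8880/oai-set-name',     # placeholder enrichment URIs
        'Pipeline-Rec': 'http://localhost:8880/enrich-type,http://localhost:8880/shred',
    }
    body = json.dumps({u'items': [{u'handle': [u'oai:example:1'], u'title': u'A record'}]})
    resp, content = h.request('http://localhost:8880/enrich', 'POST', body=body, headers=headers)
    return resp, content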
def akara_calendar(highlight=None):
    '''
    Return a calendar in HTML

    Generates a calendar along the lines of:

        <   January, 2007   >
        Mo Tu We Th Fr Sa Su
         1  2  3  4  5  6  7
         8  9 10 11 12 13 14
        15 16 17 18 19 20 21
        22 23 24 25 26 27 28
        29 30 31

    Marks present date and those that have entries with archive links

    Defines the following classes (for use in CSS customization):
    - akaraCalCalendar - calendar table (note: month/year header e.g. January 2007 is in table/th)
    - akaraCalCalendarWeekHeaders - week header (Su, Mo, Tu, ...)
    - akaraCalCalendarEmpty - filler cell (e.g. days after Jan 31)
    - akaraCalCalendarLive - day for which there is an entry (also has links to that day's archives)

    And the following IDs:
    - akaraCalCalendarToday - today's calendar day
    - akaraCalCalendarSpecificDay - specific day being rendered (if any)

    Some ideas (e.g. CSS styling of the table) from pycalendar.py by Will Guaraldi

    Sample request:
    curl http://localhost:8880/akara.calendar
    curl http://localhost:8880/akara.calendar/2008/12
    curl http://localhost:8880/akara.calendar/2008/12?highlight=2008-12-03
    '''
    baseuri = request.environ['SCRIPT_NAME'] + '/'
    today = date.today()
    year = shift_path_info(request.environ)
    month = shift_path_info(request.environ)
    if highlight:
        #Fun axiom: date(*map(int, date.today().isoformat().split('-')))
        highlight = date(*map(int, highlight.split('-')))
    if year and month:
        #Use specified year & month
        year, month = int(year), int(month)
        if (year, month) == (today.year, today.month):
            present_day = today.day
        else:
            present_day = None
    else:
        #XXX We might want to return Bad Request if they specified year but not day
        #Use present year & month
        year, month = today.year, today.month
        present_day = today.day
    #logger.debug("year: " + repr(year))
    dayheaders = ''.join(
        ['<td>%s</td>' % dh for dh in calendar.weekheader(3).split()]
    )
    monthcal = calendar.monthcalendar(year, month)
    c = []
    for wk in monthcal:
        c.append('<tr>\n')
        for d in wk:
            d_int = int(d)
            attrs = ''
            if d_int < 1:
                d = ' '
                fulldate = date.max #never to be found in archives
                attrs += ' class="akaraCalCalendarEmpty"'
            else:
                fulldate = date(year, month, d_int)
            # "today" trumps "specific day"
            if d_int == present_day:
                attrs += ' id="akaraCalCalendarToday"'
            elif highlight and d_int == highlight.day:
                attrs += ' id="akaraCalCalendarSpecificDay"'
            #if fulldate in archives:
            #    attrs += ' class="akaraCalCalendarLive"'
            #d = '<a href="%s%i/%i/%s/">%s</a>'%(self.weblog_base_url, year, month, d, d)
            #d = '%s'%(d)
            c.append('\t<td%s>%s</td>\n' % (attrs, d))
        c.append('\n</tr>\n')
    monthname = calendar.month_name[month]
    prevmonth = date(year, month, 1) + relativedelta(months=-1)
    nextmonth = date(year, month, 1) + relativedelta(months=+1)
    #Yes, even checking if prevmonth > today, so if someone surfs
    #3 months into the future, there will be no month nav links
    if prevmonth > today:
        prevmonth = ''
    else:
        #prevmonth = '<th><a href="%s%i/%i/"><<</a></th>'%(self.weblog_base_url, prevmonth.year, prevmonth.month)
        prevmonth = '<th><a href="%s"><<</a></th>' % (join(baseuri, str(prevmonth.year), str(prevmonth.month)))
    if nextmonth > today:
        nextmonth = ''
    else:
        nextmonth = '<th><a href="%s">>></a></th>' % (join(baseuri, str(nextmonth.year), str(nextmonth.month)))
    month = ''.join(c)
    cal = CAL_TEMPLATE.safe_substitute(locals())
    return cal
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness. ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the second is the record for the
    first item in the collection/query, and so on until all the items are returned,
    or the limit is reached.

    If you want to see the debug messages, just do (before calling read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    for a nice-sized collection to try:
    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:
    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')
    '''

    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)
            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'), page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '')
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey + '.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except (IOError, ValueError):
                    pass

        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break