def _put_page(environ, start_response):
    '''
    Updates a Wiki page, using the request body as the new page text.
    '''
    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_page_edit_form(page, wiki_id, base, opener, req_headers)
    form_vars["savetext"] = open(temp_fpath, "r").read()

    url = absolutize(page, base)
    data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, data, req_headers)
    try:
        logger.debug('Prior to urllib2.opener')
        with closing(opener.open(request)) as resp:
            logger.debug('Return from urllib2.opener')
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            logger.debug('HTML parse complete post urllib2.opener')
    except urllib2.URLError, e:
        raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def _delete_page(environ, start_response):
    '''
    Deletes a Wiki page, returning 200 if successful.  Does not yet support
    the deletion of attachments.
    '''
    #The Moin form asks that this be in multipart-form format, but the
    #multipart handler falls back to url-encoding unless you pass it a file.
    #Luckily, the equivalent url-encoded request works... for now.
    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    form_vars = fill_page_delete_form(page, wiki_id, base, opener,
                                      req_headers)
    url = absolutize(page, base)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
    except urllib2.URLError, e:
        if e.code == 404:
            # Moin returns 404 on a successful DeletePage POST; recast as a 200
            pass
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ["wsgi.url_scheme"] + "://"
            if request.environ.get("HTTP_HOST"):
                prefix += request.environ["HTTP_HOST"]
            else:
                prefix += request.environ["SERVER_NAME"]
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = prefix + re.sub(r"^(?!/)", "/", uri)
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)
        if not str(resp.status).startswith("2"):
            error = "Error in enrichment pipeline at %s" % uri
            logger.error(error)
            continue
        body = cont
    return error, body
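# The relative-URI handling in pipe() above is the subtle part. Below is the
# same resolution step extracted into a standalone, runnable sketch; the
# environ dict and the "enrich_location" module path are made-up examples.
import re

def _absolutize(environ, uri):
    # Same logic as pipe(): scheme plus HTTP_HOST (falling back to
    # SERVER_NAME), then the module path with a guaranteed leading slash.
    prefix = environ["wsgi.url_scheme"] + "://"
    prefix += environ.get("HTTP_HOST") or environ["SERVER_NAME"]
    return prefix + re.sub(r"^(?!/)", "/", uri)

print(_absolutize({"wsgi.url_scheme": "http", "SERVER_NAME": "localhost"},
                  "enrich_location"))  # http://localhost/enrich_location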
def ucsb_aleph_marc_id(body, ctype):
    '''MARC sucks'''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    ident = None
    for field in data['fields']:
        if '856' in field:
            subfields = field['856']['subfields']
            for subf in subfields:
                if 'u' in subf:
                    # restrict to ones that have url like
                    # http://www.library.ucsb.edu/OBJID/Cylinder0002
                    if 'OBJID' in subf['u']:
                        ident = subf['u']
    if not ident:
        logger.error('NO 856 u for doc leader:{}'.format(data['leader']))
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, ident)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    return json.dumps(data)
def sfpl_marc_id(body, ctype):
    '''MARC sucks'''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    ident = None
    for field in data['fields']:
        if '010' in field:
            subfields = field['010']['subfields']
            for subf in subfields:
                if 'a' in subf:
                    ident = subf['a']
    if not ident:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, ident)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    return json.dumps(data)
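# Both MARC id services above walk the same MARC-in-JSON shape: a "fields"
# list of single-tag dicts, each tag holding a "subfields" list of
# single-code dicts. A runnable illustration with a made-up record:
sample = {
    "leader": "00000nam a2200000 a 4500",
    "fields": [
        {"010": {"subfields": [{"a": "12345"}]}},
        {"856": {"subfields": [
            {"u": "http://www.library.ucsb.edu/OBJID/Cylinder0002"}]}},
    ],
}

ident = None
for field in sample["fields"]:
    if "856" in field:
        for subf in field["856"]["subfields"]:
            if "u" in subf and "OBJID" in subf["u"]:
                ident = subf["u"]
print(ident)  # http://www.library.ucsb.edu/OBJID/Cylinder0002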
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list of URIs
    provided in request header "Pipeline-Item"
    """
    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")

    records = json.loads(body)

    # Counts
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        error, enriched_record_text = pipe(record, ctype, rec_enrichments,
                                           "HTTP_PIPELINE_ITEM")
        if error:
            errors.append(error)
        enriched_record = json.loads(enriched_record_text)

        if enriched_record.get("_id", None):
            ingest_type = enriched_record.get("ingestType")
            # Item records should have sourceResource
            if (ingest_type == "item" and
                    "sourceResource" not in enriched_record):
                logger.error("Record %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            logger.error("Found a record without an _id %s" % enriched_record)
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if len(uri) < 1:
            continue  # in case there's no pipeline
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.debug("Error in enrichment pipeline at %s: %s" %
                         (uri, repr(resp)))
            continue
        body = cont
    return body
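# The essential behavior of pipe(): each enrichment service receives the
# previous service's output as its request body. A toy stand-in for the HTTP
# loop, with local functions replacing the POSTs (both "services" here are
# invented for illustration):
import json

def add_ingest_type(body):
    rec = json.loads(body)
    rec["ingestType"] = "item"
    return json.dumps(rec)

def upcase_title(body):
    rec = json.loads(body)
    rec["title"] = rec.get("title", "").upper()
    return json.dumps(rec)

body = json.dumps({"title": "cylinder 2"})
for service in (add_ingest_type, upcase_title):
    body = service(body)
print(body)  # {"title": "CYLINDER 2", "ingestType": "item"}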
def selid(body, ctype, prop='handle', use_source='yes'):
    '''
    Service that accepts a JSON document and adds or sets the "id" property
    to the value of the property named by the "prop" parameter
    '''
    if not prop:
        # Remove this document
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property has been selected"

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    record_id = None
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            record_id = v
        else:
            if v:
                for h in (v if isinstance(v, list) else [v]):
                    if is_absolute(h):
                        record_id = h
                if not record_id:
                    record_id = v[0]

    if not record_id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    # If the useSource parameter is True (default) then prepend it to the id
    # and use that value when hashing for the DPLA id
    if use_source.lower() == 'yes':
        data[u'_id'] = couch_rec_id_builder(source_name, record_id)
    else:
        data[u'_id'] = clean_id(record_id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
def selid(body, ctype, prop='descriptiveNonRepeating/record_link',
          alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and adds or sets the "id" property
    to the value of the property named by the "prop" parameter
    '''
    tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    if prop:
        try:
            data = json.loads(body)
        except:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "Unable to parse body as JSON"

        request_headers = copy_headers_to_dict(request.environ)
        source_name = request_headers.get('Source')

        id = None
        if exists(data, prop) or exists(data, alternative_prop):
            v = getprop(data, prop, True)
            if not v:
                v = getprop(data, alternative_prop)
                v = tmpl % v
            if isinstance(v, basestring):
                id = v
            else:
                if v:
                    for h in v:
                        if is_absolute(h):
                            id = h
                    if not id:
                        id = v[0]

        if not id:
            response.code = 500
            response.add_header('content-type', 'text/plain')
            return "No id property was found"

        data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, id)
        data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    else:
        logger.error("Prop param is None in %s" % __name__)

    return json.dumps(data)
def enrich_storage(body, ctype):
    """Establishes a pipeline of services identified by an ordered list of URIs
    provided in request header 'Pipeline-Rec'
    """
    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')

    data = json.loads(body)

    docs = {}
    for record in data:
        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        docs[doc["_id"]] = doc

    return json.dumps(docs)
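# Calling enrich_storage from the outside, sketched with httplib2 (the same
# client the services themselves use via H). The endpoint and the pipeline
# module URIs in Pipeline-Rec are placeholders, not real deployment paths:
import json
import httplib2

h = httplib2.Http()
headers = {
    "Content-Type": "application/json",
    "Pipeline-Rec": ("http://localhost:8875/select-id,"
                     "http://localhost:8875/shred"),
}
records = [{"_id": "source--1", "title": "A record"}]
resp, content = h.request("http://localhost:8875/enrich_storage", "POST",
                          body=json.dumps(records), headers=headers)
print(json.loads(content))  # dict of enriched records keyed by _id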
def selid(body, ctype, prop='handle'):
    '''
    Service that accepts a JSON document and adds or sets the "id" property
    to the value of the property named by the "prop" parameter
    '''
    if not prop:
        # Remove this document
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property has been selected"

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    id = None
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            id = v
        else:
            if v:
                for h in (v if isinstance(v, list) else [v]):
                    if is_absolute(h):
                        id = h
                if not id:
                    id = v[0]

    if not id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, id)
    # we don't use this, dump it
    # data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://'
            prefix += (request.environ['HTTP_HOST']
                       if request.environ.get('HTTP_HOST')
                       else request.environ['SERVER_NAME'])
            uri = prefix + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s" %
                        (uri, repr(resp)))
            continue
        body = cont
    return body
def selid(body, ctype, prop='handle'):
    '''
    Service that accepts a JSON document and adds or sets the "id" property
    to the value of the property named by the "prop" parameter
    '''
    if not prop:
        # Remove this document
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property has been selected"

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    id = None
    if exists(data, prop):
        v = getprop(data, prop)
        if isinstance(v, basestring):
            id = v
        else:
            if v:
                for h in (v if isinstance(v, list) else [v]):
                    if is_absolute(h):
                        id = h
                if not id:
                    id = v[0]

    if not id:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
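# Sketch of the _id/id derivation the selid variants share. The real
# COUCH_REC_ID_BUILDER lives elsewhere in the codebase; the "--" join below
# is an assumption for illustration only:
import hashlib

def couch_rec_id_builder(source, ident):
    return "%s--%s" % (source, ident.strip())

_id = couch_rec_id_builder("clemson", "http://hdl.handle.net/123/456")
print(_id)  # clemson--http://hdl.handle.net/123/456
print(hashlib.md5(_id).hexdigest())  # the stable, hashed DPLA id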
def selectid(body, ctype):
    '''
    Service that accepts a JSON document and adds or sets its "id" property
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    try:
        select_id(source_name, data)
    except ValueError, e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    return json.dumps(data)
def post_page(environ, start_response):
    '''
    Attachments use URI path params
    (for a bit of discussion see
    http://groups.google.com/group/comp.lang.python/browse_thread/thread/4662d41aca276d99)
    '''
    #ctype = environ.get('CONTENT_TYPE', 'application/unknown')
    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    logger.debug("wiki_id,base,opener,original_page,wrapped_wiki_base=" +
                 repr((wiki_id, base, opener, original_page,
                       wrapped_wiki_base)))
    check_auth(environ, start_response, base, opener, req_headers)

    page = environ['PATH_INFO'].lstrip('/')
    page, chaff, attachment = page.partition(';attachment=')
    #print >> sys.stderr, page, attachment
    #now = datetime.now().isoformat()
    #Unfortunately because urllib2's data dicts don't give an option for
    #limiting read length, must read into memory and wrap
    #content = StringIO(environ['wsgi.input'].read(clen))
    temp_fpath = read_http_body_to_temp(environ, start_response)
    form_vars = fill_attachment_form(page, attachment, wiki_id, base, opener,
                                     req_headers)
    form_vars["file"] = open(temp_fpath, "rb")

    url = absolutize(page, base)
    #print >> sys.stderr, url, temp_fpath
    #data = urllib.urlencode(form_vars)
    request = urllib2.Request(url, form_vars, req_headers)
    try:
        with closing(opener.open(request)) as resp:
            doc = htmlparse(resp)
            raise_embedded_error(doc)
            #logger.debug('POST for attachment page response... ' + doc.xml_encode())
    except urllib2.URLError, e:
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ),
                                    backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
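# Client-side view of the attachment convention handled above: the
# ";attachment=" URI path parameter names the attachment and the raw POST
# body supplies its bytes. A hedged sketch; the proxy mount point below is a
# placeholder:
import urllib2

url = "http://localhost:8880/moin/w/SomePage;attachment=notes.txt"
urllib2.urlopen(urllib2.Request(url, data="attachment contents here"))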
def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://'
            if request.environ.get('HTTP_HOST'):
                prefix += request.environ['HTTP_HOST']
            else:
                prefix += request.environ['SERVER_NAME']
            uri = prefix + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s" %
                        (uri, repr(resp)))
            continue
        body = cont
    return body
def selectid(body, ctype):
    '''
    Service that accepts a JSON document and adds or sets its "id" property
    from the document's "identifier" values
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    objid = None
    v = getprop(data, 'identifier')
    if isinstance(v, basestring):
        objid = v
    else:
        if v:
            for h in (v if isinstance(v, list) else [v]):
                if h['text'].startswith('http://ark.cdlib.org/ark:'):
                    if is_absolute(h['text']):
                        objid = h['text']
            if not objid:
                objid = v[0]

    if not objid:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "No id property was found"

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, objid)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    data[u'isShownAt'] = objid
    data[u'isShownBy'] = objid + '/thumbnail'

    return json.dumps(data)
def get_page(environ, start_response):
    #logger.debug('get_page: ' + repr((environ['SCRIPT_NAME'], environ['PATH_INFO'])))
    req_headers = copy_headers_to_dict(environ,
                                       exclude=['HTTP_ACCEPT_ENCODING'])
    wiki_id, base, opener, original_page, wrapped_wiki_base = target(environ)
    page = environ['PATH_INFO'].lstrip('/')
    check_auth(environ, start_response, base, opener, req_headers)

    upstream_handler = None
    status = httplib.OK
    params = cgi.parse_qs(environ['QUERY_STRING'])
    #Note: probably a better solution here: http://code.google.com/p/mimeparse/
    accepted_imts = environ.get('HTTP_ACCEPT', '').split(',')
    #logger.debug('accepted_imts: ' + repr(accepted_imts))
    imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
    #logger.debug('imt: ' + repr(imt))
    params_for_moin = {}
    cache_max_age = CACHE_MAX_AGE  # max-age of this response. If set to None, it will not be used
    if NO_CACHE_PATHS and first_item(dropwhile(lambda x: x not in page,
                                               NO_CACHE_PATHS)):
        cache_max_age = None

    if 'rev' in params:
        #XXX: Not compatible with search
        #params_for_moin = {'rev' : params['rev'][0], 'action': 'recall'}
        params_for_moin = {'rev': params['rev'][0]}
    if 'search' in params:
        searchq = params['search'][0]
        query = urllib.urlencode({'value': searchq, 'action': 'fullsearch',
                                  'context': '180', 'fullsearch': 'Text'})
        #?action=fullsearch&context=180&value=foo&=Text
        url = absolutize('?' + query, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
        cache_max_age = None
    #elif 'action' in params and params['action'][0] == 'recall':
    elif moin.HTML_IMT in environ.get('HTTP_ACCEPT', ''):
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page + '?' + params, base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.HTML_IMT
    elif moin.RDF_IMT in environ.get('HTTP_ACCEPT', ''):
        #FIXME: Make unique flag optional
        #url = base + '/RecentChanges?action=rss_rc&unique=1&ddiffs=1'
        url = absolutize('RecentChanges?action=rss_rc&unique=1&ddiffs=1',
                         base)
        #print >> sys.stderr, (url, base, '/RecentChanges?action=rss_rc&unique=1&ddiffs=1', )
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.RDF_IMT
    elif moin.ATTACHMENTS_IMT in environ.get('HTTP_ACCEPT', ''):
        url = absolutize(page + '?action=AttachFile', base)
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.ATTACHMENTS_IMT

        def upstream_handler():
            #Sigh. Sometimes you have to break some Tag soup eggs to make a
            #RESTful omelette
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
            doc = htmlparse(rbody)
            raise_embedded_error(doc)
            attachment_nodes = doc.xml_select(
                u'//*[contains(@href, "action=AttachFile") and contains(@href, "do=view")]')
            targets = []
            for node in attachment_nodes:
                target = [param.split('=', 1)[1]
                          for param in node.href.split(u'&')
                          if param.startswith('target=')][0]
                targets.append(target)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'attachments'),
                      (E(u'attachment', {u'href': unicode(t)})
                       for t in targets))
                ))
            return output.read(), ctype
    #Notes on use of URI parameters - http://markmail.org/message/gw6xbbvx4st6bksw
    elif ';attachment=' in page:
        page, attachment = page.split(';attachment=', 1)
        url = absolutize(page + '?action=AttachFile&do=get&target=' +
                         attachment, base)
        request = urllib2.Request(url, None, req_headers)

        def upstream_handler():
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
                return rbody, dict(resp.info())['content-type']
    #
    elif ';history' in page:
        cache_max_age = None
        page, discard = page.split(';history', 1)
        ctype = moin.XML_IMT

        def upstream_handler():
            revs = scrape_page_history(page, base, opener, req_headers)
            output = structencoder(indent=u"yes")
            output.feed(
                ROOT(
                    E((u'history'),
                      (E(u'rev', {u'id': unicode(r['rev']),
                                  u'editor': unicode(r['editor']),
                                  u'date': unicode(r['date']).replace(' ', 'T')})
                       for r in revs))
                ))
            return output.read(), ctype
    elif imt:
        params_for_moin.update({'mimetype': imt})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.DOCBOOK_IMT
    else:
        params_for_moin.update({'action': 'raw'})
        params = urllib.urlencode(params_for_moin)
        url = absolutize(page, base) + '?' + params
        request = urllib2.Request(url, None, req_headers)
        ctype = moin.WIKITEXT_IMT

    try:
        if upstream_handler:
            rbody, ctype = upstream_handler()
        else:
            with closing(opener.open(request)) as resp:
                rbody = resp.read()
        #headers = {moin.ORIG_BASE_HEADER: base}
        #moin_base = absolutize(wiki_id, base)
        moin_base_info = base + ' ' + wrapped_wiki_base + ' ' + original_page
        response_headers = [("Content-Type", ctype),
                            ("Vary", "Accept"),
                            (moin.ORIG_BASE_HEADER, moin_base_info)]
        if cache_max_age:
            response_headers.append(("Cache-Control",
                                     "max-age=" + str(cache_max_age)))
        start_response(status_response(status), response_headers)
        return rbody
    except urllib2.URLError, e:
        if e.code == 401:
            raise HTTPAuthorizationError(url=request.get_full_url())
        if e.code == 403:
            raise MoinMustAuthenticateError(url=request.get_full_url(),
                                            target=wiki_id)
        if e.code == 404:
            raise MoinNotFoundError(fronturl=request_uri(environ),
                                    backurl=url)
        else:
            raise UnexpectedResponseError(url=url, code=e.code, error=str(e))
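# get_page dispatches on the Accept header, mapping each internet media type
# to a different Moin back-end view. A client-side sketch of that
# negotiation; the proxy URL is a placeholder, "text/html" assumes the
# conventional value of moin.HTML_IMT, and the other branches' IMT strings
# are the moin module constants (moin.RDF_IMT, moin.ATTACHMENTS_IMT, ...)
# not spelled out here:
import urllib2

url = "http://localhost:8880/moin/w/FrontPage"
for accept in ("text/html", None):  # None falls through to raw wikitext
    req = urllib2.Request(url)
    if accept:
        req.add_header("Accept", accept)
    resp = urllib2.urlopen(req)
    print("%s -> %s" % (accept, resp.info().get("content-type")))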
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for items.

    Returns a JSON dump of the collections and records enriched along with a
    count of records enriched.
    """
    request_headers = copy_headers_to_dict(request.environ)
    item_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")
    coll_enrichments = request_headers.get(u"Pipeline-Coll", "").split(",")

    records = json.loads(body)

    # Counts for enrich script
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        if record.get("ingestType") == "collection":
            wsgi_header = "HTTP_PIPELINE_COLL"
            enrichments = coll_enrichments
        else:
            wsgi_header = "HTTP_PIPELINE_ITEM"
            enrichments = item_enrichments
            # Preserve record prior to any enrichments
            record["originalRecord"] = record.copy()
            record["ingestType"] = "item"

        # Explicitly populate ingestDate as UTC
        record["ingestDate"] = iso_utc_with_tz()

        error, enriched_record_text = pipe(record, ctype, enrichments,
                                           wsgi_header)
        enriched_record = json.loads(enriched_record_text)
        if error:
            errors.append(error)

        ingest_type = record.get("ingestType")

        # Enriched record should have an _id
        if enriched_record.get("_id", None):
            # Item records should have sourceResource
            if (ingest_type == "item" and
                    "sourceResource" not in enriched_record):
                logger.error("Record %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
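# iso_utc_with_tz is defined elsewhere in the codebase; a minimal sketch of
# the value the code above plausibly stores in ingestDate (an ISO-8601 UTC
# timestamp with an explicit zone designator):
from datetime import datetime

def iso_utc_with_tz():
    return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")

print(iso_utc_with_tz())  # e.g. 2013-05-07T18:30:00Z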
def enrich(body, ctype):
    '''
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for records
    '''
    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = "%s-%s" % (source_name, collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
    }
    enriched_coll_text = pipe(COLL, ctype, coll_enrichments,
                              'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)
    if COUCH_DATABASE:
        docuri = couch_rev_check(join(COUCH_DATABASE, cid))
        resp, cont = H.request(docuri, 'PUT', body=enriched_coll_text,
                               headers=CT_JSON)
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: " +
                         repr((resp, cont)))

    # Then the records
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['original_record'] = record.copy()

        # Add collection information
        record[u'collection'] = {
            '@id': at_id,
            'title': enriched_collection.get('title', "")
        }

        # Set id to value of the first handle, disambiguated w source. Not
        # sure if one is guaranteed or on what scale it's unique
        rid = "%s-%s" % (source_name, record[u'handle'][0].strip())
        record[u'_id'] = rid

        enriched_record = pipe(record, ctype, rec_enrichments,
                               'HTTP_PIPELINE_REC')
        if COUCH_DATABASE:
            docuri = couch_rev_check(join(COUCH_DATABASE, rid))
            resp, cont = H.request(docuri, 'PUT', body=enriched_record,
                                   headers=CT_JSON)
            if not str(resp.status).startswith('2'):
                logger.debug("Error storing record in Couch: " +
                             repr((resp, cont)))
                continue

    return json.dumps({})
def enrich(body, ctype):
    '''
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for records
    '''
    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')

    data = json.loads(body)

    # First, we run the collection representation through its enrichment pipeline
    cid = COUCH_ID_BUILDER(source_name, collection_name)
    at_id = "http://dp.la/api/collections/" + cid
    COLL = {
        "_id": cid,
        "@id": at_id,
        "ingestType": "collection"
    }
    # Set collection title field from collection_name if no sets
    if not coll_enrichments[0]:
        COLL['title'] = collection_name
    set_ingested_date(COLL)

    enriched_coll_text = pipe(COLL, ctype, coll_enrichments,
                              'HTTP_PIPELINE_COLL')
    enriched_collection = json.loads(enriched_coll_text)

    # FIXME. Integrate collection storage into bulk call below
    if COUCH_DATABASE:
        docuri = join(COUCH_DATABASE, cid)
        couch_rev_check_coll(docuri, enriched_collection)
        resp, cont = H.request(docuri, 'PUT',
                               body=json.dumps(enriched_collection),
                               headers=dict(CT_JSON.items() +
                                            COUCH_AUTH_HEADER.items()))
        if not str(resp.status).startswith('2'):
            logger.debug("Error storing collection in Couch: " +
                         repr((resp, cont)))

    # Then the records
    docs = []
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()

        # Add collection information
        record[u'collection'] = {
            '@id': at_id,
            'name': enriched_collection.get('title', "")
        }
        if 'description' in enriched_collection:
            record[u'collection']['description'] = \
                enriched_collection.get('description', "")

        record[u'ingestType'] = 'item'
        set_ingested_date(record)

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)
        docs.append(doc)

    couch_rev_check_recs(docs, source_name)
    couch_docs_text = json.dumps({"docs": docs})

    if COUCH_DATABASE:
        resp, content = H.request(join(COUCH_DATABASE, '_bulk_docs'), 'POST',
                                  body=couch_docs_text,
                                  headers=dict(CT_JSON.items() +
                                               COUCH_AUTH_HEADER.items()))
        logger.debug("Couch bulk update response: " + content)
        if not str(resp.status).startswith('2'):
            logger.debug('HTTP error posting to CouchDB: ' +
                         repr((resp, content)))

    return json.dumps({'docs': docs})
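# Shape of the _bulk_docs exchange above, per CouchDB's bulk document API:
# the request body is a single JSON object with a "docs" array, and CouchDB
# answers with one status object per submitted doc. The _id below is
# invented for illustration:
import json

couch_docs_text = json.dumps({"docs": [
    {"_id": "uva--http://hdl.handle.net/123/456", "ingestType": "item"},
]})
# Typical response body:
# [{"ok": true, "id": "uva--http://...", "rev": "1-967a00dff5e02add41..."}]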
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for records.

    Returns a JSON dump of the collections and records enriched along with a
    count of records enriched.
    """
    request_headers = copy_headers_to_dict(request.environ)
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')
    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')

    data = json.loads(body)

    provider = data['provider']
    collection = data['collection']
    contributor = data['contributor']

    # Enrich collection first
    if collection:
        coll_id = collection.get('id')
        desc = collection.get('description')
        title = collection.get('title')
        COLLECTIONS[coll_id] = enrich_coll(ctype, provider, coll_id,
                                           coll_enrichments, title, desc)

    docs = {}
    for record in data['records']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()

        # Set ingestType, provider, and ingestDate
        record[u'ingestType'] = 'item'
        record[u'provider'] = contributor
        set_ingested_date(record)

        # Add collection(s)
        record[u'collection'] = []
        # OAI records can be part of multiple collections whose titles are
        # listed in the record's "setSpec" property
        sets = record.get('setSpec')
        if sets:
            for set_id in iterify(sets):
                if set_id not in COLLECTIONS:
                    COLLECTIONS[set_id] = enrich_coll(ctype, provider,
                                                      set_id,
                                                      coll_enrichments)
                record[u'collection'].append(create_record_collection(
                    COLLECTIONS[set_id])
                )
            if len(record[u'collection']) == 1:
                record[u'collection'] = record[u'collection'][0]
        elif collection:
            record[u'collection'] = create_record_collection(
                COLLECTIONS[coll_id]
            )

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)

        # After pipe doc must have _id and sourceResource
        if doc.get("_id", None):
            if "sourceResource" in doc:
                logger.debug("Enriched record %s" % doc["_id"])
                docs[doc["_id"]] = doc
            else:
                logger.error("Document %s does not have sourceResource: %s" %
                             (doc["_id"], doc))
        else:
            logger.error("Document does not have an _id: %s" % doc)

    enriched_records_count = len(docs)

    # Add collections to docs
    for collection in COLLECTIONS.values():
        docs[collection["_id"]] = collection

    data = {
        "enriched_records": docs,
        "enriched_records_count": enriched_records_count
    }

    return json.dumps(data)
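# iterify and create_record_collection are helpers defined elsewhere; their
# assumed behavior, reconstructed from the call sites (iterify matches the
# inline "v if isinstance(v, list) else [v]" idiom used throughout, and
# create_record_collection mirrors rec_collection in the enrich variant
# that follows):
def iterify(value):
    # Wrap scalars so callers can always iterate
    return value if isinstance(value, list) else [value]

def create_record_collection(coll):
    # Project the enriched collection down to the fields a record embeds,
    # dropping empty values
    keep = ('id', '@id', 'title', 'description')
    return dict((k, coll[k]) for k in keep if coll.get(k))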
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for records
    """
    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')
    collection_name = request_headers.get('Collection')

    if not (collection_name or source_name):
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Source and Collection request headers are required"

    coll_enrichments = request_headers.get(u'Pipeline-Coll', '').split(',')
    rec_enrichments = request_headers.get(u'Pipeline-Rec', '').split(',')

    data = json.loads(body)

    # For non-OAI, the collection title is included as part of the data,
    # so we extract it here to pass to enrich_coll a few lines down.
    # For OAI, the collection enrichment pipeline will set the title, and
    # so None will be overridden.
    collection_title = data.get("title", None)

    docs = {}
    for record in data[u'items']:
        # Preserve record prior to any enrichments
        record['originalRecord'] = record.copy()

        # Add collection(s)
        record[u'collection'] = []
        sets = record.get('setSpec', collection_name)
        for s in (sets if isinstance(sets, list) else [sets]):
            if s not in COLLECTIONS:
                COLLECTIONS[s] = enrich_coll(ctype, source_name, s,
                                             collection_title,
                                             coll_enrichments)
            rec_collection = {
                'id': COLLECTIONS[s].get('id', None),
                '@id': COLLECTIONS[s].get('@id', None),
                'title': COLLECTIONS[s].get('title', None),
                'description': COLLECTIONS[s].get('description', None)
            }
            record[u'collection'].append(dict((k, v) for k, v in
                                              rec_collection.items() if v))
        if len(record[u'collection']) == 1:
            record[u'collection'] = record[u'collection'][0]

        record[u'ingestType'] = 'item'
        set_ingested_date(record)

        doc_text = pipe(record, ctype, rec_enrichments, 'HTTP_PIPELINE_REC')
        doc = json.loads(doc_text)

        # After pipe doc must have _id and sourceResource
        if doc.get("_id", None):
            if "sourceResource" in doc:
                docs[doc["_id"]] = doc
            else:
                logger.error("Document does not have sourceResource: %s" %
                             doc["_id"])

    # Add collections to docs
    for collection in COLLECTIONS.values():
        docs[collection["_id"]] = collection

    return json.dumps(docs)