def pipe(content, ctype, enrichments, wsgi_header):
    """
    Push ``content`` through each enrichment service in turn.

    content - JSON-serializable document; it is serialized once and used as
              the body of the first POST
    ctype - value sent as the content-type header of every POST
    enrichments - iterable of service URIs; empty entries are skipped and
                  relative ones are resolved against the scheme and host of
                  the current request
    wsgi_header - header name handed to copy_headers_to_dict in its
                  ``exclude`` list

    Returns a tuple (error, body).  ``error`` is the message for the last
    step that did not answer with a 2xx status (None if none failed);
    ``body`` is the output of the last successful step, or the serialized
    input if no step succeeded.  A failing step is logged and skipped, so
    the following steps still run on the previous body.
    """
    payload = json.dumps(content)
    last_error = None

    for uri in enrichments:
        if not uri:
            # in case there's no pipeline
            continue

        target = uri
        if not is_absolute(uri):
            env = request.environ
            base = env["wsgi.url_scheme"] + "://"
            base += env["HTTP_HOST"] if env.get("HTTP_HOST") else env["SERVER_NAME"]
            # Join the base and the given pipeline module path, ensuring
            # the path starts with "/".
            target = base + re.sub(r"^(?!/)", "/", uri)

        outgoing = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        outgoing["content-type"] = ctype

        logger.debug("Calling url: %s " % target)
        resp, cont = H.request(target, "POST", body=payload, headers=outgoing)

        if str(resp.status).startswith("2"):
            payload = cont
        else:
            last_error = "Error in enrichment pipeline at %s" % target
            logger.error(last_error)

    return last_error, payload
def is_shown_at_transform(d):
    """
    Build the "isShownAt" structure from the record's "handle" field.

    d - mapping with a "handle" entry (a string or an iterable of strings)
        and optional "format" and "rights" entries

    Returns {"isShownAt": {...}} where "@id" is the first handle for which
    is_absolute() is true (or "" if there is none) and "format"/"rights"
    are copied from d (None when absent).
    """
    handle = d["handle"]
    # A single string is treated as a one-element list
    candidates = [handle] if isinstance(handle, basestring) else handle
    first_absolute = next((c for c in candidates if is_absolute(c)), "")

    return {
        "isShownAt": {
            "@id": first_absolute,
            "format": d.get("format", None),
            "rights": d.get("rights", None),
        }
    }
def source_transform(d):
    """
    Pick the record's source URL from its "handle" field.

    d - mapping with a "handle" entry, either a single string or an
        iterable of strings

    Returns {"source": url} for the first handle for which is_absolute()
    is true, or {} when no handle qualifies.
    """
    handle = d["handle"]
    # A bare string must be wrapped: iterating it directly would test its
    # individual characters, so a single absolute URL would never match.
    candidates = [handle] if isinstance(handle, basestring) else handle

    for candidate in candidates:
        if is_absolute(candidate):
            return {"source": candidate}
    return {}
def __new__(cls, arg, uri=None, encoding=None, resolver=None, sourcetype=0):
    """
    Build an input source from arg, for passing to the Amara APIs.

    arg - what to read from: a string holding XML (or a Unicode object, only
          if you really know what you're doing), a file-like object (anything
          with a ``read`` attribute), a urllib2.Request, a file path or a
          URI.  An InputSource instance is returned as is.
    uri - optional base URI.  It is used for file-like objects and XML
          strings (a uuid URN is generated when it is omitted); for
          requests, URIs and file paths the URI is derived from arg instead.
    encoding - handed through to InputSource.__new__
    resolver - object whose resolve() opens requests and URIs; defaults to
               DEFAULT_RESOLVER
    sourcetype - pass XMLSTRING to have a string arg treated as XML text
                 without applying the isxml() heuristic

    Raises ValueError for an empty string, and for a string that is not
    recognized as XML or an absolute URI and is too long
    (MAX_URI_LENGTH_FOR_HEURISTIC) to be tried as a path.
    """
    # Imported here rather than at module level: amara.lib.iri is an
    # expensive import, and this also sidesteps circular imports
    from amara.lib.iri import is_absolute, os_path_to_uri
    from amara.lib.irihelpers import DEFAULT_RESOLVER

    if isinstance(arg, InputSource):
        return arg

    #Comparing against (u'', '') instead triggers a UnicodeWarning
    #(Unicode equal comparison failed to convert both arguments)
    if arg == '':
        #FIXME L10N
        raise ValueError("Cannot parse an empty string as XML")

    if not resolver:
        resolver = DEFAULT_RESOLVER

    if isinstance(arg, urllib2.Request):
        #get_full_url is one of the rightly labeled "lame" helper methods in urllib2 ;)
        base, stream = arg.get_full_url(), resolver.resolve(arg)
    elif hasattr(arg, 'read'):
        #File-like object: create a dummy URI to use as base if none given
        base, stream = uri or uuid4().urn, arg
    #XXX: Should we at this point refuse to proceed unless it's a basestring?
    elif sourcetype == XMLSTRING or isxml(arg):
        #See this article about XML detection heuristics
        #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
        base, stream = uri or uuid4().urn, StringIO(arg)
    elif is_absolute(arg) and not os.path.isfile(arg):
        base, stream = arg, resolver.resolve(arg)
    elif len(arg) < MAX_URI_LENGTH_FOR_HEURISTIC:
        #Short enough to plausibly be a file path; beyond that length
        #we don't even try it as a URI
        base = os_path_to_uri(arg)
        stream = resolver.resolve(base)
    else:
        #FIXME L10N
        raise ValueError("Does not appear to be well-formed XML")

    #We might add the ability to load zips, gzips & bzip2s
    #http://docs.python.org/lib/module-zlib.html
    #http://docs.python.org/lib/module-gzip.html
    #http://docs.python.org/lib/module-bz2.html
    #http://docs.python.org/lib/zipfile-objects.html

    #InputSource.__new__ is in C: expat/input_source.c:inputsource_new
    return InputSource.__new__(cls, stream, base, encoding)
def cdl_identify_object(body, ctype):
    """
    Set the document's "object" field to the URL where the thumbnail is
    expected to be found.

    body - JSON text of the document
    ctype - content type of the request (not used here)

    The URL is the first absolute value in "object", overridden by the
    first absolute value in "originalRecord/doc/isShownBy" when that field
    has one.  For content.cdlib.org URLs the object id segment is aligned
    with the last segment of "isShownAt", and a "hi-res" URL is recorded in
    "hasView" before being turned into its "thumbnail" counterpart.  When no
    URL is found the "object" field is removed.

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when body is not valid JSON.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Fields are consulted in order; a later field only replaces the URL
    # when it actually contains an absolute one.
    url = None
    for field in ("object", "originalRecord/doc/isShownBy"):
        if not exists(data, field):
            continue
        value = getprop(data, field)
        candidates = [value] if isinstance(value, basestring) else value
        url = next((c for c in candidates if is_absolute(c)), url)

    if not url:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)
        return json.dumps(data)

    if 'content.cdlib.org' in url:
        base_url, obj_id, object_type = url.rsplit("/", 2)
        _, is_shown_at_id = getprop(data, "isShownAt").rsplit("/", 1)

        if obj_id != is_shown_at_id:
            logger.warn(
                "Object url for %s has ARK value (%s) that does not match isShownAt (%s)"
                % (data["_id"], obj_id, is_shown_at_id))
            # Trust the id from isShownAt and rebuild the URL around it
            url = "/".join([base_url, is_shown_at_id, object_type])

        if object_type == "hi-res":
            setprop(data, "hasView", {"@id": url})
            url = url.replace('hi-res', 'thumbnail')

    setprop(data, "object", url)
    return json.dumps(data)
def is_shown_at_transform(d):
    """
    Map the record's "handle" field to "isShownAt".

    d - mapping with a "handle" entry (a string or an iterable of strings)

    Returns {"isShownAt": url} where url is the first handle for which
    is_absolute() is true, or None when there is none.
    """
    handles = d["handle"]
    if isinstance(handles, basestring):
        # A single string counts as a one-element list
        handles = [handles]

    first_absolute = next((h for h in handles if is_absolute(h)), None)
    return {"isShownAt": first_absolute}
def cdl_identify_object(body, ctype):
    """
    Set the document's "object" field to the URL where the thumbnail is
    expected to be found.

    body - JSON text of the document
    ctype - content type of the request (not used here)

    Candidate URLs come from "object" and then from
    "originalRecord/doc/isShownBy"; within each field the first absolute
    value is taken, and the second field wins when it has one.  For
    content.cdlib.org URLs the object id segment is replaced by the last
    segment of "isShownAt" when they differ, and a "hi-res" URL is stored
    in "hasView" and swapped for its "thumbnail" form.  Without any URL the
    "object" field is deleted.

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when body is not valid JSON.
    """
    def first_absolute(field, fallback):
        # First absolute value of the field, or fallback if it has none
        value = getprop(data, field)
        for item in ([value] if isinstance(value, basestring) else value):
            if is_absolute(item):
                return item
        return fallback

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    url = None
    if exists(data, "object"):
        url = first_absolute("object", url)
    if exists(data, "originalRecord/doc/isShownBy"):
        url = first_absolute("originalRecord/doc/isShownBy", url)

    if url:
        if 'content.cdlib.org' in url:
            base_url, obj_id, object_type = url.rsplit("/", 2)
            shown_at_parts = getprop(data, "isShownAt").rsplit("/", 1)
            is_shown_at_base, is_shown_at_id = shown_at_parts
            if obj_id != is_shown_at_id:
                logger.warn("Object url for %s has ARK value (%s) that does not match isShownAt (%s)" % (data["_id"], obj_id, is_shown_at_id))
                # The id from isShownAt takes precedence
                obj_id = is_shown_at_id
                url = "/".join([base_url, obj_id, object_type])
            if object_type == "hi-res":
                setprop(data, "hasView", {"@id": url})
                url = url.replace('hi-res', 'thumbnail')
        setprop(data, "object", url)
    else:
        logger.warn("No url found for object in id %s" % data["_id"])
        delprop(data, "object", True)

    return json.dumps(data)
def is_shown_at_transform(d):
    """
    Map the record's "handle" field to "isShownAt".

    d - mapping with a "handle" entry (a string or an iterable of strings)

    Returns {"isShownAt": url}, url being the first handle for which
    is_absolute() is true, or None if no handle qualifies.
    """
    handle = d["handle"]
    # Normalize the single-string form to a list before scanning
    candidates = [handle] if isinstance(handle, basestring) else handle
    absolute = (c for c in candidates if is_absolute(c))
    return {"isShownAt": next(absolute, None)}
def evaluate_as_nodeset(self, context):
    """
    Resolve the function's URI argument(s) and return the corresponding
    parsed documents as a node-set.

    context - the evaluation context; its instruction supplies the default
              base URI, context.documents caches parsed documents by URI
              and context.transform.root.sources holds in-memory sources

    The first argument gives the URI(s): each node of a node-set, or the
    string value of anything else.  The optional second argument is a
    node-set whose first node supplies the base URI; without it the base is
    each node's own xml_base (node-set case) or the instruction's baseUri.

    Raises XsltRuntimeError (DOC_FUNC_EMPTY_NODESET) when the second
    argument evaluates to an empty node-set.
    """
    uri_expr, base_expr = self._args
    explicit_base = base_expr is not None

    if not explicit_base:
        base_uri = context.instruction.baseUri
    else:
        # Only the first node of the second argument matters
        for base_node in base_expr.evaluate_as_nodeset(context):
            base_uri = base_node.xml_base
            break
        else:
            raise XsltRuntimeError(XsltError.DOC_FUNC_EMPTY_NODESET,
                                   context.instruction)

    value = uri_expr.evaluate(context)
    if isinstance(value, datatypes.nodeset):
        uris = set()
        for node in value:
            ref = datatypes.string(node)
            if not explicit_base:
                # Each node is resolved against its own base
                base_uri = node.xml_base
            assert base_uri or iri.is_absolute(ref)
            uris.add(iri.absolutize(ref, base_uri))
    else:
        ref = datatypes.string(value)
        assert base_uri or iri.is_absolute(ref)
        uris = [iri.absolutize(ref, base_uri)]

    cache = context.documents
    sources = context.transform.root.sources
    loaded = []
    for uri in uris:
        if uri in cache:
            loaded.append(cache[uri])
            continue
        # Not parsed yet: prefer an in-memory source registered on the
        # transform, otherwise load from the URI itself
        if uri in sources:
            doc = amara.parse(StringIO(sources[uri]), uri)
        else:
            doc = amara.parse(uri)
        cache[uri] = doc
        loaded.append(doc)
    return datatypes.nodeset(loaded)
def selid(body, ctype, prop='handle', use_source='yes'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter

    body - JSON text of the document
    ctype - content type of the request (not used here)
    prop - path of the property holding the identifier
    use_source - 'yes' (case-insensitive) to build "_id" from the request's
                 Source header plus the identifier; anything else builds it
                 from the identifier alone

    A string value is used directly.  For other values the last absolute
    entry is used, falling back to the first entry.  "id" is the MD5 hex
    digest of "_id".

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when no prop is given, the body is not
    valid JSON or no identifier is found.
    '''
    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    if not prop:
        # Remove this document
        return fail("No id property has been selected")

    try:
        data = json.loads(body)
    except:
        return fail("Unable to parse body as JSON")

    source_name = copy_headers_to_dict(request.environ).get('Source')

    record_id = None
    if exists(data, prop):
        value = getprop(data, prop)
        if isinstance(value, basestring):
            record_id = value
        elif value:
            for candidate in (value if isinstance(value, list) else [value]):
                if is_absolute(candidate):
                    record_id = candidate
            record_id = record_id or value[0]

    if not record_id:
        return fail("No id property was found")

    # If the use_source parameter is 'yes' (default) then prepend the source
    # to the id and use that value when hashing for the DPLA id
    if use_source.lower() == 'yes':
        couch_id = couch_rec_id_builder(source_name, record_id)
    else:
        couch_id = clean_id(record_id)
    data[u'_id'] = couch_id
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
def oaisetname(body, ctype, sets_service=None):
    '''
    Service that accepts a JSON document and sets its "title" (and, when
    available, "description") by looking the set up in the list returned by
    the service passed in the 'sets_service' parameter.

    body - JSON text of the document; the part of its "_id" after the first
           "--" (or the whole "_id") is the set spec to look for
    ctype - content type of the request (not used here)
    sets_service - URL of the sets service; a relative URL is resolved
                   against the scheme and host of the current request

    The service response is read as a JSON list of objects with "setSpec",
    "setName" and "setDescription" entries.

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when no service is given or when the body
    or the service response is not valid JSON.
    '''
    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    if not sets_service:
        return fail("No set service has been selected")

    try:
        data = json.loads(body)
    except:
        return fail("Unable to parse body as JSON")

    if not is_absolute(sets_service):
        env = request.environ
        base = env['wsgi.url_scheme'] + '://'
        base += env['HTTP_HOST'] if env.get('HTTP_HOST') else env['SERVER_NAME']
        sets_service = base + sets_service

    H = httplib2.Http('/tmp/.cache')
    # Transport failures come back as a status code rather than an exception
    H.force_exception_as_status_code = True
    resp, content = H.request(sets_service)
    if not resp[u'status'].startswith('2'):
        # Only reported; the content is still handed to the JSON parser below
        print >> sys.stderr, ' HTTP error (' + resp[u'status'] + ') resolving URL: ' + sets_service

    try:
        sets = json.loads(content)
    except:
        return fail("Unable to parse sets service result as JSON: " + repr(content))

    doc_id = data['_id']
    _, separator, tail = doc_id.partition('--')
    wanted = tail if separator else doc_id

    for entry in sets:
        if wanted != entry['setSpec']:
            continue
        data[u'title'] = entry['setName']
        if entry['setDescription']:
            data[u'description'] = entry['setDescription'].strip()
        break

    return json.dumps(data)
def selid(body, ctype, prop='descriptiveNonRepeating/record_link', alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter

    body - JSON text of the document
    ctype - content type of the request (not used here)
    prop - path of the property holding the record link
    alternative_prop - path of the property used when prop has no value;
                       its value is inserted into a collections.si.edu
                       search URL

    A string value is used directly.  For other values the last absolute
    entry is used, falling back to the first entry.  "_id" is built from
    the request's Source header and the identifier; "id" is the MD5 hex
    digest of "_id".

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when no prop is given, the body is not
    valid JSON or no identifier is found.
    '''
    tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    if not prop:
        # Without this early return the function would go on to serialize
        # a document that was never parsed and die with a NameError.
        logger.error("Prop param in None in %s" % __name__)
        return fail("No id property has been selected")

    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        return fail("Unable to parse body as JSON")

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    record_id = None
    if exists(data, prop) or exists(data, alternative_prop):
        v = getprop(data, prop, True)
        if not v:
            # No record link: build one from the record ID
            v = getprop(data, alternative_prop)
            v = tmpl % v
        if isinstance(v, basestring):
            record_id = v
        elif v:
            # No break on purpose: the last absolute entry wins
            for h in v:
                if is_absolute(h):
                    record_id = h
            if not record_id:
                record_id = v[0]

    if not record_id:
        return fail("No id property was found")

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, record_id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
def oaisetname(body,ctype,sets_service=None):
    '''
    Service that accepts a JSON document and sets its "title" (and, when
    available, "description") by looking the set up in the list returned by
    the service passed in the 'sets_service' parameter.

    body - JSON text of the document; the part of its "_id" after the first
           "--" (or the whole "_id") is the set spec to look for
    ctype - content type of the request (not used here)
    sets_service - URL of the sets service; a relative URL is resolved
                   against the scheme and host of the current request

    The service response is read as a JSON list of objects with "setSpec",
    "setName" and "setDescription" entries.

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when no service is given or when the body
    or the service response is not valid JSON.
    '''
    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type','text/plain')
        return message

    if not sets_service:
        return fail("No set service has been selected")

    try:
        data = json.loads(body)
    except:
        return fail("Unable to parse body as JSON")

    if not is_absolute(sets_service):
        scheme = request.environ['wsgi.url_scheme']
        if request.environ.get('HTTP_HOST'):
            host = request.environ['HTTP_HOST']
        else:
            host = request.environ['SERVER_NAME']
        sets_service = scheme + '://' + host + sets_service

    H = httplib2.Http('/tmp/.cache')
    # Transport failures come back as a status code rather than an exception
    H.force_exception_as_status_code = True
    resp, content = H.request(sets_service)
    status = resp[u'status']
    if not status.startswith('2'):
        # Only reported; the content is still handed to the JSON parser below
        print >> sys.stderr, ' HTTP error ('+status+') resolving URL: '+sets_service

    try:
        sets = json.loads(content)
    except:
        return fail("Unable to parse sets service result as JSON: " + repr(content))

    doc_id = data['_id']
    marker = doc_id.find('--')
    if marker > -1:
        wanted = doc_id[marker+2:]
    else:
        wanted = doc_id

    for entry in sets:
        if wanted == entry['setSpec']:
            data[u'title'] = entry['setName']
            if entry['setDescription']:
                data[u'description'] = entry['setDescription'].strip()
            break

    return json.dumps(data)
def map_is_shown_at(self, index=None):
    """
    Set mapped_data["isShownAt"] from the absolute URLs found in the
    provider's "handle" field.

    index - optional position (anything int() accepts) of the absolute
            handle to use; when it is missing, invalid or out of range the
            first absolute handle is used instead

    Nothing is mapped when there is no "handle" field or when none of its
    values is absolute.
    """
    if not exists(self.provider_data, "handle"):
        return

    identifiers = [handle for handle in iterify(self.provider_data["handle"])
                   if is_absolute(handle)]
    if not identifiers:
        # Without this guard the fallback to identifiers[0] below would
        # raise IndexError for records that have no absolute handle.
        return

    is_shown_at = None
    if index:
        try:
            is_shown_at = identifiers[int(index)]
        except (IndexError, TypeError, ValueError):
            # Unusable index: fall through to the first identifier
            pass

    self.mapped_data.update({"isShownAt": is_shown_at or identifiers[0]})
def map_is_shown_at(self, index=None):
    """
    Set mapped_data["isShownAt"] from the absolute URLs found in the
    provider's "handle" field.

    index - optional position (anything int() accepts) of the absolute
            handle to use; when it is missing, invalid or out of range the
            first absolute handle is used instead

    Nothing is mapped when there is no "handle" field or when none of its
    values is absolute.
    """
    if not exists(self.provider_data, "handle"):
        return

    identifiers = [
        handle for handle in iterify(self.provider_data["handle"])
        if is_absolute(handle)
    ]
    if not identifiers:
        # Without this guard the fallback to identifiers[0] below would
        # raise IndexError for records that have no absolute handle.
        return

    is_shown_at = None
    if index:
        try:
            is_shown_at = identifiers[int(index)]
        except (IndexError, TypeError, ValueError):
            # Unusable index: fall through to the first identifier
            pass

    self.mapped_data.update({"isShownAt": is_shown_at or identifiers[0]})
def selid(body,ctype,prop='descriptiveNonRepeating/record_link', alternative_prop='descriptiveNonRepeating/record_ID'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter

    body - JSON text of the document
    ctype - content type of the request (not used here)
    prop - path of the property holding the record link
    alternative_prop - path of the property used when prop has no value;
                       its value is inserted into a collections.si.edu
                       search URL

    A string value is used directly.  For other values the last absolute
    entry is used, falling back to the first entry.  "_id" is built from
    the request's Source header and the identifier; "id" is the MD5 hex
    digest of "_id".

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when no prop is given, the body is not
    valid JSON or no identifier is found.
    '''
    tmpl = "http://collections.si.edu/search/results.htm?q=record_ID%%3A%s&repo=DPLA"

    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type','text/plain')
        return message

    if not prop:
        # Without this early return the function would go on to serialize
        # a document that was never parsed and die with a NameError.
        logger.error("Prop param in None in %s" % __name__)
        return fail("No id property has been selected")

    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        return fail("Unable to parse body as JSON")

    request_headers = copy_headers_to_dict(request.environ)
    source_name = request_headers.get('Source')

    record_id = None
    if exists(data, prop) or exists(data, alternative_prop):
        v = getprop(data, prop, True)
        if not v:
            # No record link: build one from the record ID
            v = getprop(data, alternative_prop)
            v = tmpl % v
        if isinstance(v, basestring):
            record_id = v
        elif v:
            # No break on purpose: the last absolute entry wins
            for h in v:
                if is_absolute(h):
                    record_id = h
            if not record_id:
                record_id = v[0]

    if not record_id:
        return fail("No id property was found")

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, record_id)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
def selid(body, ctype, prop='handle'):
    '''
    Service that accepts a JSON document and sets its "_id" property from
    the value of the property named by the "prop" parameter

    body - JSON text of the document
    ctype - content type of the request (not used here)
    prop - path of the property holding the identifier

    A string value is used directly.  For other values the last absolute
    entry is used, falling back to the first entry.  "_id" is built from
    the request's Source header and the identifier.

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when no prop is given, the body is not
    valid JSON or no identifier is found.
    '''
    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    if not prop:
        # Remove this document
        return fail("No id property has been selected")

    try:
        data = json.loads(body)
    except:
        return fail("Unable to parse body as JSON")

    source_name = copy_headers_to_dict(request.environ).get('Source')

    record_id = None
    if exists(data, prop):
        value = getprop(data, prop)
        if isinstance(value, basestring):
            record_id = value
        elif value:
            for candidate in (value if isinstance(value, list) else [value]):
                if is_absolute(candidate):
                    record_id = candidate
            record_id = record_id or value[0]

    if not record_id:
        return fail("No id property was found")

    # Only "_id" is set; the hashed "id" property is not used
    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, record_id)
    return json.dumps(data)
def pipe(content, ctype, enrichments, wsgi_header):
    """
    Push ``content`` through each enrichment service in turn.

    content - JSON-serializable document; it is serialized once and used as
              the body of the first POST
    ctype - value sent as the content-type header of every POST
    enrichments - iterable of service URIs; empty entries are skipped and
                  relative ones are resolved against the scheme and host of
                  the current request
    wsgi_header - header name handed to copy_headers_to_dict in its
                  ``exclude`` list

    Returns the output of the last step that answered with a 2xx status
    (the serialized input if none did).  A failing step is logged and
    skipped, so the following steps still run on the previous body.
    """
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline

        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://'
            if request.environ.get('HTTP_HOST'):
                prefix += request.environ['HTTP_HOST']
            else:
                prefix += request.environ['SERVER_NAME']
            # The path must start with "/", otherwise it would be glued
            # straight onto the host name (http://hostpath).
            if not uri.startswith('/'):
                uri = '/' + uri
            uri = prefix + uri

        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s" % (uri, repr(resp)))
            continue

        body = cont
    return body
def selid(body, ctype, prop='handle'):
    '''
    Service that accepts a JSON document and sets its "_id" and "id"
    properties from the value of the property named by the "prop" parameter

    body - JSON text of the document
    ctype - content type of the request (not used here)
    prop - path of the property holding the identifier

    A string value is used directly.  For other values the last absolute
    entry is used, falling back to the first entry.  "_id" is built from
    the request's Source header and the identifier; "id" is the MD5 hex
    digest of "_id".

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when no prop is given, the body is not
    valid JSON or no identifier is found.
    '''
    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    if not prop:
        # Remove this document
        return fail("No id property has been selected")

    try:
        data = json.loads(body)
    except:
        return fail("Unable to parse body as JSON")

    headers = copy_headers_to_dict(request.environ)
    source_name = headers.get('Source')

    found = None
    if exists(data, prop):
        value = getprop(data, prop)
        if isinstance(value, basestring):
            found = value
        elif value:
            entries = value if isinstance(value, list) else [value]
            for entry in entries:
                if is_absolute(entry):
                    found = entry
            if not found:
                found = value[0]

    if not found:
        return fail("No id property was found")

    couch_id = COUCH_REC_ID_BUILDER(source_name, found)
    data[u'_id'] = couch_id
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()

    return json.dumps(data)
def update_resource(self, path=None):
    '''
    Update a resource based on WSGI environment or a uri path

    path - optional document id, or absolute URI which is made relative to
           self.remotedb; when omitted the id is taken from the part of
           PATH_INFO that follows self.space_tag

    The new content is read from wsgi.input as JSON.  If it carries no
    "_rev", the current revision is fetched with a GET and added to the
    body before the PUT.

    Returns the content of the PUT response, or '' when the response status
    is neither 2xx nor 304.
    '''
    if path:
        docid = path
        if is_absolute(path):
            docid = relativize(path, self.remotedb)
    else:
        docid = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1].lstrip('/') #e.g. '/mydb/MyDoc' -> 'MyDoc'

    if logger: logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))

    body = self.environ['wsgi.input'].read()

    # If the document already exists, we need to determine its current rev and add it to the
    # input body, skipping the process if rev is provided in the PUT request body
    body_js = json.loads(body)
    rev = body_js.get('_rev', None)
    if not rev:
        # Need to GET the rev
        resp, content = self.h.request(join(self.remotedb, docid), "GET")
        if str(resp.status).startswith('2'):
            rev = json.loads(content).get('_rev', None)

        # Guarded like every other log call here, since logger may be unset
        if logger: logger.debug('update_resource: found existing rev = '+repr(rev))

        if rev:
            body_js['_rev'] = rev
            body = json.dumps(body_js)

    headers = {'content-type': self.environ['CONTENT_TYPE']}
    resp, content = self.h.request(join(self.remotedb, docid), "PUT", body=body, headers=headers)

    if logger: logger.debug('resp ' + repr((content[:100], resp)))

    self.prep_slave_response(resp)

    if not (self.resp_status.startswith('2') or self.resp_status.startswith('304')):
        if logger: logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
        return '' #The update did not succeed

    return content
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Given a link found on a wiki page, work out the absolute URL of the
    target on the original wiki and on its REST wrapper

    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came,
        defaults to same as wrapped_base
    raw - the link is a full hierarchical path, rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link), or (None, None) if the link
    does not lead into the wiki

    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL

    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    if raw and not is_absolute(link):
        # A raw link carries the wiki's own path as a prefix; drop as many
        # characters as that path is long
        (scheme, authority, base_path, query, fragment) = split_uri_ref(original_base)
        link = link[len(base_path):]

    wiki_root = original_base.rstrip('/') + '/'

    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b note the need to strip the leading / to get that
    abs_link = absolutize(link.lstrip('/'), wiki_root)

    rel_to_wikibase = relativize(abs_link, wiki_root)
    if not rel_to_wikibase:
        #It's not a relative wiki link
        return None, None

    wrapped_root = wrapped_base.rstrip('/') + '/'
    return absolutize(rel_to_wikibase, wrapped_root), abs_link
def wiki_uri(original_base, wrapped_base, link, relative_to=None, raw=False):
    '''
    Given a link found on a wiki page, work out the absolute URL of the
    target on the original wiki and on its REST wrapper

    original_base - The base URI of the actual Moin instance
    wrapped_base - The base URI of the REST-wrapped proxy of the Moin instance
    link - the relative link, generally from one wiki page to another
    relative_to - the REST-wrapped version of the page from which the relative link came,
        defaults to same as wrapped_base
    raw - the link is a full hierarchical path, rather than relative to the wiki base

    Returns a tuple (wrapped_uri, abs_link), or (None, None) if the link
    does not lead into the wiki

    wrapped_uri - the URI wrapped for REST ops
    abs_link - the full, original wiki URL

    >>> from akara.util.moin import wiki_uri
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/spam')
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam')
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', 'http://google.com/spam', raw=True)
    (None, None)
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam', raw=True)
    ('http://localhost:8880/moin/w/spam', 'http://example.com/mywiki/spam')
    >>> wiki_uri('http://example.com/mywiki/', 'http://localhost:8880/moin/w/', '/mywiki/spam')
    ('http://localhost:8880/moin/w/mywiki/spam', 'http://example.com/mywiki/mywiki/spam')
    '''
    original_root = original_base.rstrip('/')+'/'
    wrapped_root = wrapped_base.rstrip('/')+'/'

    if raw and not is_absolute(link):
        # A raw link carries the wiki's own path as a prefix; drop as many
        # characters as that path is long
        (scheme, authority, wiki_path, query, fragment) = split_uri_ref(original_base)
        link = link[len(wiki_path):]

    #e.g. original wiki base is http://myhost:8080/mywiki/ and link is /a/b
    #abs_link is http://myhost:8080/mywiki/a/b note the need to strip the leading / to get that
    link = link.lstrip('/')
    abs_link = absolutize(link, original_root)
    rel_to_wikibase = relativize(abs_link, original_root)

    if rel_to_wikibase:
        rest_uri = absolutize(rel_to_wikibase, wrapped_root)
        return rest_uri, abs_link

    #It's not a relative wiki link
    return None, None
def selectid(body, ctype):
    '''
    Service that accepts a JSON document and derives its "_id", "id",
    "isShownAt" and "isShownBy" properties from its "identifier" property

    body - JSON text of the document
    ctype - content type of the request (not used here)

    A string identifier is used directly.  Otherwise the last entry whose
    'text' starts with http://ark.cdlib.org/ark: and is absolute is used,
    falling back to the first entry.  "_id" is built from the request's
    Source header and the identifier, "id" is the MD5 hex digest of "_id",
    "isShownAt" is the identifier and "isShownBy" is the identifier plus
    '/thumbnail'.

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when the body is not valid JSON or no
    identifier is found.
    '''
    def fail(message):
        # All failures are reported as a plain-text HTTP 500
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return message

    try:
        data = json.loads(body)
    except:
        return fail("Unable to parse body as JSON")

    source_name = copy_headers_to_dict(request.environ).get('Source')

    objid = None
    identifier = getprop(data, 'identifier')
    if isinstance(identifier, basestring):
        objid = identifier
    elif identifier:
        entries = identifier if isinstance(identifier, list) else [identifier]
        for entry in entries:
            text = entry['text']
            if text.startswith('http://ark.cdlib.org/ark:') and is_absolute(text):
                objid = text
        objid = objid or identifier[0]

    if not objid:
        return fail("No id property was found")

    data[u'_id'] = COUCH_REC_ID_BUILDER(source_name, objid)
    data[u'id'] = hashlib.md5(data[u'_id']).hexdigest()
    data[u'isShownAt'] = objid
    data[u'isShownBy'] = objid + '/thumbnail'

    return json.dumps(data)
def pipe(content, ctype, enrichments, wsgi_header):
    """
    Push ``content`` through each enrichment service in turn.

    content - JSON-serializable document; it is serialized once and used as
              the body of the first POST
    ctype - value sent as the content-type header of every POST
    enrichments - iterable of service URIs; empty entries are skipped and
                  relative ones are resolved against the scheme and host of
                  the current request
    wsgi_header - header name handed to copy_headers_to_dict in its
                  ``exclude`` list

    Returns the output of the last step that answered with a 2xx status
    (the serialized input if none did).  A failing step is logged and
    skipped, so the following steps still run on the previous body.
    """
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline

        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://'
            if request.environ.get('HTTP_HOST'):
                prefix += request.environ['HTTP_HOST']
            else:
                prefix += request.environ['SERVER_NAME']
            # The path must start with "/", otherwise it would be glued
            # straight onto the host name (http://hostpath).
            if not uri.startswith('/'):
                uri = '/' + uri
            uri = prefix + uri

        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s" % (uri, repr(resp)))
            continue

        body = cont
    return body
def delete_resource(self, path=None):
    '''
    Delete a resource based on WSGI environment or a uri path

    path - optional document id, or absolute URI which is made relative to
           self.remotedb; when omitted the id is taken from the part of
           PATH_INFO that follows self.space_tag

    Returns the content of the DELETE response, or '' when the response
    status is neither 2xx nor 304.
    '''
    if not path:
        #e.g. '/mydb/MyDoc' -> 'MyDoc'
        trailing = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1]
        docid = trailing.lstrip('/')
    elif is_absolute(path):
        docid = relativize(path, self.remotedb)
    else:
        docid = path

    if logger:
        logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))

    resp, content = self.h.request(join(self.remotedb, docid), "DELETE")

    if logger:
        logger.debug('resp ' + repr((content[:100], resp)))

    self.prep_slave_response(resp)

    if self.resp_status.startswith(('2', '304')):
        return content

    if logger:
        logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
    return '' #The request did not succeed
def resource_factory(self, path=None):
    '''
    Look up and retrieve a new resource based on WSGI environment or a uri path

    path - optional document id, or absolute URI which is made relative to
           self.remotedb; when omitted the id is taken from the part of
           PATH_INFO that follows self.space_tag

    Returns the object built by resource.factory from the JSON content of
    the response, or '' when the response status is neither 2xx nor 304.
    '''
    if not path:
        #e.g. '/mydb/MyDoc' -> 'MyDoc'
        trailing = self.environ['PATH_INFO'].lstrip('/').rsplit(self.space_tag, 1)[1]
        docid = trailing.lstrip('/')
    elif is_absolute(path):
        docid = relativize(path, self.remotedb)
    else:
        docid = path

    if logger:
        logger.debug('query ' + repr((self.remotedb, docid, join(self.remotedb, docid))))

    # The document id is URL-quoted for the request itself
    resp, content = self.h.request(join(self.remotedb, urllib.quote_plus(docid)))

    if logger:
        logger.debug('resp ' + repr((content[:100], resp)))

    self.prep_slave_response(resp)

    if not self.resp_status.startswith(('2', '304')):
        if logger:
            logger.debug("Error looking up resource: %s: %s\n" % (content, self.resp_status))
        return '' #No resource could be retrieved

    return resource.factory(self, docid, json.loads(content))
def contentdm_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding a field to a document with the URL where we
    should expect to the find the thumbnail.

    There are two methods of creating the thumbnail URL:
    1. Replacing "cdm/ref" with "utils/getthumbnail" in the handle field
       Example:
           handle: http://test.provider/cdm/ref/collection/1/id/1
           thumbnail: http://test.provider/utils/getthumbnail/collection/1/id/1

    2. Splitting the handle field on "u?" and using the parts to compose
       the thumbnail URL.
       Example:
            handle: http://test.provider/u?/ctm,101
            thumbnail: http://test.provider/cgi-bin/thumbnail.exe?CISOROOT=/ctm&CISOPTR=101

    body - JSON text of the document
    ctype - content type of the request (not used here)
    download - "True" to mark the object status as PENDING, anything else
               leaves it as IGNORE

    Returns the updated document as JSON text.  When the handle field is
    missing, has no absolute URL or does not have the expected shape, the
    problem is logged and body is returned unchanged.  When body is not
    valid JSON an error message is returned with the response code set to
    500.
    """
    try:
        data = json.loads(body)
    except (ValueError, TypeError):
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    handle_field = "originalRecord/handle"
    if not exists(data, handle_field):
        logger.error("Field %s does not exist" % handle_field)
        return body

    url = None
    handle = getprop(data, handle_field)
    for h in (handle if not isinstance(handle, basestring) else [handle]):
        if is_absolute(h):
            url = h
            break
    if not url:
        logger.error("There is no URL in %s." % handle_field)
        return body

    if "cdm/ref" in url:
        thumbnail_url = url.replace("cdm/ref", "utils/getthumbnail")
    else:
        p = url.split("u?")
        if len(p) != 2:
            logger.error("Bad URL %s. It should have just one 'u?' part." % url)
            return body

        (base_url, rest) = p
        if base_url == "" or rest == "":
            logger.error("Bad URL: %s. There is no 'u?' part." % url)
            return body

        p = rest.split(",")
        if len(p) != 2:
            # The concatenated message is parenthesized so that % applies to
            # the whole of it; unparenthesized, % bound only to the second
            # literal (which has no placeholder) and raised TypeError.
            logger.error(
                ("Bad URL %s. Expected two parts at the end, used " +
                 "in thumbnail URL for CISOROOT and CISOPTR.") % url
            )
            return body

        # Thumb url field.
        thumbnail_url = "%scgi-bin/thumbnail.exe?CISOROOT=%s&CISOPTR=%s" % (base_url, p[0], p[1])

    data["object"] = thumbnail_url

    status = IGNORE
    if download == "True":
        status = PENDING

    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def source_transform(d):
    """
    Pick the record's source URL from its "handle" field.

    d - mapping whose "handle" entry is iterated item by item

    Returns {"source": url} where url is the last item for which
    is_absolute() is true, or "" when there is none.
    """
    source = ""
    for candidate in d["handle"]:
        if not is_absolute(candidate):
            continue
        # No break: a later absolute item replaces an earlier one
        source = candidate
    return {"source": source}
def enrichformat(body, ctype, action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by:

    a) Lowercasing and stripping each value (an absolute http URL is first
       reduced to its file extension)
    b) Running it through a set of cleanup regexes
       (e.g. image/jpg -> image/jpeg)
    c) Removing any extra text after the first whitespace
    d) Checking whether the result starts with one of the IMT top-level
       types followed by "/"
       (see http://www.iana.org/assignments/media-types)
    e) Moving IMT values to hasView/format if hasView exists and its format
       is not set; other values stay in the format field, not cleaned
    f) Setting the type field from the IMT values, if it is not set, by
       mapping the part before the "/" (audio -> sound, video -> moving
       image, ...)

    A field that ends up with a single value is stored as that value rather
    than as a list; the format field is deleted when nothing is left in it.

    By default works on the 'sourceResource/format' field but can be
    overridden by passing the name of the field to use as the 'prop'
    parameter.

    Returns the updated document as JSON text, or an error message (with
    the response code set to 500) when body is not valid JSON.
    """
    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    # (pattern, replacement) pairs, applied in this order
    REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'), \
              ('image/jpg', 'image/jpeg'), ('image/jp$', 'image/jpeg'), \
              ('img/jpg', 'image/jpeg'), ('^jpeg$', 'image/jpeg'), \
              ('^jpg$', 'image/jpeg'), ('\W$', '')

    IMT_TYPES = [
        'application', 'audio', 'image', 'message', 'model', 'multipart',
        'text', 'video'
    ]
    imt_prefixes = [re.compile('^' + x + '(/)') for x in IMT_TYPES]

    def get_ext(s):
        # Extension without the dot, or "" if there is none
        pieces = os.path.splitext(s)[1].split('.')
        if len(pieces) == 2:
            return pieces[1]
        return ""

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            # Keep only the leading token when extra text follows it
            s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        logger.debug("Checking: " + s)
        for regex in imt_prefixes:
            if regex.match(s):
                return True
        return False

    try:
        data = json.loads(body)
    except Exception as e:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON\n" + str(e)

    imt_values = []
    if exists(data, prop):
        raw = getprop(data, prop)
        kept = []
        view_formats = []

        for item in ([raw] if isinstance(raw, basestring) else raw):
            if item.startswith("http") and is_absolute(item):
                item = get_ext(item)

            cleaned = cleanup(item)
            if not is_imt(cleaned):
                # Retain non-IMT values in sourceResource/format, non-cleaned
                if item not in kept:
                    kept.append(item)
                continue

            # Collected for use in type
            imt_values.append(cleaned)
            # Move IMT values to hasView/format else discard
            if exists(data, "hasView") and \
               not exists(data, "hasView/format") and \
               cleaned not in view_formats:
                view_formats.append(cleaned)

        if kept:
            setprop(data, prop, kept[0] if len(kept) == 1 else kept)
        else:
            delprop(data, prop)

        if view_formats:
            setprop(data, "hasView/format",
                    view_formats[0] if len(view_formats) == 1 else view_formats)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        types = []
        for imt in imt_values:
            mapped = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if mapped and mapped not in types:
                types.append(mapped)

        if types:
            setprop(data, type_field, types[0] if len(types) == 1 else types)

    return json.dumps(data)
def map_is_shown_at(self):
    """
    Set "isShownAt" in the mapped data to the first absolute URI found in
    the provider record's "handle" value.

    The handle value is passed through iterify() so it can be looped over.
    Empty or None entries are skipped. If no entry is an absolute URI,
    mapped_data is left unchanged.
    """
    for h in iterify(self.provider_data.get("handle")):
        # Guard against None/empty entries (e.g. a missing "handle" key
        # yields None from .get()) before handing the value to
        # is_absolute(), which expects a string.
        if h and is_absolute(h):
            self.mapped_data.update({"isShownAt": h})
            break
def __new__(cls, arg, uri=None, encoding=None, resolver=None, sourcetype=0):
    """
    Build an input source from one of several kinds of argument.

    arg - a string, Unicode object (only if you really know what you're doing),
          file-like object (stream), file path or URI.  A urllib2.Request
          is accepted as well.  You can also pass an InputSource object, in
          which case that same object is returned as-is, before any of the
          other arguments are looked at
    uri - optional override URI.  It is used as the base URI only when arg
          is a stream or literal XML text (a generated UUID URN is used if
          it is omitted).  For a urllib2.Request, an absolute URI or a file
          path the URI is derived from arg and this value is ignored
    encoding - passed through unchanged to InputSource.__new__
    resolver - object whose resolve() method opens Request objects and
          URIs; defaults to amara.lib.irihelpers.DEFAULT_RESOLVER
    sourcetype - when equal to XMLSTRING, arg is treated as literal XML
          text without consulting isxml()

    Returns an input source which can be passed to Amara APIs.

    Raises ValueError if arg is an empty string, or if arg is not recognized
    as a stream, XML text or absolute URI and is too long
    (MAX_URI_LENGTH_FOR_HEURISTIC characters or more) to be tried as a
    file path.
    """
    #Do the imports within the function to avoid circular imports
    #from amara._xmlstring import IsXml as isxml
    #NOTE(review): isxml is used below but its import above is commented
    #out; presumably it is provided at module scope -- confirm
    #These imports are tucked in here because amara.lib.iri is an expensive import
    from amara.lib.iri import is_absolute, os_path_to_uri
    from amara.lib.irihelpers import DEFAULT_RESOLVER
    resolver = resolver or DEFAULT_RESOLVER
    #An existing InputSource is handed back untouched; uri, encoding and
    #sourcetype are not applied to it
    if isinstance(arg, InputSource):
        return arg

    #if arg == (u'', ''): -> UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
    if arg == '':
        #FIXME L10N
        raise ValueError("Cannot parse an empty string as XML")

    #The order of the checks below matters: each branch is only reached
    #when all of the earlier tests have failed
    if isinstance(arg, urllib2.Request):
        uri = arg.get_full_url() #One of the rightly labeled "lame" helper methods in urllib2 ;)
        stream = resolver.resolve(arg)
    elif hasattr(arg, 'read'):
        #Anything with a read attribute is used directly as the stream
        #Create dummy Uri to use as base
        uri = uri or uuid4().urn
        stream = arg
    #XXX: Should we at this point refuse to proceed unless it's a basestring?
    elif sourcetype == XMLSTRING or isxml(arg):
        #See this article about XML detection heuristics
        #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
        uri = uri or uuid4().urn
        stream = StringIO(arg)
    elif is_absolute(arg) and not os.path.isfile(arg):
        #An absolute URI that is not also the name of an existing file
        uri = arg
        stream = resolver.resolve(uri)
    #If the arg is beyond a certain length, don't even try it as a URI
    elif len(arg) < MAX_URI_LENGTH_FOR_HEURISTIC:
        #Treat arg as an OS file path and convert it to a URI
        uri = os_path_to_uri(arg)
        stream = resolver.resolve(uri)
    else:
        #FIXME L10N
        raise ValueError("Does not appear to be well-formed XML")

    #We might add the ability to load zips, gzips & bzip2s
    #http://docs.python.org/lib/module-zlib.html
    #http://docs.python.org/lib/module-gzip.html
    #http://docs.python.org/lib/module-bz2.html
    #http://docs.python.org/lib/zipfile-objects.html

    #InputSource.__new__ is in C: expat/input_source.c:inputsource_new
    return InputSource.__new__(cls, stream, uri, encoding)
def contentdm_identify_object(body, ctype, download="True"):
    """
    Responsible for: adding an "object" field to a document with the URL
    where we should expect to find the thumbnail, and recording the
    thumbnail status in "admin/object_status".

    The thumbnail URL is built from the first absolute URL found in the
    "originalRecord/handle" field. There are two methods of creating it:

    1. Replacing "cdm/ref" with "utils/getthumbnail" in the handle field

        Example:
            handle:    http://test.provider/cdm/ref/collection/1/id/1
            thumbnail: http://test.provider/utils/getthumbnail/collection/1/id/1

    2. Splitting the handle field on "u?" and using the parts to compose
       the thumbnail URL.

        Example:
            handle:    http://test.provider/u?/ctm,101
            thumbnail: http://test.provider/cgi-bin/thumbnail.exe?CISOROOT=/ctm&CISOPTR=101

    Arguments:
        body     - the JSON document, as a string
        ctype    - content type of the body (not used by this function)
        download - the string "True" sets the object status to PENDING;
                   any other value sets it to IGNORE

    Returns the enriched document serialized as JSON. If the handle field
    is missing, holds no absolute URL, or does not have one of the two
    expected shapes, the problem is logged and the original body is
    returned unchanged. If the body cannot be parsed, the response code is
    set to 500 and a plain-text message is returned.
    """
    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    handle_field = "originalRecord/handle"
    if not exists(data, handle_field):
        logger.error("Field %s does not exist" % handle_field)
        return body

    handle = getprop(data, handle_field)
    handles = [handle] if isinstance(handle, basestring) else handle

    # Use the first absolute URL; None/empty entries are skipped rather
    # than being handed to is_absolute().
    url = None
    for h in handles:
        if h and is_absolute(h):
            url = h
            break
    if not url:
        logger.error("There is no URL in %s." % handle_field)
        return body

    if "cdm/ref" in url:
        thumbnail_url = url.replace("cdm/ref", "utils/getthumbnail")
    else:
        parts = url.split("u?")
        if len(parts) != 2:
            logger.error("Bad URL %s. It should have just one 'u?' part."
                         % url)
            return body

        base_url, rest = parts
        if base_url == "" or rest == "":
            logger.error("Bad URL: %s. There is no 'u?' part." % url)
            return body

        ciso = rest.split(",")
        if len(ciso) != 2:
            # The message is one literal so that "%" applies to all of it.
            # Formatting only the second half of a "+"-joined message
            # raised TypeError, because that half has no placeholder.
            logger.error("Bad URL %s. Expected two parts at the end, used "
                         "in thumbnail URL for CISOROOT and CISOPTR." % url)
            return body

        # Thumb url field.
        thumbnail_url = "%scgi-bin/thumbnail.exe?CISOROOT=%s&CISOPTR=%s" % \
            (base_url, ciso[0], ciso[1])

    data["object"] = thumbnail_url

    status = PENDING if download == "True" else IGNORE
    if "admin" in data:
        data["admin"]["object_status"] = status
    else:
        data["admin"] = {"object_status": status}

    return json.dumps(data)
def map_is_shown_at(self):
    """
    Record the first non-empty, absolute "handle" value of the provider
    data source as "isShownAt" in the mapped data.

    Nothing is written when no handle value qualifies.
    """
    handles = iterify(self.provider_data_source.get("handle"))
    # Lazily pick the first qualifying handle; later ones are not examined.
    shown_at = next(
        (candidate for candidate in handles
         if candidate and is_absolute(candidate)),
        None
    )
    if shown_at is not None:
        self.mapped_data.update({"isShownAt": shown_at})
def enrichformat(body, ctype, action="enrich-format",
                 prop="sourceResource/format",
                 type_field="sourceResource/type"):
    """
    Service that accepts a JSON document and enriches the "format" field of
    that document by:

    a) Setting the format to be all lowercase
    b) Running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
    c) Checking to see if the field is a valid IMT
       See http://www.iana.org/assignments/media-types for list of valid
       media-types. A value counts as an IMT when it starts with one of the
       known top-level types followed by "/".
    d) Removing any extra text after the IMT
    e) Moving valid IMT values to hasView/format if hasView exists and
       its format is not set; otherwise valid IMT values are discarded
    f) Setting the type field from the format field, if it is not set.
       The top-level type of every valid IMT value is looked up in
       FORMAT_2_TYPE_MAPPINGS and the distinct results are stored.

    Values that are not IMTs are kept in the format field. A value that is
    an absolute "http..." URL is first replaced by its file extension.
    Empty/None entries of a list-valued format are ignored.

    By default works on the 'sourceResource/format' field but can be
    overridden by passing the name of the field to use as the 'prop'
    parameter.

    Returns the enriched document serialized as JSON. If the body cannot be
    parsed, the response code is set to 500 and a plain-text message is
    returned instead.
    """

    FORMAT_2_TYPE_MAPPINGS = {
        "audio": "sound",
        "image": "image",
        "video": "moving image",
        "text": "text"
    }

    # (pattern, replacement) pairs applied in this order by cleanup().
    REGEXPS = (
        ('audio/mp3', 'audio/mpeg'),
        ('images/jpeg', 'image/jpeg'),
        ('image/jpg', 'image/jpeg'),
        ('image/jp$', 'image/jpeg'),
        ('img/jpg', 'image/jpeg'),
        ('^jpeg$', 'image/jpeg'),
        ('^jpg$', 'image/jpeg'),
        (r'\W$', ''),
    )

    IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
                 'multipart', 'text', 'video']

    # A single alternation, compiled once per call, instead of compiling one
    # pattern per top-level type for every value that is checked.
    imt_regex = re.compile('^(?:' + '|'.join(IMT_TYPES) + ')/')

    def get_ext(s):
        # splitext() yields "" or ".ext"; drop the leading dot.
        return os.path.splitext(s)[1][1:]

    def cleanup(s):
        s = s.lower().strip()
        for pattern, replace in REGEXPS:
            s = re.sub(pattern, replace, s)
            # Cut everything after the first whitespace on every pass so
            # that the anchored patterns further down see the bare token.
            s = re.sub(r"^([a-z0-9/]+)\s.*", r"\1", s)
        return s

    def is_imt(s):
        return imt_regex.match(s) is not None

    try:
        data = json.loads(body)
    except Exception:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    imt_values = []
    if exists(data, prop):
        value = getprop(data, prop)
        formats = []
        hasview_format = []

        # A bare string is a single value. For a list, None/empty entries
        # are dropped up front, so no further None check is needed below.
        if isinstance(value, basestring):
            candidates = [value]
        else:
            candidates = [item for item in value if item]

        for s in candidates:
            if s.startswith("http") and is_absolute(s):
                s = get_ext(s)

            cleaned = cleanup(s)
            if is_imt(cleaned):
                # Append to imt_values for use in type
                imt_values.append(cleaned)
                # Move IMT values to hasView/format else discard
                if exists(data, "hasView") and not \
                   exists(data, "hasView/format") and \
                   cleaned not in hasview_format:
                    hasview_format.append(cleaned)
            elif s not in formats:
                # Retain non-IMT values in the format field, non-cleaned
                formats.append(s)

        if formats:
            # A single remaining value is stored as a scalar, not a list.
            setprop(data, prop,
                    formats[0] if len(formats) == 1 else formats)
        else:
            delprop(data, prop)

        if hasview_format:
            setprop(data, "hasView/format",
                    hasview_format[0] if len(hasview_format) == 1
                    else hasview_format)

    # Setting the type if it is empty.
    if not exists(data, type_field) and imt_values:
        types = []
        for imt in imt_values:
            t = getprop(FORMAT_2_TYPE_MAPPINGS, imt.split("/")[0], True)
            if t and t not in types:
                types.append(t)

        if types:
            setprop(data, type_field,
                    types[0] if len(types) == 1 else types)

    return json.dumps(data)