def copy_auth(environ, top, realm=None):
    '''
    Get auth creds (HTTP basic only, for now) from the incoming request and
    return an HTTP auth handler for urllib2. This handler allows you to
    "forward" this auth to remote services.

    environ - The usual WSGI structure. Note: if you are using simple_service,
              in Akara services available as akara.request.environ, or perhaps
              passed right into the handler
    top - top URL to be used for this auth.
    '''
    #Useful: http://www.voidspace.org.uk/python/articles/authentication.shtml
    creds = extract_auth(environ)
    if creds:
        username, password = creds
    else:
        return None

    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    # HTTPPasswordMgr top must omit any URL components before the host
    # (i.e. no scheme and no auth info in the authority section)
    #(scheme, authority, path, query, fragment) = split_uri_ref(top)
    #auth, host, port = split_authority(authority)
    #auth_top_url = (host + ':' + port if port else host) + path
    #print >> sys.stderr, 'Auth creds: %s:%s (%s)'%(username, password, auth_top_url)
    logger.debug('Auth creds: %s:%s (%s)' % (username, password, top))

    # Not setting the realm for now, so use None
    #password_mgr.add_password(None, auth_top_url, username, password)
    password_mgr.add_password(None, top, username, password)
    #password_handler = urllib2.HTTPDigestAuthHandler(password_mgr)
    password_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    return password_handler

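# Usage sketch (assumption, not part of the original module): install the
# returned handler into a urllib2 opener so outgoing requests reuse the
# incoming request's credentials. The upstream URL below is hypothetical.
def _example_forward_auth(environ):
    handler = copy_auth(environ, 'http://upstream.example.org/')
    opener = urllib2.build_opener(handler) if handler else urllib2.build_opener()
    return opener.open('http://upstream.example.org/private/resource').read()
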
def kentucky_identify_object(body, ctype, download="True"): """ Responsible for: adding a field to a document with the URL where we should expect to the find the thumbnail """ data = {} try: data = json.loads(body) except: response.code = 500 response.add_header('content-type', 'text/plain') return "Unable to parse body as JSON" relation_field = "sourceResource/relation" if exists(data, relation_field): url = getprop(data, relation_field) else: logger.debug("Field %s does not exist" % relation_field) return body base_url, ext = os.path.splitext(url) data["object"] = "%s_tb%s" % (base_url, ext) status = IGNORE if download == "True": status = PENDING if "admin" in data: data["admin"]["object_status"] = status else: data["admin"] = {"object_status": status} return json.dumps(data)
def find_file_extension(mime):
    """
    Finds the file extension based on the MIME type from the opened
    connection.

    Implementation:
        Uses the configuration field 'mime_to_type' stored in akara.conf.

    Arguments:
        mime (String) - MIME type read from the HTTP headers

    Returns:
        file extension (String) - extension for the file, with a leading dot

    Throws:
        FileExtensionException if it cannot find the extension
    """
    if mime in MIME_TYPES:
        ext = MIME_TYPES[mime]
        logger.debug("MIME type is [%s], returning extension [%s]" %
                     (mime, ext))
        return ext
    else:
        msg = "Cannot find extension for mime type: [%s]." % mime
        logger.error(msg)
        raise FileExtensionException(msg)

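# Illustrative assumption (not the actual akara.conf contents): MIME_TYPES is a
# plain dict built from the 'mime_to_type' setting, keyed by MIME type, e.g.
#
#   MIME_TYPES = {"image/jpeg": ".jpg", "image/png": ".png"}
#   find_file_extension("image/jpeg")   # -> ".jpg"
#   find_file_extension("text/weird")   # raises FileExtensionException
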
def freemix(body, ctype, maxcount=None, diagnostics=None):
    '''
    Render the contents of a file as best as possible in Exhibit JSON
    * Supports Excel, BibTex and JSON for now

    Sample queries:
    * curl --request POST --data-binary @- http://localhost:8880/freemix.json?diagnostics=yes < test/data/load/iraq.xml
    * curl --request POST --data-binary @- http://localhost:8880/freemix.json < test/data/load/iraq.xml
    * curl --request POST --data-binary "@foo.xls" --header "Content-Type: application/vnd.ms-excel" "http://*****:*****@foo.xls" --header "Content-Type: application/msword" "http://localhost:8880/freemix.json"
    '''
    #FIXME: OK enough tower-of-pisa code. Use more functions
    #DIAGNOSTICS config no longer used at all
    #if diagnostics is None:
    #    diagnostics = DIAGNOSTICS
    #else:
    diagnostics = diagnostics == u'yes'
    logger.debug('diagnostics: ' + repr(diagnostics))
    fixup_obj_labels = True
    imt_saved = imt = guess_imt_(body, ctype)
    #logger.debug("IMT: " + imt)
    ss_data = None
    diag_info = []
    if imt == UNKNOWN_IMT:
        try:
            source = speadsheet.read(body)
            ss_data = [ row for row in source.rows() ]
            imt = EXCEL_IMTS[0]
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            pass

def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    error = None
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ["wsgi.url_scheme"] + "://"
            if request.environ.get("HTTP_HOST"):
                prefix += request.environ["HTTP_HOST"]
            else:
                prefix += request.environ["SERVER_NAME"]
            # Join the prefix and given pipeline module path, ensuring the
            # path starts with "/".
            uri = prefix + re.sub(r"^(?!/)", "/", uri)
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers["content-type"] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, "POST", body=body, headers=headers)
        if not str(resp.status).startswith("2"):
            error = "Error in enrichment pipeline at %s" % uri
            logger.error(error)
            continue
        body = cont
    return error, body

def set_field_from_value_mode(data, field, mode, value, multivalue=True):
    '''Set the value for the data "field" from data in collection ckey field
    with the value passed in.
    '''
    logger.debug('Field:{} mode:{} value:{} mv:{}'.format(field, mode, value,
                                                          multivalue))
    if value:  # no value, don't bother
        if mode == 'overwrite':
            if exists(data, field):
                setprop(data, field, value)
            else:
                pp, pn = tuple(field.lstrip('/').split('/', 1))
                if not pp in data:
                    data[pp] = {}
                data[pp][pn] = value
        elif mode == 'append':
            new_value = []
            if exists(data, field):
                old_value = getprop(data, field)
                if isinstance(old_value, list):
                    new_value.extend(old_value)
                else:
                    new_value.append(old_value)
            if isinstance(value, list):
                new_value.extend(value)
            else:
                new_value.append(value)
            setprop(data, field, new_value)
        else:  # fill blanks
            if not exists(data, field) or not getprop(data, field,
                                                      keyErrorAsNone=True):
                if multivalue and not isinstance(value, list):
                    value = [value]
                setprop(data, field, value)
    return data

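# Usage sketch (assumption: exists/getprop/setprop are the usual selector
# helpers and the record below is hypothetical). Any mode other than
# "overwrite" and "append" falls through to the fill-blanks branch.
#
#   record = {"sourceResource": {"title": "Old title"}}
#   set_field_from_value_mode(record, "sourceResource/title", "overwrite", "New title")
#   set_field_from_value_mode(record, "sourceResource/subject", "append", ["Maps"])
#   set_field_from_value_mode(record, "sourceResource/type", "fill", "image")
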
def kentucky_identify_object(body, ctype, download="True"): """ Responsible for: adding a field to a document with the URL where we should expect to the find the thumbnail """ data = {} try: data = json.loads(body) except: response.code = 500 response.add_header("content-type", "text/plain") return "Unable to parse body as JSON" relation_field = "sourceResource/relation" if exists(data, relation_field): url = getprop(data, relation_field) else: logger.debug("Field %s does not exist" % relation_field) return body base_url, ext = os.path.splitext(url) data["object"] = "%s_tb%s" % (base_url, ext) status = IGNORE if download == "True": status = PENDING if "admin" in data: data["admin"]["object_status"] = status else: data["admin"] = {"object_status": status} return json.dumps(data)
def add_ejson_profile(data, fixup_obj_labels=True):
    objkeys = dict([ (k, k) for obj in data for k in obj ])
    #FIXME: reduce from 3 full passes through obj to 2 (don't think we can go lower than 2)
    for k in objkeys.keys():  # iterate over a copy; entries may be deleted below
        kcount = reduce(lambda count, obj, k=k: count + int(k in obj), data, 0)
        logger.debug("Key usage count %s: %i" % (k, kcount))
        if not kcount:
            del objkeys[k]

    logger.debug("Modified data profile keys: " + repr(objkeys))

    if fixup_obj_labels:
        for obj in data:
            for k in obj.keys():  # iterate over a copy; keys may be renamed below
                #Yes we could receive non-string "labels"
                if not isinstance(k, basestring):
                    k = str(k)
                new_k = UNSUPPORTED_IN_EXHIBITKEY.sub('_', k)
                if not new_k or new_k[0].isdigit():
                    new_k = '_' + new_k
                if k != new_k:
                    objkeys[new_k] = k
                    try:
                        del objkeys[k]
                    except KeyError:
                        pass
                    obj[new_k] = obj[k]
                    del obj[k]

    #print >> sys.stderr, objkeys
    return {"properties": [
        {"property": k, "enabled": (k not in ("id", "label")), "label": v,
         "types": ["text"]}
        for k, v in objkeys.iteritems()
    ]}

def pubmed_adapter(search=None, id=None):
    '''
    Sample queries:
    #curl "http://localhost:8880/pubmed?"
    curl "http://localhost:8880/pubmed?search=stem+cells"
    curl "http://localhost:8880/pubmed?id=19358275"
    '''
    #FIXME: How do we handle no search or id param? Just serve up the latest entries? Or error as below?
    #assert_(not(search and id), msg="You must specify the 'search' or 'id' query parameter is mandatory.")
    if search:
        #search = first_item(search)
        #reldate: only search for last N days
        #query = urllib.urlencode({'db' : NCBI_DB, 'term': query, 'reldate': '60', 'datetype': 'edat', 'retmax': DEFAULT_MAX_RESULTS, 'usehistory': 'y'})
        query = urllib.urlencode({'term': search, 'db': NCBI_DB,
                                  'datetype': 'edat',
                                  'retmax': DEFAULT_MAX_RESULTS,
                                  'usehistory': 'y'})
        search_url = NCBI_SEARCH_PATTERN + query
        logger.debug("Term search URL: " + search_url)
        doc = bindery.parse(search_url, standalone=True)
        search_terms = search
        ids = ( unicode(i) for i in doc.eSearchResult.IdList.Id )
        ids = ','.join(ids)
        self_link = '/pubmed?search=' + search
    else:
        #ids = first_item(id)
        #fulltext = fulltext[0] if fulltext else u'no'
        #if fulltext == 'yes':
        search_terms = ids
        self_link = '/pubmed?id=' + ids
    query = urllib.urlencode({'db': NCBI_DB, 'id': ids, 'retmode': 'xml'})
    search_url = NCBI_ARTICLE_ACCESS_PATTERN + query
    logger.debug("ID search URL: " + search_url)
    alt_link = search_url
    doc = bindery.parse(search_url, standalone=True, model=PUBMED_MODEL)
    #doc = bindery.parse(open('/Users/uche/tmp/efetch.fcgi.html'), standalone=True, model=PUBMED_MODEL)
    metadata, first_id = metadata_dict(generate_metadata(doc))
    return atom_results(doc, metadata, self_link, alt_link, search_terms)

def post_resource(environ, start_response):
    '''
    Create a new record with a resource type
    '''
    slaveinfo, space_tag = setup_request(environ)

    temp_fpath = read_http_body_to_temp(environ, start_response)
    body = open(temp_fpath, "r").read()

    resource_type = slaveinfo.resource_factory()

    imt = environ['CONTENT_TYPE'].split(';')[0]
    lang = environ.get('CONTENT_LANGUAGE')
    handler = resource_type.run_rulesheet(environ, environ['REQUEST_METHOD'],
                                          imt, lang)

    new_path, content = handler(resource_type, body)
    logger.debug('rulesheet transform output & new uri path (post_resource): '
                 + repr((content[:100], new_path)))
    #Comes back as Unicode, but we need to feed it to slave as encoded byte string
    content = content.encode('utf-8')
    environ['wsgi.input'] = cStringIO.StringIO(content)
    environ['CONTENT_LENGTH'] = len(content)

    response = slaveinfo.create_resource(new_path)
    if not slaveinfo.resp_status.startswith('2'):
        start_response(status_response(slaveinfo.resp_status),
                       slaveinfo.resp_headers)
        return ["Unable to create resource\n"]

    start_response(slaveinfo.resp_status, slaveinfo.resp_headers)
    return response

def tocouch(**params):
    '''
    @xslt - URL to the XSLT transform to be applied
    all other query parameters are passed to the XSLT processor as top-level params

    Sample request:
    curl --request POST --data-binary "@foo.xml" --header "Content-Type: application/xml" "http://*****:*****@xslt=http://hg.akara.info/amara/trunk/raw-file/tip/demo/data/identity.xslt"

    You can check after the fact by visiting http://sforza.ogbuji.net:5984/test1/_all_docs
    Then get the id and surf there
    http://sforza.ogbuji.net:5984/test1/b10d978ced600227e663d6503b1abec4
    or just explore it in Futon
    http://sforza.ogbuji.net:5984/_utils/database.html?test1
    '''
    logger.debug('params: ' + repr(params))
    title = params['t'].decode('UTF-8')
    url = params['url'].decode('UTF-8')
    tags = params['tags'].decode('UTF-8').split(u',')
    desc = params.get('d', u'').decode('UTF-8')

    body = json.dumps({'title': title, 'url': url, 'tags': tags, 'desc': desc},
                      indent=4)
    headers = {}
    resp, content = H.request(COUCHBASE, 'POST', body, headers=headers)
    return '<div>Couch updated?</div><pre>%s</pre>' % body

#{'ld': '2', 'd': 'I gave a number of talks this spring on jQuery and especially on some of the recent additions made in jQuery 1.4.', 'tlt': '2', 'url': 'http://ejohn.org/', 'blt': '1', 'tt': 'totag', 'nd': '1', 'bt': 'via for to unread', 'tl': '6', 'u': 'uche', 'user': '******', 'ned': '1', 'bld': '1', 'net': '1', 'bbt': '1', 'dt': 'todescribe', 't': 'John Resig - JavaScript Programmer'}

def akara_cache_proxy(url=None):
    '''
    Sample request:
    curl -I "http://localhost:8880/akara.cache-proxy?url=http://poemtree.com/poems/UsefulAdvice.htm"
    '''
    logger.debug('remote URL {0}: '.format(repr(url)))
    if not url:
        raise ValueError('url query parameter required')
    resp, content = H.request(url)

    if OVERRIDE_STALE:
        response.add_header(*MAXAGE_HEADER(get_max_age(url)))
    else:
        (fresh, lifetime) = is_fresh(resp)
        if fresh:
            response.add_header(*MAXAGE_HEADER(max(get_max_age(url), lifetime)))
        else:
            response.add_header(*MAXAGE_HEADER(0))

    logger.debug('remote response headers {0}: '.format(repr(resp)))
    #Oof. What about 'transfer-encoding' and other such headers
    for k in resp:
        if k not in ('server', 'status', 'transfer-encoding', 'content-length',
                     'cache-control', 'expires', 'date'):
            response.add_header(normalize_http_header_name(k), resp[k])
        #response.add_header(k, resp[k])

    #FIXME: This might distort return encoding, which would of course throw off
    #content length & encoding. Workaround for now is removal of e.g.
    #transfer-encoding (above)
    return content

def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = "akara.ini"
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc and \
       getprop(latest_ingestion_doc, "delete_process/status") != "complete":
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider, uri_base,
                                                             args.profile_path)
    logger.debug("Ingestion document %s created." % ingestion_document_id)

    return ingestion_document_id

def replace_regex(body, ctype, prop=None, regex=None, new=None):
    """Replaces a regex in prop

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to apply the replacement to
    regex -- the regex to replace
    new -- the substring to replace the regex with
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    if not regex:
        logger.error("No regex parameter supplied")
    else:
        if not new:
            logger.debug("No 'new' parameter, will replace with empty string")
            new = ''
        if exists(data, prop):
            v = getprop(data, prop)
            new_val = replace_regex_recurse_field(v, regex, new)
            setprop(data, prop, new_val)

    return json.dumps(data)

def contentdm(collection='all', query=None, site=DEFAULT_SITE, limit=None):
    '''
    Search all collections in Louisville:
    curl "http://localhost:8880/contentdm.json?query=crutches&site=http://digital.library.louisville.edu/cdm4/&limit=100"

    Search just /jthom collection in Louisville:
    curl "http://localhost:8880/contentdm.json?collection=/jthom&query=crutches&site=http://digital.library.louisville.edu/cdm4/&limit=100"

    Search all collections in U Miami:
    curl "http://localhost:8880/contentdm.json?query=crutches&site=http://doyle.lib.muohio.edu/cdm4/&limit=100"
    '''
    limit = int(limit) if limit else None
    results = read_contentdm(site, collection=collection, query=query,
                             limit=limit, logger=logger,
                             proxy=CACHE_PROXY_SERVICE)
    header = results.next()
    url = header['basequeryurl']
    count = 0
    logger.debug("Start URL: {0}, Limit: {1}".format(repr(url), limit))
    entries = list(results)
    logger.debug("Result count: {0}".format(len(entries)))
    properties = profile_properties(entries)
    #logger.debug("DEFAULT_PROPERTIES: {0}".format(DEFAULT_PROPERTIES))
    for prop in properties:
        if prop[u"property"] in DEFAULT_PROPERTIES:
            prop[u"tags"] = DEFAULT_PROPERTIES[prop[u"property"]][u"tags"]
    #checkmem()
    return json.dumps({'items': entries,
                       'data_profile': {"properties": properties}}, indent=4)

def factory(rest_uri, moin_link=None, opener=None):
    opener = opener or urllib2.build_opener()
    logger.debug('rest_uri: ' + rest_uri)
    req = urllib2.Request(rest_uri, headers={'Accept': DOCBOOK_IMT})
    resp = opener.open(req)
    doc = bindery.parse(resp, standalone=True, model=MOIN_DOCBOOK_MODEL)
    original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #self.original_wiki_base = dict(resp.info())[ORIG_BASE_HEADER]
    #amara.xml_print(self.content_cache)
    metadata, first_id = metadata_dict(generate_metadata(doc))
    metadata = metadata[first_id]
    akara_type = U(metadata[u'ak-type'])
    logger.debug('Type: ' + akara_type)
    try:
        #Older Moin CMS resource types are implemented by registration to the global node.NODES
        cls = node.NODES[akara_type]
    except KeyError:
        #Newer Moin CMS resource types are implemented by discovery of a URL,
        #to which a POST request executes the desired action
        return node.ENDPOINTS and (rest_uri, akara_type,
                                   node.ENDPOINTS[akara_type], doc, metadata,
                                   original_wiki_base)
    else:
        instance = cls(rest_uri, moin_link, opener,
                       cache=(doc, metadata, original_wiki_base))
        return instance

def rss2translate(url=None, format=None):
    """Convert RSS 2.0 feed to Atom or RSS 1.0

    Sample request:
    * curl "http://localhost:8880/akara.rss2translate?url=http://feeds.delicious.com/v2/rss/recent"

    This is a demo and is not meant as an industrial-strength converter.
    """
    # Support content-negotiation in addition to the query parameter
    if not format:
        accepted_imts = request.environ.get('HTTP_ACCEPT', '').split(',')
        imt = first_item(dropwhile(lambda x: '*' in x, accepted_imts))
        if imt == RDF_IMT:
            format = 'rss1'
        else:
            format = 'atom'

    if not url:
        raise AssertionError("The 'url' query parameter is mandatory.")

    import feedparser  # From http://www.feedparser.org/
    feed = feedparser.parse(url)
    # Note: bad URLs might mean the feed doesn't have headers
    logger.debug('Feed info: ' + repr((url, feed.version, feed.encoding,
                                       feed.headers.get('Content-type'))))

    updated = getattr(feed.feed, 'updated_parsed', None)
    if updated:
        #FIXME: Double-check this conversion
        updated = datetime(*updated[:6]).isoformat()

    f = atomtools.feed(title=feed.feed.title, updated=updated,
                       id=feed.feed.link)
    for e in feed.entries:
        updated = getattr(e, 'updated_parsed', None)
        if updated:
            #FIXME: Double-check this conversion
            updated = datetime(*updated[:6]).isoformat()
        links = [
            #FIXME: self?
            (e.link, u'alternate'),
        ]
        f.append(
            e.link,
            e.title,
            updated=updated,
            summary=e.description,
            #e.author_detail.name
            #authors=authors,
            links=links,
        )

    if format == 'atom':
        result = f.xml_encode()
        response.add_header("Content-Type", ATOM_IMT)
    else:
        result = f.rss1format()
        response.add_header("Content-Type", RDF_IMT)
    return result

def texas_enrich_location(body, ctype, action="texas_enrich_location", prop="sourceResource/spatial"): """ Service that accepts a JSON document and enriches the "spatial" field of that document. For use with the texas profile """ try: data = json.loads(body) except: response.code = 500 response.add_header("content-type", "text/plain") return "Unable to parse body as JSON" def _get_coordinates(value): lat, lon = None, None for v in value.split(";"): if "north=" in v: lat = v.split("=")[-1] elif "east=" in v: lon = v.split("=")[-1] if lat and lon: return (lat, lon) else: return () if exists(data, prop): spatial = [] values = getprop(data, prop) for v in values: sp = {"name": v} shredded = [s.strip() for s in v.split(" - ")] coordinates = _get_coordinates(sp["name"]) if coordinates: sp["name"] = "%s, %s" % coordinates if len(shredded) < 5: if not re.search("\d", sp["name"]): sp["country"] = shredded[0] if "country" in sp: if sp["country"] in ["United States", "Canada"]: try: sp["state"] = shredded[1] sp["county"] = shredded[2] sp["city"] = shredded[3] except Exception, e: logger.debug("Error enriching location %s: %s" % (data["_id"], e)) spatial.append(sp) logger.debug("SPATIAL: %s" % spatial) setprop(data, prop, spatial)
def run(self):
    self._sock.setblocking(1)
    logger.debug("Start request from address %r, local socket %r" %
                 (self._addr, self._sock.getsockname()))
    handler = AkaraWSGIDispatcher(self.settings, self.config)
    self.handler = AkaraWSGIHandler(self._sock, self._addr, handler)
    logger.debug("End request from address %r, local socket %r" %
                 (self._addr, self._sock.getsockname()))
    self._sock.close()

def akara_echo_body(body, ctype, log=u'no'):
    '''
    Sample request:
    curl --request POST --data-binary "@foo.dat" --header "Content-type: text/plain" "http://localhost:8880/akara.echo"
    '''
    if log == u'yes':
        from akara import logger
        logger.debug('akara_echo_body: ' + body)
    return body

def texas_enrich_location(body, ctype, action="texas_enrich_location", prop="sourceResource/spatial"): """ Service that accepts a JSON document and enriches the "spatial" field of that document. For use with the texas profile """ try: data = json.loads(body) except: response.code = 500 response.add_header("content-type", "text/plain") return "Unable to parse body as JSON" def _get_coordinates(value): lat, lon = None, None for v in value.split(";"): if "north=" in v: lat = v.split("=")[-1] elif "east=" in v: lon = v.split("=")[-1] if lat and lon: return (lat, lon) else: return () if exists(data, prop): spatial = [] values = getprop(data,prop) for v in values: sp = {"name": v} shredded = [s.strip() for s in v.split(" - ")] coordinates = _get_coordinates(sp["name"]) if coordinates: sp["name"] = "%s, %s" % coordinates if len(shredded) < 5: if not re.search("\d", sp["name"]): sp["country"] = shredded[0] if "country" in sp: if sp["country"] in ["United States", "Canada"]: try: sp["state"] = shredded[1] sp["county"] = shredded[2] sp["city"] = shredded[3] except Exception, e: logger.debug("Error enriching location %s: %s" % (data["_id"], e)) spatial.append(sp) logger.debug("SPATIAL: %s" % spatial) setprop(data, prop, spatial)
def zen_type(space, data):
    '''
    Compute a Zen type as a full moinrest URI, as well as a path relative to
    the top of the wiki instance
    '''
    rtype = data['zen:metadata']['zen:type']
    if logger:
        logger.debug('zen_type link: ' + repr(rtype))
    tpath, tid = rtype, absolutize(rtype, space.remotedb)
    if logger:
        logger.debug('Retrieved zen_type: ' + repr((tid, tpath)))
    return (tid, tpath)

def download_preview(body, ctype):
    """
    Responsible for: downloading a preview for a document

    Usage: as a module in a separate pipeline, to be run on existing documents
    in the repository to download the thumbnails.
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check the "admin/object_status" field
    status = None
    try:
        status = getprop(data, "admin/object_status")
        if status in ["error", "downloaded"]:
            logger.debug("Status is %s, doing nothing" % status)
            return body
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    # Thumbnail URL
    url = None
    try:
        url = getprop(data, "object/@id")
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    # Document ID
    id = None
    try:
        id = getprop(data, "id")
    except KeyError as e:
        logger.error(e.args[0])
        data = set_error(data)
        return json.dumps(data)

    download = False
    if status == "pending":
        download = True

    (relative_fname, mime, status) = download_image(url, id, download)
    if not relative_fname:
        logger.error("Cannot save thumbnail from: %s." % (url))

    # So everything is OK and the file is on disk
    doc = update_document(data, relative_fname, mime, status)
    return json.dumps(doc)

def unset_prop(body, ctype, prop=None, condition=None, condition_prop=None):
    """Unsets the value of prop.

    Keyword arguments:
    body -- the content to load
    ctype -- the type of content
    prop -- the prop to unset
    condition -- the condition to be met (uses prop by default)
    condition_prop -- the prop(s) to use in the condition (comma-separated if
                      multiple props)
    """
    CONDITIONS = {
        "is_digit": lambda v: v[0].isdigit(),
        "mwdl_exclude": lambda v: (v[0] == "collections" or
                                   v[0] == "findingAids"),
        "hathi_exclude": lambda v: "Minnesota Digital Library" in v,
        "finding_aid_title": lambda v: v[0].startswith("Finding Aid"),
        "usc_no_contributor": lambda v: not v[0].get("contributor", False)
    }

    def condition_met(condition_prop, condition):
        values = []
        props = condition_prop.split(",")
        for p in props:
            values.extend(iterify(getprop(data, p, True)))

        return CONDITIONS[condition](values)

    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    # Check if prop exists to avoid key error
    if exists(data, prop):
        if not condition:
            delprop(data, prop)
        else:
            if not condition_prop:
                condition_prop = prop
            try:
                if condition_met(condition_prop, condition):
                    logger.debug("Unsetting prop %s for doc with id %s" %
                                 (prop, data["_id"]))
                    delprop(data, prop)
            except KeyError:
                logger.error("CONDITIONS does not contain %s" % condition)

    return json.dumps(data)

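# Usage sketch (assumption: invoked like the other enrichment services with a
# JSON body; the record below is hypothetical).
#
#   body = json.dumps({"_id": "x", "sourceResource": {"relation": "123"}})
#   unset_prop(body, "application/json",
#              prop="sourceResource/relation", condition="is_digit")
#   # -> returns JSON with sourceResource/relation removed, since "123" is
#   #    all digits and the "is_digit" condition is met
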
def dataprovider_transform(d, p):
    dataprovider = []
    for s in getprop(d, p):
        if "partner" in s:
            term = s.split(":")[-1]
            try:
                dataprovider.append(DATAPROVIDER_TERM_LABEL[term])
            except:
                logger.debug("TERM %s does not exist %s" % (term, d["_id"]))

    return {"dataProvider": dataprovider} if dataprovider else {}

def register_service(self, ident, path, handler, doc=None,
                     query_template=None):
    if "/" in path:
        raise ValueError("Registered path %r may not contain a '/'" % (path,))
    if doc is None:
        doc = inspect.getdoc(handler) or ""
    if ident in self._registered_services:
        logger.warn("Replacing mount point %r (%r)" % (path, ident))
    else:
        logger.debug("Created new mount point %r (%r)" % (path, ident))
    serv = Service(handler, path, ident, doc, query_template)
    self._registered_services[path] = serv

def url(self):
    domain = "ws.geonames.org"
    user = ''
    if self.user:
        domain = "ba-ws.geonames.net"
        user = '******'%self.user
        logger.debug('Using Commercial GeoNames service (ba-ws.geonames.org). Username: '******'json': 'JSON'}
    resource = "search" + append_formats.get(output_format, '')
    return "http://%(domain)s/%(resource)s?%(user)s%%s" % locals()

def primotodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in CHO_TRANSFORMER:
        if exists(data, p):
            out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
    for p in AGGREGATION_TRANSFORMER:
        if exists(data, p):
            out.update(AGGREGATION_TRANSFORMER[p](data, p))

    # Apply transformations that are dependent on more than one
    # original document field
    sp_props = ["display/lds08"]
    ipo_props = ["display/lds04"]
    title_props = ["display/title", "display/lds10"]
    out["sourceResource"].update(
        multi_transform(data, "spatial", sp_props, "list"))
    out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
    out["sourceResource"].update(multi_transform(data, "title", title_props))

    dp_props = ["display/lds03"]
    out.update(multi_transform(data, "dataProvider", dp_props))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

def search(self, term):
    qstr = urllib.urlencode({'verb': 'GetRecord',
                             'metadataPrefix': 'oai_dc',
                             'identifier': dspace_id})
    url = DSPACE_OAI_ENDPOINT + '?' + qstr
    logger.debug('DSpace URL: ' + str(url))
    #keywords = [ (k.strip(), JOVE_TAG) for k in unicode(row.xml_select(u'string(.//*[@class="keywords"])')).split(',') ]

    doc = bindery.parse(url, model=OAI_MODEL)
    #print >> sys.stderr, list(generate_metadata(doc))
    resources, first_id = metadata_dict(generate_metadata(doc),
                                        nesteddict=False)
    record = doc.OAI_PMH
    resource = resources[first_id]

def arctodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified ARC (NARA) format into the DPLA JSON-LD
    format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for p in data.keys():
        if p in CHO_TRANSFORMER:
            out["sourceResource"].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Apply transformations that are dependent on more than one
    # original document field
    out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(rights_transform(data))
    out["sourceResource"].update(subject_and_spatial_transform(data))
    out.update(has_view_transform(data))
    out["sourceResource"].update(transform_state_located_in(data))

    if exists(out, "sourceResource/date"):
        logger.debug("OUTTYPE: %s" % getprop(out, "sourceResource/date"))

    if exists(data, "objects/object"):
        out.update(transform_thumbnail(data))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if len(uri) < 1:
            continue  # in case there's no pipeline
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.debug("Error in enrichment pipeline at %s: %s" %
                         (uri, repr(resp)))
            continue
        body = cont
    return body

def listrecords(limit=100):
    import httplib
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, '_design', VIEW_APP, '_view', VIEW_NAME)
    url += '?limit=' + str(limit)
    logger.debug(url)
    resp, content = h.request(url, "GET", headers=COUCH_AUTH_HEADER)
    logger.debug("Content: " + content)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't get documents via: " + repr(resp))

def edantodpla(body, ctype, geoprop=None):
    """
    Convert output of JSON-ified EDAN (Smithsonian) format into the DPLA
    JSON-LD format.

    Parameter "geoprop" specifies the property name containing lat/long coords
    """
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header("content-type", "text/plain")
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document
    for k, v in CHO_TRANSFORMER.items():
        if exists(data, k):
            out["sourceResource"].update(v(data))
    for k, v in AGGREGATION_TRANSFORMER.items():
        if exists(data, k):
            out.update(v(data))

    # Apply transformations that are dependent on more than one
    # original document field
    #out["sourceResource"].update(type_transform(data))
    out["sourceResource"].update(transform_rights(data))
    out["sourceResource"].update(transform_subject(data))
    out["sourceResource"].update(transform_spatial(data))

    out.update(transform_is_shown_at(data))
    out.update(transform_object(data))
    out.update(transform_data_provider(data))

    # Additional content not from original document
    if "HTTP_CONTRIBUTOR" in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ["HTTP_CONTRIBUTOR"] + "---" + repr(e))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

def subject_and_spatial_transform(d, p):
    val = {}
    val["subject"] = []
    val["spatial"] = []

    v = getprop(d, p)
    for s in iterify(v):
        subject = []
        if "name" in s:
            subject.append(name_from_name_part(getprop(s, "name/namePart")))

        if "topic" in s:
            for t in (s["topic"] if isinstance(s["topic"], list) else
                      [s["topic"]]):
                if t not in subject:
                    subject.append(t)

        if "geographic" in s:
            for g in iterify(s["geographic"]):
                if g not in subject:
                    subject.append(g)
                if g not in val["spatial"]:
                    val["spatial"].append(g)

        if "hierarchicalGeographic" in s:
            for h in iterify(s["hierarchicalGeographic"]):
                if isinstance(h, dict):
                    for k in h.keys():
                        if k not in ["city", "county", "state", "country",
                                     "coordinates"]:
                            del h[k]
                    if h not in val["spatial"]:
                        val["spatial"].append(h)
                    if "country" in h:
                        subject.append(h["country"])

        coords = getprop(s, "cartographics/coordinates", True)
        if coords and coords not in val["spatial"]:
            val["spatial"].append(coords)

        if "temporal" in s:
            logger.debug("TEMPORAL: %s" % s["temporal"])

        val["subject"].append("--".join(subject))

    if not val["subject"]:
        del val["subject"]
    if not val["spatial"]:
        del val["spatial"]

    return val

def scraper_json(url=None):
    '''
    End-point for bookmarklet that scrapes a site for RDFa, then uses Calais

    Sample request:
    * curl "http://localhost:8880/z.scraper.json?url=http://zepheira.com"
    '''
    for s in SCRAPER_SERVICES:
        logger.debug("Trying scraper service: " + s)
        #print >> sys.stderr, 'Trying:', s%{'url': url[0]}
        #result = urllib.urlopen(s%{'url': url[0]}).read()
        result = urllib.urlopen(s + url[0]).read()
        if result:
            return result

    return '{}'

def __init__(self, space, docid, data, rtype=None):
    '''
    '''
    self.docid = docid
    self.space = space
    self.slave_uri = join(space.remotedb, docid)
    self.data = data
    self.rulesheet = None
    if logger:
        logger.debug('GRIPPO: ' + repr(rtype))
    if isinstance(rtype, basestring) and rtype != RESOURCE_TYPE_TYPE:
        self.type = space.resource_factory(rtype)
    else:
        self.type = rtype
    return

def find_conversion_dictionary(mapping_key):
    """Finds the dictionary with values to use for conversion.

    Args:
        mapping_key (Str): Name of conversion key read from akara.conf

    Returns:
        Dictionary used for converting values.
    """
    # Mapping should be in akara.conf
    mapping = module_config().get('lookup_mapping')
    logger.debug("Looking for mapping using key [%s]" % mapping_key)
    dict_name = mapping[mapping_key].upper()
    logger.debug("Found substitution dict [%s] for key mapping [%s]" %
                 (dict_name, mapping_key))
    return globals()[dict_name]

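# Illustrative assumption about the shape of the config and the module-level
# dicts this resolves to (the names below are hypothetical, not the real
# akara.conf contents):
#
#   # akara.conf:  lookup_mapping = {'dc_type': 'type_to_type'}
#   TYPE_TO_TYPE = {"Image": "image", "Text": "text"}
#   find_conversion_dictionary('dc_type')   # -> TYPE_TO_TYPE
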
def map_data_provider(self):
    prop = "originalRecord/header/setSpec"
    if exists(self.provider_data, prop):
        dataprovider = []
        for s in iterify(getprop(self.provider_data, prop)):
            if "partner" in s:
                term = s.split(":")[-1]
                try:
                    dataprovider.append(self.dataprovider_term_label[term])
                except:
                    logger.debug("Term %s does not exist in " % term +
                                 "self.dataprovider_term_label for %s" %
                                 self.provider_data["_id"])
        if dataprovider:
            self.mapped_data.update({"dataProvider": dataprovider})

def generate_file_path(id, file_extension):
    """
    Generates and returns the file path based on the provided params.

    Algorithm for generating the file path:
      - convert all disallowed characters in the document id to "_"
      - to the above string add the number and extension, getting FILE_NAME
      - fetch the id (it will already be the md5 of the _id field)
      - convert to uppercase
      - insert "/" between each two characters of this hash, getting
        CALCULATED_PATH
      - join the MAIN_PATH, CALCULATED_PATH and FILE_NAME

    Arguments:
        id             - document id from couchdb
        file_extension - extension of the file

    Returns:
        filepath       - path, without file name
        full_filepath  - path, with file name
        relative_fname - path, relative, without ROOT_PATH

    Example:
        Function call:
            generate_file_path('clemsontest--hcc001-hcc016', ".jpg")

        Generated values for the algorithm steps:
        TODO: Update doc here for the new algorithm.

            CLEARED_ID: clemsontest__hcc001_hcc016
            FILE_NAME:  clemsontest__hcc001_hcc016.jpg
            HASHED_ID:  8E393B3B5DA0E0B3A7AEBFB91FE1278A
            PATH:       8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/
            FULL_NAME:  /main_pic_dir/8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/clemsontest__hcc001_hcc016.jpg
    """
    cleared_id = id.upper()
    logger.debug("Generating filename for document with id: [%s].", id)

    fname = "%s%s" % (cleared_id, file_extension)
    logger.debug("File name: " + fname)

    path = re.sub("(.{2})", "\\1" + os.sep, cleared_id, re.DOTALL)
    logger.debug("PATH: " + path)

    relative_fname = os.path.join(path, fname)
    path = os.path.join(THUMBS_ROOT_PATH, path)
    full_fname = os.path.join(path, fname)

    logger.debug("FULL PATH: " + full_fname)

    return (path, full_fname, relative_fname)

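# Usage sketch on a POSIX system, following the docstring example
# (assumption: THUMBS_ROOT_PATH is "/main_pic_dir" and the id passed in is
# already the md5 hash, per the TODO note above):
#
#   path, full_fname, relative_fname = \
#       generate_file_path("8E393B3B5DA0E0B3A7AEBFB91FE1278A", ".jpg")
#   # relative_fname ->
#   #   "8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/8E393B3B5DA0E0B3A7AEBFB91FE1278A.jpg"
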
def update_document(body, ctype):
    logger.debug(body)

    from StringIO import StringIO
    io = StringIO(body)
    parsed_doc = json.load(io)
    document_id = parsed_doc[u"id"]
    document = body

    logger.debug("Storing the document: " + document_id)
    import httplib
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document,
                              headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " %
                     (document, document_id,))

def pipe(content, ctype, enrichments, wsgi_header):
    body = json.dumps(content)
    for uri in enrichments:
        if not uri:
            continue  # in case there's no pipeline
        if not is_absolute(uri):
            prefix = request.environ['wsgi.url_scheme'] + '://'
            prefix += (request.environ['HTTP_HOST']
                       if request.environ.get('HTTP_HOST')
                       else request.environ['SERVER_NAME'])
            uri = prefix + uri
        headers = copy_headers_to_dict(request.environ, exclude=[wsgi_header])
        headers['content-type'] = ctype
        logger.debug("Calling url: %s " % uri)
        resp, cont = H.request(uri, 'POST', body=body, headers=headers)
        if not str(resp.status).startswith('2'):
            logger.warn("Error in enrichment pipeline at %s: %s" %
                        (uri, repr(resp)))
            continue
        body = cont
    return body

def oaitodpla(body, ctype, geoprop=None):
    '''
    Convert output of Freemix OAI service into the DPLA JSON-LD format.

    Does not currently require any enrichments to be ahead in the pipeline,
    but supports geocoding if used. In the future, subject shredding may be
    assumed too.

    Parameter "geoprop" specifies the property name containing lat/long coords
    '''
    try:
        data = json.loads(body)
    except:
        response.code = 500
        response.add_header('content-type', 'text/plain')
        return "Unable to parse body as JSON"

    global GEOPROP
    GEOPROP = geoprop

    out = {"@context": CONTEXT, "sourceResource": {}}

    # Apply all transformation rules from original document to sourceResource
    for p in data.keys():
        if p in CHO_TRANSFORMER:
            out['sourceResource'].update(CHO_TRANSFORMER[p](data))
        if p in AGGREGATION_TRANSFORMER:
            out.update(AGGREGATION_TRANSFORMER[p](data))

    # Additional content not from original document
    if 'HTTP_CONTRIBUTOR' in request.environ:
        try:
            out["provider"] = json.loads(
                base64.b64decode(request.environ['HTTP_CONTRIBUTOR']))
        except Exception as e:
            logger.debug("Unable to decode Contributor header value: " +
                         request.environ['HTTP_CONTRIBUTOR'] + "---" + repr(e))

    # Strip out keys with None/null values?
    out = dict((k, v) for (k, v) in out.items() if v)

    return json.dumps(out)

def kentucky_identify_object(body, ctype, download="True"): """ Responsible for: adding a field to a document with the URL where we should expect to the find the thumbnail """ LOG_JSON_ON_ERROR = True def log_json(): if LOG_JSON_ON_ERROR: logger.debug(body) data = {} try: data = json.loads(body) except Exception as e: msg = "Bad JSON: " + e.args[0] logger.error(msg) response.code = 500 response.add_header('content-type', 'text/plain') return msg relation_field = "sourceResource/relation" if exists(data, relation_field): url = getprop(data, relation_field) else: msg = "Field %s does not exist" % relation_field logger.debug(msg) return body base_url, ext = os.path.splitext(url) data["object"] = "%s_tb%s" % (base_url, ext) status = IGNORE if download == "True": status = PENDING if "admin" in data: data["admin"]["object_status"] = status else: data["admin"] = {"object_status": status} return json.dumps(data)
def _twofishes_data(self, url):
    """Return a dict of Twofishes data for the given URL.

    Rely on the response being Unicode-encoded JSON.
    """
    logger.debug("GET %s" % url)
    try:
        response = urlopen(url, None, 2)
        http_status = response.getcode()
        if http_status != 200:
            logger.error("Got status %d from %s" % (http_status, url))
            return {}
        return json.loads(response.read())
    except URLError as e:
        logger.error("Could not open %s (%s)" % (url, e))
        return {}
    except Exception as e:
        logger.error("Unexpected exception from %s: %s" % (url, e))
        return {}

def is_fresh(resp):
    """
    Returns a tuple, the first element a boolean whether the response can be
    considered (for our purposes) fresh or not, and the second the freshness
    lifetime of the response.

    Much of this is reworked from httplib2._entry_disposition. We can't reuse
    it directly since it assumes responses are stale unless otherwise marked
    as fresh, and we want to do the opposite.
    """
    fresh = True
    freshness_lifetime = 0
    cc_response = httplib2._parse_cache_control(resp)
    if 'no-cache' in cc_response or 'private' in cc_response:
        fresh = False
    elif 'date' in resp:
        date = calendar.timegm(email.Utils.parsedate_tz(resp['date']))
        now = time.time()
        current_age = max(0, now - date - 5)  # Give us 5 seconds to get this far
        if 'max-age' in cc_response:
            try:
                freshness_lifetime = int(cc_response['max-age'])
            except ValueError:
                freshness_lifetime = 0
        elif 'expires' in resp:
            expires = email.Utils.parsedate_tz(resp['expires'])
            if expires == None:
                freshness_lifetime = 0
            else:
                freshness_lifetime = calendar.timegm(expires) - date
        else:
            freshness_lifetime = 0
        if freshness_lifetime < current_age:
            logger.debug('lifetime = {0}, age = {1}, so marking explicitly stale'
                         .format(freshness_lifetime, current_age))
            fresh = False
    return fresh, freshness_lifetime

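# Usage sketch (assumption: H is the module-level httplib2.Http() instance
# used by akara_cache_proxy above; the URL is hypothetical).
def _example_is_fresh():
    resp, content = H.request("http://example.org/some/page")
    fresh, lifetime = is_fresh(resp)
    # e.g. a response carrying 'cache-control: max-age=3600' and a current
    # 'date' header comes back as (True, 3600)
    return fresh, lifetime
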
def david_rumsey_identify_object(body, ctype, download="True"): """ Responsible for: adding a field to a document with the URL where we should expect to the find the thumbnail """ LOG_JSON_ON_ERROR = True def log_json(): if LOG_JSON_ON_ERROR: logger.debug(body) data = {} try: data = json.loads(body) except Exception as e: msg = "Bad JSON: " + e.args[0] logger.error(msg) response.code = 500 response.add_header('content-type', 'text/plain') return msg handle_field = "originalRecord/handle" if exists(data, handle_field): handle = getprop(data, handle_field) else: msg = "Field %s does not exist" % handle_field logger.debug(msg) return body data["object"] = handle[1] status = IGNORE if download == "True": status = PENDING if "admin" in data: data["admin"]["object_status"] = status else: data["admin"] = {"object_status": status} return json.dumps(data)