class PublicDomain:
    """Resolve and cache the public-domain status of a fcrepo asset.

    The status is looked up in the artic.edu datahub aggregator and cached
    in a local sqlite table (``pd_designations``) with a 24-hour expiry.
    """

    app = None
    config = None
    fcrepo_id = ""
    dhurl = ""
    is_public_domain = False

    def __init__(self, app, config, fcrepo_id):
        self.logger = getLogger(__name__)
        self.app = app
        self.config = config
        self.fcrepo_id = fcrepo_id
        self.session = requests.Session()
        # Configs without an "enviro" key are assumed to run behind the
        # institutional proxy.
        if "enviro" not in config:
            self.session.proxies = {
                "http": "http://sysprox.artic.edu:3128",
                "https": "http://sysprox.artic.edu:3128",
            }
        if self._valid_fcrepo_id(fcrepo_id):
            # Datahub search URL: match the id as either the primary or an
            # alternate image id, and request only the fields we use.
            self.dhurl = (
                "http://aggregator-data.artic.edu/api/v1/artworks/search"
                "?cache=false&query[bool][should][][term][image_id]="
                + self.fcrepo_id
                + "&query[bool][should][][term][alt_image_ids]="
                + self.fcrepo_id
                + "&fields=is_public_domain,id,is_zoomable,"
                "max_zoom_window_size,api_link,title,artist_display"
            )
        self._db = DB(app, config["sqlite"]["db"])
        self.logger.debug("fcrepo_id is: {}".format(self.fcrepo_id))

    def get(self):
        """Serve the cached binary for this asset.

        NOTE(review): this method references ``self.get_fs_path`` and
        ``self.contenttype``, neither of which is defined on PublicDomain
        (they exist on Content).  It looks like a copy/paste remnant and
        will raise AttributeError if called — confirm whether any route
        still dispatches here before removing it.
        """
        self.logger.debug("Fetching public_domain status for: {}".format(
            self.fcrepo_id))
        fs_path = self.get_fs_path()
        if fs_path != "Status404":
            self.logger.debug("Reading: {}".format(fs_path))
            with open(fs_path, "rb") as f:
                imagedata = f.read()
            self.logger.debug("Serving: {}".format(fs_path))
            if imagedata:
                response = Response(imagedata)
                response.headers['Content-type'] = self.contenttype
                return (response, "200")
            else:
                return ("What? \n" + fs_path, 404)
        else:
            return ("404 Not Found", 404)

    def get_pd_status(self):
        """Return a small JSON string with the public-domain flag.

        Returns the sentinel strings "Status404" (invalid/unknown id) or
        "Status503" (datahub lookup failed) so the caller can map them to
        HTTP status codes.
        """
        self.logger.debug(
            "Fetching stored public_domain status for: {}".format(
                self.fcrepo_id))
        if not self._valid_fcrepo_id(self.fcrepo_id):
            return "Status404"
        pd_status = self._pd_desg_get()
        self.logger.debug(
            "Returning public_domain status {} for {}".format(
                pd_status, self.fcrepo_id))
        if str(pd_status) in ("Status503", "Status404"):
            return pd_status
        return '{ "is_public_domain": ' + str(pd_status).lower() + ' }'

    def _pd_desg_get(self):
        """Return the cached or freshly-fetched public-domain flag.

        Checks the sqlite cache first (24-hour expiry); on a miss, verifies
        the asset exists in fcrepo, then asks the datahub and persists the
        answer via _pd_desg_put().
        """
        self.pd_desgs_exists = False
        # fcrepo_id is regex-validated by the caller (get_pd_status), which
        # keeps this string-built SQL safe; a parameterized query would
        # still be preferable if the DB wrapper supports one.
        sql_query = "SELECT public_domain FROM pd_designations WHERE fcrepo_image_id = '" + self.fcrepo_id + "' AND last_checked >= datetime('now', '-24 hours');"
        self.logger.debug(
            "Checking for existing pd_status within expiry time: {}".format(
                sql_query))
        pd_desgs = self._db.query(sql_query)
        if pd_desgs is not None:
            self.logger.debug("Found DB entry for {}.".format(self.fcrepo_id))
            self.pd_desgs_exists = True
            if str(pd_desgs[0][0]) == "1":
                self.is_public_domain = True
        else:
            # Cache miss: make sure the asset actually exists in fcrepo
            # before asking the datahub.
            if not self._content_in_fcrepo(self.fcrepo_id):
                return "Status404"
            self.logger.debug("No DB entry found for {}.".format(
                self.fcrepo_id))
            self.logger.debug("Checking datahub for {}.".format(
                self.fcrepo_id))
            try:
                dhresponse = self.session.get(self.dhurl)
                dhdata = dhresponse.json()
                if len(dhdata["data"]) > 0:
                    if dhdata["data"][0]["is_public_domain"]:
                        self.is_public_domain = True
                else:
                    # Assets the datahub does not know about are treated as
                    # public domain; they may be Interpretive Resources.
                    self.logger.debug(
                        "Datahub does not know about {}. Public_domain is true as this may be an Interpretive Resource."
                        .format(self.fcrepo_id))
                    self.is_public_domain = True
                self._pd_desg_put()
            except Exception:
                # Was a bare `except:`, which also swallowed SystemExit and
                # KeyboardInterrupt.  Narrowed, and the failure is logged so
                # a 503 is diagnosable.
                self.logger.exception(
                    "Datahub lookup failed for {}".format(self.fcrepo_id))
                return "Status503"
        return self.is_public_domain

    def _pd_desg_put(self):
        """Insert or refresh this asset's public-domain row in sqlite."""
        self.logger.debug(
            "Public domain status is {} for insert to DB for Asset {}".format(
                self.is_public_domain, self.fcrepo_id))
        pd_status_str = "1" if self.is_public_domain else "0"
        # fcrepo_id was regex-validated before we got here; see note in
        # _pd_desg_get about parameterized queries.
        sql_query = "SELECT public_domain FROM pd_designations WHERE fcrepo_image_id = '" + self.fcrepo_id + "';"
        self.logger.debug(
            "Checking for existing pd_status regardless of expiry: {}".format(
                sql_query))
        pd_desgs = self._db.query(sql_query)
        if pd_desgs is not None:
            sql_query = "UPDATE pd_designations SET public_domain='" + pd_status_str + "', last_checked=datetime('now') WHERE fcrepo_image_id = '" + self.fcrepo_id + "';"
            self._db.update(sql_query)
        else:
            sql_query = "INSERT INTO pd_designations (fcrepo_image_id, public_domain, last_checked) VALUES ('" + self.fcrepo_id + "', '" + pd_status_str + "', datetime('now'))"
            self._db.update(sql_query)
        return True

    def _content_in_fcrepo(self, fcrepo_id):
        """Return True if fcrepo answers 200 to a HEAD for this asset."""
        fcrepo_path = fcrepo_path_from_hash(fcrepo_id)
        fcrepo_url = self.config["httpresolver"][
            "prefix"] + fcrepo_path + self.config["httpresolver"]["postfix"]
        fcrepo_hit = self.session.head(fcrepo_url)
        return fcrepo_hit.status_code == 200

    def _valid_fcrepo_id(self, fcrepo_id):
        """Return True when the id is UUID-shaped AND resolvable in fcrepo.

        Unlike Content._valid_fcrepo_id, this variant also requires the
        asset to exist upstream (the HEAD check previously duplicated
        _content_in_fcrepo inline; it now delegates to it).
        """
        regex = re.compile(
            '^[a-z0-9]{8}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{12}$',
            re.I)
        if not regex.match(fcrepo_id):
            return False
        return self._content_in_fcrepo(fcrepo_id)
class Content:
    """Fetch a fcrepo binary, cache it on the local filesystem, serve it.

    One ETag per asset is stored in sqlite so later requests can send
    If-None-Match and, on a 304, serve straight from the disk cache.
    """

    app = None
    config = None
    fcrepo_id = ""
    # NOTE(review): this class attribute was misspelled ``fcepo_path``; the
    # attribute actually set and read everywhere is ``fcrepo_path``.
    fcrepo_path = ""
    url = ""
    etag_exists = False
    # Maps upstream Content-Type headers to the extension used for the
    # cached file on disk.
    content_type_extension_map = {
        "image/jp2": "jp2",
        "image/tiff": "tif",
        "image/tif": "tif",
        "audio/mpeg": "mp3",
        "audio/x-wave": "wav",
        "text/plain": "txt",
        "application/pdf": "pdf",
        "video/mp4": "mp4",
        "video/mpeg": "mpeg",
        "video/quicktime": "mov",
        "video/x-flv": "flv",
        "application/x-shockwave-flash": "swf",
        "image/jpeg": "jpg",
        "image/png": "png",
        "image/gif": "gif",
    }
    extension = ""
    contenttype = ""

    def __init__(self, app, config, fcrepo_id):
        self.logger = getLogger(__name__)
        self.app = app
        self.config = config
        self.fcrepo_id = fcrepo_id
        self.session = requests.Session()
        # UUID-shaped ids get the sharded fcrepo pairtree path; anything
        # else is resolved verbatim under the root.
        if self._valid_fcrepo_id(fcrepo_id):
            self.fcrepo_path = fcrepo_path_from_hash(fcrepo_id)
        else:
            self.fcrepo_path = '/' + fcrepo_id
        self.url = self.config["httpresolver"][
            "prefix"] + self.fcrepo_path + self.config["httpresolver"][
                "postfix"]
        self._db = DB(app, config["sqlite"]["db"])
        self.logger.debug("fcrepo_id is: {}".format(self.fcrepo_id))
        self.logger.debug("fcrepo_path is: {}".format(self.fcrepo_path))

    def get(self):
        """Return (response, status) serving the asset's cached bytes.

        Returns a Flask-style Response tuple on success and plain
        (message, 404) tuples when the asset is missing or empty.
        """
        self.logger.debug("Fetching binary for: {}".format(self.fcrepo_id))
        fs_path = self.get_fs_path()
        if fs_path != "Status404":
            self.logger.debug("Reading: {}".format(fs_path))
            with open(fs_path, "rb") as f:
                imagedata = f.read()
            self.logger.debug("Serving: {}".format(fs_path))
            if imagedata:
                response = Response(imagedata)
                response.headers['Content-type'] = self.contenttype
                return (response, "200")
            else:
                return ("What? \n" + fs_path, 404)
        else:
            return ("404 Not Found", 404)

    def get_fs_path(self):
        """Return the local cache path for this asset, fetching if stale.

        Sends a conditional HEAD with the stored ETag; a 304 means the
        cached file is still valid, anything else (except 404/503) triggers
        a fresh download into the cache.  Returns the sentinel strings
        "Status404"/"Status503" on upstream errors.
        """
        self.logger.debug("Fetching fileystem location for: {}".format(
            self.fcrepo_id))
        headers = {}
        etag = self._etag_get()
        self.logger.debug("Etag is: {}".format(etag))
        if etag:
            headers["If-None-Match"] = etag
        cache_req = self.session.head(self.url, headers=headers)
        self.logger.debug('ETag cache response code: {}'.format(
            cache_req.status_code))
        self.logger.debug("cache_req headers: {}".format(cache_req.headers))
        if cache_req.status_code == 304:
            self.logger.debug('Status was 304. Looking for cached file.')
            # self.extension is usually "" here, so this glob matches any
            # alphanumeric extension under the sharded cache path.
            cache_fs_path = self.config["cache"][
                "basedir"] + self.fcrepo_path + "." + self.extension
            # ABSOLUTE NECESSITY
            file_matches = glob.glob(cache_fs_path + "[0-9a-zA-Z]*")
            if len(file_matches) > 0:
                cached_file_path = file_matches[0]
                if '.' in cached_file_path:
                    # Recover extension/content-type from the cached
                    # filename.  NOTE(review): if the extension is not in
                    # the map, contenttype stays "" — presumably callers
                    # tolerate that; confirm.
                    self.extension = cached_file_path.split('.')[-1]
                    for key, value in self.content_type_extension_map.items():
                        if value == self.extension:
                            self.extension = value
                            self.contenttype = key
                            break
                if self.extension == "":
                    # Default to JPEG2000, the dominant asset type here.
                    self.extension = "jp2"
                    self.contenttype = "image/jp2"
            else:
                # 304 but nothing on disk (cache was purged): re-download.
                cached_file_path = self._copy_to_cache(cache_req.headers)
        elif cache_req.status_code == 404:
            cached_file_path = "Status404"
        elif cache_req.status_code == 503:
            cached_file_path = "Status503"
        else:
            cached_file_path = self._copy_to_cache(cache_req.headers)
        self.logger.debug(
            "Returning filesystem location: {}".format(cached_file_path))
        return cached_file_path

    def iipimage_redirect_path(self):
        """Return the cache path relative to the cache basedir (for IIP)."""
        fs_path = self.get_fs_path()
        redirect_file = fs_path.replace(self.config["cache"]["basedir"], '')
        return redirect_file

    def _set_extension_contenttype(self, cache_req_headers):
        """Set self.extension/self.contenttype from a Content-Type header."""
        if "content-type" in cache_req_headers:
            for key, value in self.content_type_extension_map.items():
                if key == cache_req_headers["content-type"]:
                    self.extension = value
                    self.contenttype = key
                    break
        return

    def _copy_to_cache(self, cache_req_headers):
        """Stream the asset from fcrepo into the cache; return its path."""
        if "content-type" in cache_req_headers:
            self._set_extension_contenttype(cache_req_headers)
        else:
            contenthead = self.session.head(self.url)
            self._set_extension_contenttype(contenthead.headers)
        self.logger.debug("Copying to cache.")
        # Will take:
        # /86/bf/14/11/86bf1411-6180-8103-52a1-e4d84f478ec1
        # and return:
        # /86/bf/14/11/
        cache_dir = self.config["cache"]["basedir"] + self.fcrepo_path.replace(
            self.fcrepo_id, '')
        self.logger.debug("Cache dir is: {}".format(cache_dir))
        self._create_cache_dir(cache_dir)
        cache_fs_path = self.config["cache"][
            "basedir"] + self.fcrepo_path + "." + self.extension
        if os.path.isfile(cache_fs_path):
            os.unlink(cache_fs_path)
        # Mindful of this. Requests.Session may require non-streamed content
        # or the connection is not released back in to the pool.
        with self.session.get(self.url, stream=True) as r:
            with open(cache_fs_path, 'wb') as f:
                # Increase the chunk size. Fewer disk writes.
                for chunk in r.iter_content(10240):
                    f.write(chunk)
            # Store the upstream ETag so later requests can send
            # If-None-Match.  Guarded: not every response carries an ETag
            # header, and the previous unconditional r.headers['etag']
            # raised KeyError in that case.
            response_etag = r.headers.get('etag')
            if response_etag:
                self._etag_put(response_etag)
        return cache_fs_path

    def _create_cache_dir(self, cache_dir):
        """Create the sharded cache directory, tolerating its existence."""
        try:
            os.makedirs(cache_dir)
        except FileExistsError:
            # Equivalent to the old errno.EEXIST check; races with other
            # workers creating the same shard are expected and harmless.
            pass

    def _etag_get(self):
        """Return the stored ETag for this asset, quoted for If-None-Match.

        Also sets self.etag_exists so _etag_put knows whether to UPDATE or
        INSERT.  Returns None when no row exists.
        """
        # fcrepo_id comes from the request path; a parameterized query
        # would be preferable if the DB wrapper supports one.
        sql_query = "SELECT etag FROM etags WHERE fcrepoid = '" + self.fcrepo_id + "';"
        etags = self._db.query(sql_query)
        if etags is not None:
            self.etag_exists = True
            return '"' + str(etags[0][0]) + '"'
        else:
            return None

    def _etag_put(self, etag):
        """Persist the (first) ETag value for this asset in sqlite."""
        # Keep only the first value of a multi-ETag header and strip the
        # surrounding double quotes.  Single quotes are stripped too, since
        # the server-supplied value is concatenated into SQL below.
        etag = etag.split(',')[0]
        etag = etag.replace('"', '').replace("'", "")
        self.logger.debug("Etag for inserting into DB: {}".format(etag))
        if self.etag_exists:
            sql_query = "UPDATE etags SET etag='" + etag + "' WHERE fcrepoid = '" + self.fcrepo_id + "';"
            self._db.update(sql_query)
        else:
            sql_query = "INSERT INTO etags (fcrepoid, etag) VALUES ('" + self.fcrepo_id + "', '" + etag + "')"
            self._db.update(sql_query)
        return True

    def _valid_fcrepo_id(self, fcrepo_id):
        """Return True when the id looks like a (possibly unhyphenated) UUID."""
        regex = re.compile(
            '^[a-z0-9]{8}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{4}-?[a-z0-9]{12}$',
            re.I)
        return bool(regex.match(fcrepo_id))