def get(self):
    """
    A full, synchronous get-with-expiration check.

    The whole get-check-extract process is in three easily accessible
    stages (raw_get / check_result / extract_result) so that async
    cache APIs can call them separately.

    Returns a ('hit', value) or ('miss', value-or-None) pair.
    """
    policy = self._policy
    if not policy.should_read_cache():
        # this is now pretty normal for client. We should only log
        # this when the client isn't requesting it.
        # LOG.warn("memcache.skip.read", "Skipping cache", **self._log_kwds())
        policy.add_cost('s')
        return ('miss', None)

    full_result = self.raw_get()
    key = self.get_key()
    if self.check_result(key, full_result):
        return ('hit', policy.extract_result(full_result))

    # would rather log this in the policy object, because this
    # is the only use of tag?
    LOG.notice("%s.cache.result" % policy.tag, "",
               key=key, keyobj=self.key_obj,
               **self._log_kwds(code="miss"))
    if policy.allow_stale() and full_result:
        # serve the expired value anyway, but report it as a miss
        return ('miss', policy.extract_result(full_result))
    return ('miss', None)
def _safe_entrypoint(f, *args, **kwds): """ Decorator that just makes the function avoid throwing exception """ try: return f(*args, **kwds) except pylibmc.Error as e: LOG.error("memcache.%s.error" % f.__name__, "Error in memcache call", error=e)
def envelope_error(self, message, **kwds):
    """
    Build, log and return a standard envelope-parse error payload.

    Side effect: marks the session with the 'mql-error' hint.
    """
    # this should match the MQLError definition inside mw/mql/error.py
    detail = {
        'code': '/api/status/error/envelope/parse',
        'info': kwds,
        'message': message
    }
    LOG.warning("envelope.error", repr(detail))
    self.mss.add_hint('mql-error')
    return {'code': '/api/status/error', 'messages': [detail]}
def fix_xml_encoding(text_body):
    """
    If the given xml document carries an encoding declaration,
    re-encode the document with that declared encoding.

    This sucks, but it's the only sane way to deal with lxml.html.
    """
    declared = encoding_declaration(text_body)
    if not declared:
        return text_body
    # reencode so lxml can be happy - this totally sucks because we
    # just spent all this time encoding it.
    LOG.notice("content.reencode",
               "Dumb reencoding of blob body as %s" % declared)
    return text_body.encode(declared)
def parse(self, mss):
    """
    Extract the image dimensions by parsing the raw image body with
    PIL, recording the (width, height) tuple on self.size.

    Raises on any PIL import failure (after logging).
    """
    # exif tags from digital cameras?
    self.content.fetch_body(mss)
    try:
        # XXXarielb move to pygmy as soon as pygmy doesn't crash within threads
        from PIL import Image
        img = Image.open(StringIO(self.content.body))
        # believe the image parser over anything in the graph
        self.size = img.size
    except ImportError as ie:
        # BUG FIX: previously logged str(e) — `e` is unbound in this
        # handler (only `ie` is), so the handler itself raised NameError.
        LOG.error("format.image.no_pil", str(ie))
        raise
def guess(self, soup):
    """
    If we ran beautifulsoup, it may have figured out more about the
    document; incorporate that if so.
    """
    inferred = soup.originalEncoding
    # beautifulsoup may have figured out the text encoding
    if self.content.text_encoding is None:
        self.content.text_encoding = inferred
        return
    # for now, go with the explict header given by the
    # server, assuming that some transcoding tool somewhere
    # along the line probably didn't have deep enough knowledge
    # of sgml to change the explicit encoding in the file.
    if self.content.text_encoding != inferred:
        LOG.warn(
            'format.html.guess',
            'html encoding mismatch for content %s: inferred %s'
            % (self.content, inferred))
def __exit__(self, type, value, traceback): """ check the memcache state as compared to a previous state - if the state has changed in a bad way, log it. python-memcached doesn't provide a good way of catching server errors, so the best thing we can do is compare the list of "up" servers before and after access """ if hasattr(self.memcache, 'servers'): now_live_servers = set(server for server in self.memcache.servers if server.deaduntil == 0) down_servers = self.old_state - now_live_servers for server in down_servers: LOG.warn("memcache.dead", "Server just went down", code=str(server))
def raw_get(self):
    """
    Gets the raw (annotated) value from the cache, or None when no
    cache is configured or the fetch fails.
    """
    cache_key = self.get_key()
    self._policy.add_cost('r')
    if not self.cache:
        # no memcache hooked up?
        return None
    with MemcacheChecker(self.cache):
        try:
            return self.cache.get(cache_key)
        except pylibmc.Error as e:
            LOG.error("memcache.error.get", "memcache get failure",
                      error=e, **self._log_kwds())
def set_oauth_attribution_if_needed(mss):
    """
    Ensure the session carries an attribution node tying the authorized
    oauth application to the current user, creating one on demand.

    No-op when the request has no authorized app id.
    """
    app_id = mss.authorized_app_id
    if not app_id:
        return
    user_id = mss.get_user_id()
    read_q = [{
        "id": None,
        "creator": user_id,
        "type": "/freebase/written_by",
        "/freebase/written_by/application": {"id": app_id}
    }]
    attribution = mss.mqlread(read_q, cache=False)
    if attribution:
        if len(attribution) > 1:
            # somehow we manage to get multiple attributions - fail
            # gracefully and log an error
            LOG.warn("set_oauth_attribution_if_needed.duplicate",
                     "duplicate attributions for %s and %s"
                     % (app_id, user_id),
                     application_id=app_id,
                     user_id=user_id,
                     attributions=attribution)
        attribution = attribution[0]
    else:
        write_q = {
            "create": "unconditional",
            "id": None,
            "/freebase/written_by/application": {
                "connect": "insert",
                "id": app_id
            },
            "type": ["/freebase/written_by", "/type/attribution"]
        }
        with mss.push_variables(permission="/boot/oauth_permission",
                                privileged=scope.Privileged,
                                authority=None):
            attribution = mss.mqlwrite(write_q)
    mss.push_variables(
        attribution=attribution['id'] if attribution else None)
def raw_set(self, full_result):
    """
    Sets the raw (already-annotated) value directly in the cache.

    Returns the cache client's set() result, or None when no cache is
    configured or the write fails.
    """
    cache_key = self.get_key()
    if not self.cache:
        # no memcache hooked up?
        return
    self._policy.add_cost('w')
    # NOTE(review): long_lived is produced by the policy but unused
    # here — confirm against the full source.
    expires, _long_lived = self._policy.get_expires()
    with MemcacheChecker(self.cache):
        try:
            return self.cache.set(cache_key, full_result, time=expires)
        except (pylibmc.WriteError, pylibmc._pylibmc.MemcachedError) as e:
            LOG.error("memcache.error.set", "memcache set failure",
                      error=e, **self._log_kwds())
def sanitize_content(content, encoding='utf-8'):
    """
    sanitize any content that could cause a browser to run javascript.
    understands html for now.

    XXX what other media types need to be sanitized?
        multipart/* containing html
        rss or atom that contains html
        any other content-types that might be interpreted by the browser?
    """
    body = content.body
    mt = content.media_type

    if mt == 'text/x-cross-domain-policy':
        raise SanitizationError(
            "Cannot retrieve text/x-cross-domain-policy files for security reasons",
            app_code="/sanitizer/media_type/restricted",
            media_type=mt)

    # XXX better regex possible, but this should catch known valid ones
    if re.match(r'^[-_.a-z0-9]+/[-_.a-z0-9]+$', mt) is None:
        raise SanitizationError("Unhandled media type",
                                app_code='/sanitizer/media_type/unhandled',
                                media_type=mt)

    if mt in ('text/html', 'application/xhtml+xml'):
        return sanitize_html(body, encoding)

    unsafe_media_types = (
        'text/html', 'application/xhtml+xml',
        'application/javascript', 'application/data-javascript',
        'application/ecmascript', 'text/javascript',
        'text/ecmascript', 'text/css',
        'application/atom+xml', 'application/rss+xml')
    if mt.startswith('multipart/') or mt in unsafe_media_types:
        # log warning and move on
        LOG.warn("html.sanitize",
                 "Potentially unsafe content data of type: %s" % mt)
    return body
def set(self, values):
    """
    Sets a value for each key passed into the constructor.

    Acts as an identity function on ``values`` so it can be chained
    (the single-entry ``set`` documents the same contract); the cache
    write itself is best-effort.
    """
    if not values:
        return values
    if not self._policy.should_write_cache() or not self.cache:
        LOG.warn("memcache.skip.write",
                 "Per policy, not writing result to the cache",
                 **self._log_kwds())
        # BUG FIX: previously returned None here, breaking the identity
        # contract honored by every other exit from this method.
        return values
    self._policy.add_cost('w')
    # pair each cache entry with the value destined for its key
    # (builtin zip instead of itertools.izip: identical here, py2/py3 safe)
    cache_mapping = zip(self.cache_entries, values)
    with MemcacheChecker(self.cache):
        try:
            failed = self.cache.set_multi(
                dict((ce.get_key(), ce._policy.annotate_result(result))
                     for ce, result in cache_mapping))
            if failed:
                # this only gets logged by python-memcached
                # implementation
                LOG.error("memcache.set_multi.write",
                          "Failed to write %s results" % failed,
                          keys=failed, **self._log_kwds())
        except pylibmc.WriteError as e:
            LOG.error("memcache.error.set_multi",
                      "memcache set_multi failure",
                      error=e, **self._log_kwds())
    return values
def set(self, result):
    """
    A full, synchronous set-with-policy operation.

    Acts as an identity function on ``result`` so it can be used with
    Twisted deferreds and such; the cache write is best-effort.
    """
    if not self._policy.should_write_cache():
        LOG.warn("memcache.skip.write",
                 "Per policy, not writing result to the cache",
                 **self._log_kwds())
        # BUG FIX: previously returned None here, which broke the
        # documented identity contract on the policy-skip path.
        return result
    full_result = self._policy.annotate_result(result)
    success = self.raw_set(full_result)
    if not success:
        LOG.error("memcache.set.write",
                  "Failed to write %s" % self.get_key(),
                  key=self.get_key(),
                  **self._log_kwds(code=success))
    # acts as an identity function so it can be used with Twisted
    # deferreds and such
    return result
def load_config(self, options):
    """
    Load the configuration file (without attempting to connect to any
    services) and fold command-line options into it.

    Returns the resulting config mapping and stores it on self.config.
    Exits the process on an unknown log level.
    """
    # this loads the configuration file without attempting to connect
    # to any services
    from paste.deploy import appconfig

    config = {}
    if options.config_file is not None:
        LOG.debug("parse.args", "Trying to open %s" % options.config_file)
        try:
            config = appconfig("config:%s" % options.config_file)
        except LookupError as e:
            LOG.debug("parse.args",
                      "Error loading config file, missing paste sections",
                      options.config_file, e)
            # fall through

    # -Dkey=value style defines override anything from the file
    for key, value in (item.split('=', 1) for item in options.defines):
        config[key] = value

    loglevels = 'EMERG ALERT CRIT ERR WARNING NOTICE INFO DEBUG'.split()
    if options.loglevel not in loglevels:
        self.error('unknown log level %s\n valid log levels are %s'
                   % (options.loglevel, ', '.join(loglevels)))
        sys.exit(1)
    LOG.setLevel(logging.getLevelName(options.loglevel))

    # go through the config file for these options, keeps things
    # simple
    if options.graphd_addr:
        config["graphd.address"] = options.graphd_addr
    if options.blobd_addr:
        config["clobd.address"] = options.blobd_addr
        config["clobd.masteraddress"] = options.blobd_addr
    if options.relevance_addr:
        config["relevance.address"] = options.relevance_addr
    if options.geo_addr:
        config["geo.address"] = options.geo_addr
    if options.no_timeouts:
        config["debug.no_timeouts"] = options.no_timeouts and 'true'

    self.config = config
    return config
class ImageContent(ContentWrapper):
    """
    methods for dealing with image content
    """
    # ie6 uses some bizarre content_types for PNG and JPEG images
    # XXX it would be nice to fix the content_type in the
    # /type/content object, but it may already have been uploaded.
    # so for now, images uploaded from ie6 will have the "wrong"
    # content-type and we'll need to garden them.
    remap_dumb_ie_mime_types = {
        'image/pjpeg': contenttype.MediaType('image/jpeg'),
        'image/x-png': contenttype.MediaType('image/png')
    }

    @classmethod
    def match(cls, c):
        """
        true if this ContentWrapper subclass applies to the content
        argument.
        """
        media_type = cls.remap_dumb_ie_mime_types.get(c.media_type,
                                                      c.media_type)
        if not c.media_type.startswith('image/'):
            return False
        subtype = media_type.split('/')[1]
        return subtype in ('gif', 'png', 'jpeg', 'x-icon')

    def __init__(self, content):
        super(ImageContent, self).__init__(content)
        # (width, height) tuple once known; None until load()/parse()
        self.size = None

    def load(self, mss):
        """
        Fill in self.size from the graph's /common/image facet, if any.
        """
        result = mss.mqlread(dict(id=self.content.content_id,
                                  type='/common/image',
                                  size=dict(x=None, y=None)))
        if result is None:
            return
        self.size = (result['size']['x'], result['size']['y'])

    def upload(self, mss):
        """
        add a /common/image facet to the type/content
        """
        self.load(mss)
        if self.size is None:
            self.parse(mss)
        w = {
            'id': self.content.content_id,
            'type': {
                'connect': 'insert',
                'id': '/common/image'
            }}
        if self.size[0] and self.size[1]:
            w['/common/image/size'] = {
                'create': 'unless_exists',
                'type': '/measurement_unit/rect_size',
                'x': self.size[0],
                'y': self.size[1]
            }
        with mss.push_variables(authority="/user/content_administrator",
                                privileged=scope.Authority):
            result = mss.mqlwrite(w)

    def parse(self, mss):
        """
        Extract the image dimensions by parsing the raw image body
        with PIL, recording them on self.size.
        """
        # exif tags from digital cameras?
        self.content.fetch_body(mss)
        try:
            # XXXarielb move to pygmy as soon as pygmy doesn't crash
            # within threads
            from PIL import Image
            img = Image.open(StringIO(self.content.body))
            # believe the image parser over anything in the graph
            self.size = img.size
        except ImportError as ie:
            # BUG FIX: previously logged str(e) — `e` is unbound in
            # this handler (only `ie` is), raising NameError.
            LOG.error("format.image.no_pil", str(ie))
            raise
        except Exception as e:
            LOG.error("format.image.parse", str(e))
            raise ContentLoadError('Invalid image file',
                                   app_code="upload/invalid_image_data",
                                   error=e)
def update_content(self):
    """
    Rewrite the content's media type, mapping ie6's bogus image mime
    types back to the standard ones (no-op for anything else).
    """
    remap = self.remap_dumb_ie_mime_types
    media_type = self.content.media_type
    LOG.info('update_content',
             "Image Updating content from %s to %s"
             % (media_type, remap.get(media_type)))
    self.content.media_type = remap.get(media_type, media_type)
# NOTE(review): the next three statements repeat the body of
# update_content verbatim and sit outside any visible def —
# presumably a chunk-extraction artifact; confirm against the full
# source file before changing anything here.
media_type = self.content.media_type
LOG.info('update_content', "Image Updating content from %s to %s" % (media_type, self.remap_dumb_ie_mime_types.get(media_type)))
self.content.media_type = self.remap_dumb_ie_mime_types.get(media_type, media_type)

@classmethod
def get_fallback_image_path(cls):
    # Return the configured fallback image path for the thumbnailing
    # service, or None (after logging) when it is unset or missing
    # on disk.
    try:
        config = mw.siteconfig.get_config2()
        path = config.get('me.img_thumb_fallback')
        if path and os.path.exists(path):
            return path
    except KeyError, e:
        # config lookup failed — fall through to the error log below
        pass
    LOG.error("image.thumb", "Could not find fallback image for thumbnailing service.")
    return None

# failover for thumnailing operation in the event that
# the image is too large to thumbnail
def thumb_fallback(self, mss):
    # NOTE(review): this definition appears truncated at the end of
    # this chunk — the remainder presumably returns the loaded GIF
    # data with thumb_mt; confirm against the full source.
    path = ImageContent.get_fallback_image_path()
    if path is None:
        return None
    # load data
    fd = open(path)
    data = fd.read()
    fd.close()
    # the fallback image is a known GIF image.
    thumb_mt = 'image/gif'
def get(self):
    """
    Gets all the cache entries - will return a triple of::

        ('hit', 'miss' or 'skip', value, CacheEntry)

    for each cache entry passed in to the CacheEntryList constructor
    """
    if not self._policy.should_read_cache() or not self.cache:
        # no memcache hooked up, or the policy says skip
        LOG.warn("memcache.skip.read", "Skipping cache",
                 **self._log_kwds())
        self._policy.add_cost('s')
        return [('skip', None, ce) for ce in self.cache_entries]

    self._policy.add_cost('r')
    with MemcacheChecker(self.cache):
        try:
            memcache_result = self.cache.get_multi(
                [ce.get_key() for ce in self.cache_entries])
        except pylibmc.Error as e:
            memcache_result = {}
            LOG.error("memcache.error.get_multi",
                      "memcache get_multi failure",
                      error=e, **self._log_kwds())
    assert isinstance(memcache_result, dict)

    # create an entry in the result for each cache entry
    result = []
    for ce in self.cache_entries:
        key = ce.get_key()
        cached = memcache_result.get(key)
        if ce.check_result(key, cached):
            result.append(('hit', ce._policy.extract_result(cached), ce))
        else:
            result.append(('miss', None, ce))

    misses = [entry for entry in result if entry[0] == 'miss']
    hits = [entry for entry in result if entry[0] == 'hit']
    miss_keys = [ce.key_obj for (_s, _v, ce) in misses]
    miss_hashes = [ce.get_key() for (_s, _v, ce) in misses]
    hit_keys = [ce.key_obj for (_s, _v, ce) in hits]
    hit_hashes = [ce.get_key() for (_s, _v, ce) in hits]

    if miss_keys:
        code = "hits+misses" if hit_keys else "all miss"
    else:
        code = "all hit" if hit_keys else "empty"
    LOG.notice("%s.cache.multiresult" % self._policy.tag, "",
               miss_hashes=miss_hashes, hit_hashes=hit_hashes,
               miss_count=len(misses), hit_count=len(hits),
               **self._log_kwds(code=code))
    return result