Beispiel #1
0
    def get(self):
        """
        Synchronously read this entry from the cache, applying the
        policy's read and staleness rules.

        The lookup is made of three separately callable stages
        (``raw_get``, ``check_result`` and the policy's
        ``extract_result``) so that async cache APIs can drive each
        stage on its own.

        Returns a ``(status, value)`` pair:

        * ``('hit', value)`` when ``check_result`` accepts what was
          fetched.
        * ``('miss', value)`` when the fetched entry was rejected but
          the policy allows stale data and something was fetched.
        * ``('miss', None)`` in every other case, including when the
          policy says the cache must not be read.
        """
        policy = self._policy

        if not policy.should_read_cache():
            # Skipping the read is routine for clients, so it is
            # deliberately not logged here; only the cost is recorded.
            policy.add_cost('s')
            return ('miss', None)

        fetched = self.raw_get()
        cache_key = self.get_key()

        if self.check_result(cache_key, fetched):
            return ('hit', policy.extract_result(fetched))

        # This is the only consumer of the policy's tag; it might be
        # better to log from the policy object itself.
        LOG.notice("%s.cache.result" % policy.tag,
                   "",
                   key=cache_key,
                   keyobj=self.key_obj,
                   **self._log_kwds(code="miss"))

        stale_value = None
        if policy.allow_stale() and fetched:
            stale_value = policy.extract_result(fetched)
        return ('miss', stale_value)
Beispiel #2
0
def _safe_entrypoint(f, *args, **kwds):
    """
    Call ``f(*args, **kwds)`` and return its result, swallowing
    memcache client errors.

    Note that this is a call-through wrapper rather than a decorator:
    ``f`` is invoked immediately.  A ``pylibmc.Error`` raised by the
    call is logged under ``memcache.<function name>.error`` and
    ``None`` is returned in its place; any other exception propagates
    to the caller.
    """
    try:
        outcome = f(*args, **kwds)
    except pylibmc.Error as err:
        LOG.error("memcache.%s.error" % f.__name__,
                  "Error in memcache call",
                  error=err)
        return None
    return outcome
Beispiel #3
0
    def envelope_error(self, message, **kwds):
        """
        Build the error envelope for a request envelope that could not
        be parsed.

        ``message`` is the human-readable description; any extra
        keyword arguments are carried along as the error's ``info``.
        The detail record is logged as a warning, an ``mql-error`` hint
        is registered through ``self.mss.add_hint`` and the enclosing
        ``/api/status/error`` envelope is returned.
        """
        # Keep this shape in step with the MQLError definition inside
        # mw/mql/error.py
        detail = {
            'code': '/api/status/error/envelope/parse',
            'info': kwds,
            'message': message,
        }
        LOG.warning("envelope.error", repr(detail))

        self.mss.add_hint('mql-error')

        envelope = {'code': '/api/status/error', 'messages': [detail]}
        return envelope
Beispiel #4
0
def fix_xml_encoding(text_body):
    """
    Re-encode an xml document according to its own encoding
    declaration, if it has one.

    When ``encoding_declaration`` finds a declaration in ``text_body``
    the body is encoded with that encoding and returned; otherwise the
    body is handed back untouched.  It is unfortunate, but it is the
    only sane way to keep lxml.html happy.
    """
    declared = encoding_declaration(text_body)
    if not declared:
        return text_body

    # Wasteful, since the body has only just been through an encoding
    # step, but lxml insists on it.
    LOG.notice("content.reencode",
               "Dumb reencoding of blob body as %s" % declared)
    return text_body.encode(declared)
Beispiel #5
0
    def parse(self, mss):
        """
        Extract data from the image body.

        Fetches the content body through ``mss`` and opens it with PIL
        so the pixel dimensions can be stored in ``self.size``.

        Raises ImportError, after logging it, when PIL cannot be
        imported.
        """
        # exif tags from digital cameras?

        self.content.fetch_body(mss)
        try:
            # XXXarielb move to pygmy as soon as pygmy doesn't crash within threads
            from PIL import Image
            img = Image.open(StringIO(self.content.body))
            # believe the image parser over anything in the graph
            self.size = img.size
        except ImportError as ie:
            # Log the exception that was actually caught; referring to
            # any other name here would raise NameError and hide the
            # real import failure.
            LOG.error("format.image.no_pil", str(ie))
            raise
Beispiel #6
0
 def guess(self, soup):
     """
     Fold anything beautifulsoup learned about the document back into
     the content object.

     Only the text encoding is considered.  When the content has no
     encoding yet, the one soup detected is adopted.  When it already
     has one, that value is kept and a disagreement with soup is only
     logged as a warning.
     """
     declared = self.content.text_encoding
     if declared is None:
         # nothing known yet: take whatever beautifulsoup worked out
         self.content.text_encoding = soup.originalEncoding
         return

     if declared == soup.originalEncoding:
         return

     # For now trust the explicit header given by the server, on the
     # assumption that a transcoding tool somewhere along the line did
     # not know enough sgml to update the encoding inside the file.
     LOG.warn(
         'format.html.guess',
         'html encoding mismatch for content %s: inferred %s' %
         (self.content, soup.originalEncoding))
Beispiel #7
0
    def __exit__(self, type, value, traceback):
        """
        Compare the memcache server state with the snapshot held in
        ``self.old_state`` and log every server that has dropped out.

        python-memcached offers no good hook for catching server
        errors, so the best available signal is the difference between
        the set of "up" servers before and after the guarded access.
        Clients without a ``servers`` attribute are left unchecked.

        Nothing is returned, so an exception raised inside the ``with``
        body is not suppressed.
        """
        client = self.memcache
        if not hasattr(client, 'servers'):
            return

        # a server counts as live while its deaduntil marker is zero
        still_up = set(candidate for candidate in client.servers
                       if candidate.deaduntil == 0)

        for lost in self.old_state - still_up:
            LOG.warn("memcache.dead",
                     "Server just went down",
                     code=str(lost))
Beispiel #8
0
    def raw_get(self):
        """
        Fetch the raw stored value for this entry's key.

        A read cost ('r') is recorded on the policy even when no cache
        is attached.  Returns whatever the cache client returns for the
        key, or ``None`` when there is no cache or when the client
        raises ``pylibmc.Error`` (which is logged, not propagated).
        """
        cache_key = self.get_key()
        self._policy.add_cost('r')

        # only talk to memcache when one is actually hooked up
        if self.cache:
            with MemcacheChecker(self.cache):
                try:
                    return self.cache.get(cache_key)
                except pylibmc.Error as err:
                    LOG.error("memcache.error.get",
                              "memcache get failure",
                              error=err,
                              **self._log_kwds())
        return None
Beispiel #9
0
def set_oauth_attribution_if_needed(mss):
    """
    Make sure writes made on behalf of an authorized application carry
    a ``/freebase/written_by`` attribution.

    Does nothing when ``mss.authorized_app_id`` is not set.  Otherwise
    the attribution linking the current user to the application is
    looked up (bypassing the cache); if none exists one is created
    with elevated permissions.  The attribution id, or ``None`` when
    the write produced nothing, is then pushed onto ``mss`` as the
    ``attribution`` variable.
    """
    app_id = mss.authorized_app_id
    if not app_id:
        return

    user_id = mss.get_user_id()

    lookup = [{
        "id": None,
        "creator": user_id,
        "type": "/freebase/written_by",
        "/freebase/written_by/application": {
            "id": app_id
        }
    }]
    existing = mss.mqlread(lookup, cache=False)

    if existing:
        if len(existing) > 1:
            # Several attributions exist for the same pair; degrade
            # gracefully by logging and using the first one.
            LOG.warn("set_oauth_attribution_if_needed.duplicate",
                     "duplicate attributions for %s and %s" %
                     (app_id, user_id),
                     application_id=app_id,
                     user_id=user_id,
                     attributions=existing)
        attribution = existing[0]
    else:
        creation = {
            "create": "unconditional",
            "id": None,
            "/freebase/written_by/application": {
                "connect": "insert",
                "id": app_id
            },
            "type": ["/freebase/written_by", "/type/attribution"]
        }
        with mss.push_variables(permission="/boot/oauth_permission",
                                privileged=scope.Privileged,
                                authority=None):
            attribution = mss.mqlwrite(creation)

    attribution_id = attribution['id'] if attribution else None
    # NOTE(review): unlike the call above, this push is not used as a
    # context manager, so the variable presumably stays in effect for
    # the rest of the session - confirm that is intended.
    mss.push_variables(attribution=attribution_id)
Beispiel #10
0
    def raw_set(self, full_result):
        """
        Store ``full_result`` in the cache under this entry's key.

        Nothing happens (and no cost is recorded) when no cache is
        attached.  Otherwise a write cost ('w') is recorded and the
        value is stored with the expiry supplied by the policy.

        Returns the cache client's result for the write, or ``None``
        when there is no cache or the write raised one of the handled
        pylibmc errors (which is logged, not propagated).
        """
        cache_key = self.get_key()

        if self.cache:
            self._policy.add_cost('w')
            # the policy also reports a long-lived flag, unused here
            ttl, _long_lived = self._policy.get_expires()

            with MemcacheChecker(self.cache):
                try:
                    return self.cache.set(cache_key, full_result, time=ttl)
                except (pylibmc.WriteError,
                        pylibmc._pylibmc.MemcachedError) as err:
                    LOG.error("memcache.error.set",
                              "memcache set failure",
                              error=err,
                              **self._log_kwds())
        return None
Beispiel #11
0
def sanitize_content(content, encoding='utf-8'):
    """
    Sanitize any content that could cause a browser to run javascript.

    ``content`` must expose ``body`` and ``media_type``; ``encoding``
    is handed to the html sanitizer.  Only html is understood for now.

    Returns the (possibly sanitized) body.  Raises SanitizationError
    for ``text/x-cross-domain-policy`` content and for any media type
    that does not look like a plain ``type/subtype`` pair.

    XXX what other media types need to be sanitized?
        multipart/* containing html
        rss or atom that contains html
        any other content-types that might be interpreted by the browser?
    """
    data = content.body
    mt = content.media_type

    if mt == 'text/x-cross-domain-policy':
        raise SanitizationError(
            "Cannot retrieve text/x-cross-domain-policy files for security reasons",
            app_code="/sanitizer/media_type/restricted",
            media_type=mt)

    # XXX better regex possible, but this should catch known valid ones
    #
    # The pattern is anchored with \Z rather than $: "$" also matches
    # just before a trailing newline, which would let a value such as
    # 'text/html\n' pass validation and then slip past the exact-match
    # html check below without being sanitized.
    #
    # NOTE(review): '+' is not accepted, so 'application/xhtml+xml',
    # 'application/atom+xml' and 'application/rss+xml' are rejected
    # here and never reach the branches below.  Widening the pattern
    # would also admit types such as image/svg+xml, so it is left as
    # is - confirm the intended behavior.
    mt_re = re.compile(r'^[-_.a-z0-9]+/[-_.a-z0-9]+\Z')
    if mt_re.match(mt) is None:
        raise SanitizationError("Unhandled media type",
                                app_code='/sanitizer/media_type/unhandled',
                                media_type=mt)

    if mt in ['text/html', 'application/xhtml+xml']:
        data = sanitize_html(data, encoding)
    else:
        unsafe_media_types = [
            'text/html', 'application/xhtml+xml', 'application/javascript',
            'application/data-javascript', 'application/ecmascript',
            'text/javascript', 'text/ecmascript', 'text/css',
            'application/atom+xml', 'application/rss+xml'
        ]

        if mt.startswith('multipart/') or mt in unsafe_media_types:
            # log warning and move on
            LOG.warn("html.sanitize",
                     "Potentially unsafe content data of type: %s" % mt)

    return data
Beispiel #12
0
    def set(self, values):
        """
        Write one value for each cache entry given to the constructor.

        ``values`` is paired positionally with ``self.cache_entries``;
        each value is annotated by its entry's policy and the whole
        batch is written with a single ``set_multi`` call.  Write
        failures are logged rather than raised.

        Returns ``values`` unchanged when it is empty or after a write
        attempt, and ``None`` when the write is skipped because of the
        policy or a missing cache.
        """
        if not values:
            return values

        writable = self._policy.should_write_cache() and self.cache
        if not writable:
            LOG.warn("memcache.skip.write",
                     "Per policy, not writing result to the cache",
                     **self._log_kwds())
            # NOTE(review): returns None here while the other paths
            # return ``values`` - confirm callers expect that.
            return None

        self._policy.add_cost('w')
        pairs = izip(self.cache_entries, values)
        with MemcacheChecker(self.cache):
            try:
                failed = self.cache.set_multi(
                    dict((entry.get_key(),
                          entry._policy.annotate_result(value))
                         for entry, value in pairs))
                if failed:
                    # only the python-memcached implementation reports
                    # the keys it could not write
                    LOG.error("memcache.set_multi.write",
                              "Failed to write %s results" % failed,
                              keys=failed,
                              **self._log_kwds())
            except pylibmc.WriteError as err:
                LOG.error("memcache.error.set_multi",
                          "memcache set_multi failure",
                          error=err,
                          **self._log_kwds())

        return values
Beispiel #13
0
    def set(self, result):
        """
        Synchronously store ``result`` according to the cache policy.

        When the policy allows writing, the result is annotated by the
        policy and handed to ``raw_set``; a falsy outcome from
        ``raw_set`` is logged as an error.  ``result`` is then returned
        unchanged, so the method can act as an identity step in a
        Twisted deferred chain and the like.

        When the policy forbids writing, a warning is logged and
        ``None`` is returned.
        """
        if self._policy.should_write_cache():
            annotated = self._policy.annotate_result(result)
            stored = self.raw_set(annotated)
            if not stored:
                LOG.error("memcache.set.write",
                          "Failed to write %s" % self.get_key(),
                          key=self.get_key(),
                          **self._log_kwds(code=stored))
            return result

        LOG.warn("memcache.skip.write",
                 "Per policy, not writing result to the cache",
                 **self._log_kwds())
        # NOTE(review): this path does not hand ``result`` back, unlike
        # the identity behavior above - confirm callers expect None.
        return None
Beispiel #14
0
    def load_config(self, options):
        """
        Assemble the configuration from parsed command-line options
        without attempting to connect to any services.

        The paste config file (if one was named) is loaded first, then
        every ``key=value`` entry in ``options.defines`` is applied,
        the log level is set, and finally the individual address and
        timeout options override their config keys.

        The result is stored on ``self.config`` and returned.  An
        unknown log level is reported through ``self.error`` and the
        process exits with status 1.
        """
        from paste.deploy import appconfig

        config = {}
        config_file = options.config_file
        if config_file is not None:
            LOG.debug("parse.args", "Trying to open %s" % config_file)
            try:
                config = appconfig("config:%s" % config_file)
            except LookupError as e:
                # carry on with the empty config
                LOG.debug("parse.args",
                          "Error loading config file, missing paste sections",
                          config_file, e)

        for define in options.defines:
            name, value = define.split('=', 1)
            config[name] = value

        valid_levels = ['EMERG', 'ALERT', 'CRIT', 'ERR',
                        'WARNING', 'NOTICE', 'INFO', 'DEBUG']
        if options.loglevel not in valid_levels:
            self.error('unknown log level %s\n  valid log levels are %s' %
                       (options.loglevel, ', '.join(valid_levels)))
            sys.exit(1)
        LOG.setLevel(logging.getLevelName(options.loglevel))

        # Route these options through the config dictionary as well;
        # it keeps things simple.  Note the blobd address feeds both
        # "clobd" keys.
        address_overrides = (
            (options.graphd_addr, ("graphd.address",)),
            (options.blobd_addr, ("clobd.address", "clobd.masteraddress")),
            (options.relevance_addr, ("relevance.address",)),
            (options.geo_addr, ("geo.address",)),
        )
        for address, config_keys in address_overrides:
            if address:
                for config_key in config_keys:
                    config[config_key] = address

        if options.no_timeouts:
            config["debug.no_timeouts"] = 'true'

        self.config = config
        return config
Beispiel #15
0
class ImageContent(ContentWrapper):
    """
    methods for dealing with image content
    """

    # ie6 uses some bizarre content_types for PNG and JPEG images
    # XXX it would be nice to fix the content_type in the
    #  /type/content object, but it may already have been uploaded.
    # so for now, images uploaded from ie6 will have the "wrong"
    #  content-type and we'll need to garden them.
    remap_dumb_ie_mime_types = {
        'image/pjpeg': contenttype.MediaType('image/jpeg'),
        'image/x-png': contenttype.MediaType('image/png')
    }

    @classmethod
    def match(cls, c):
        """
        true if this ContentWrapper subclass applies to the content argument.

        The ie6-specific media types are mapped to their standard
        equivalents before the subtype is checked against the
        supported image formats.
        """
        media_type = cls.remap_dumb_ie_mime_types.get(c.media_type, c.media_type)
        if not c.media_type.startswith('image/'):
            return False

        subtype = media_type.split('/')[1]

        return subtype in ('gif', 'png', 'jpeg', 'x-icon')

    def __init__(self, content):
        super(ImageContent, self).__init__(content)
        # (x, y) pixel dimensions; filled in by load() or parse()
        self.size = None

    def load(self, mss):
        """
        Read the image size recorded in the graph, if there is one.

        Leaves ``self.size`` untouched when the query returns nothing.
        """
        result = mss.mqlread(dict(id=self.content.content_id,
                             type='/common/image',
                             size=dict(x=None, y=None)))

        if result is None:
            return

        self.size = (result['size']['x'], result['size']['y'])

    def upload(self, mss):
        """
        add a /common/image facet to the type/content

        The size is taken from the graph when available and otherwise
        parsed out of the image body; it is only written when both
        dimensions are truthy.
        """
        self.load(mss)
        if self.size is None:
            self.parse(mss)

        w = { 'id': self.content.content_id,
              'type': { 'connect': 'insert',
                        'id': '/common/image' }}
        if self.size[0] and self.size[1]:
            w['/common/image/size'] = { 'create': 'unless_exists',
                                        'type': '/measurement_unit/rect_size',
                                        'x': self.size[0],
                                        'y': self.size[1] }

        with mss.push_variables(authority="/user/content_administrator",
                                privileged=scope.Authority):
            mss.mqlwrite(w)

    def parse(self, mss):
        """
        extract data from the image

        Fetches the content body through ``mss`` and opens it with PIL
        so the pixel dimensions can be stored in ``self.size``.

        Raises ImportError (after logging it) when PIL cannot be
        imported, and ContentLoadError when the body cannot be read as
        an image.
        """
        # exif tags from digital cameras?

        self.content.fetch_body(mss)
        try:
            # XXXarielb move to pygmy as soon as pygmy doesn't crash within threads
            from PIL import Image
            img = Image.open(StringIO(self.content.body))
            # believe the image parser over anything in the graph
            self.size = img.size
        except ImportError as ie:
            # Log the exception that was actually caught; referring to
            # any other name here would raise NameError and hide the
            # real import failure.
            LOG.error("format.image.no_pil", str(ie))
            raise
        except Exception as e:
            LOG.error("format.image.parse", str(e))
            raise ContentLoadError('Invalid image file',
                                   app_code="upload/invalid_image_data",
                                   error=e)
Beispiel #16
0
 def update_content(self):
     """
     Replace an ie6-specific media type on the content with its
     standard equivalent.

     The change is logged first (the logged target is None when the
     current type has no mapping); unmapped types are left as they
     are.
     """
     remap = self.remap_dumb_ie_mime_types
     current = self.content.media_type
     LOG.info('update_content', "Image Updating content from %s to %s" % (current,
                                                   remap.get(current)))
     self.content.media_type = remap.get(current, current)
Beispiel #17
0
        media_type = self.content.media_type
        LOG.info('update_content', "Image Updating content from %s to %s" % (media_type,
                                                      self.remap_dumb_ie_mime_types.get(media_type)))
        self.content.media_type = self.remap_dumb_ie_mime_types.get(media_type, media_type)
  
    @classmethod
    def get_fallback_image_path(cls):
        """
        Return the path of the fallback thumbnail image.

        The path is read from the ``me.img_thumb_fallback`` site config
        entry and is only returned when the file exists.  When the
        entry is missing, empty or points to a missing file, an error
        is logged and None is returned.
        """
        try:
            config = mw.siteconfig.get_config2()
            path = config.get('me.img_thumb_fallback')
            if path and os.path.exists(path):
                return path
        except KeyError:
            # a missing config key is treated like a missing file and
            # reported below
            pass

        LOG.error("image.thumb", "Could not find fallback image for thumbnailing service.")
        return None


    # failover for the thumbnailing operation in the event that
    # the image is too large to thumbnail
    def thumb_fallback(self, mss):
        """
        Load the configured fallback image for use when the real image
        cannot be thumbnailed.

        Returns None when get_fallback_image_path() finds no usable
        fallback file.
        """
        path = ImageContent.get_fallback_image_path()
        if path is None:
            return None
        # load data
        # NOTE(review): the file is opened in text mode and closed by
        # hand, so it stays open if read() raises - consider
        # ``with open(path, 'rb')``.
        fd = open(path)
        data = fd.read()
        fd.close()
        # the fallback image is a known GIF image.
        thumb_mt = 'image/gif'
Beispiel #18
0
    def get(self):
        """
        Look up every cache entry in one round trip.

        Returns a list holding one triple per entry passed to the
        CacheEntryList constructor, in the same order::

            (status, value, CacheEntry)

        where ``status`` is 'hit', 'miss' or 'skip'.  ``value`` is the
        extracted result for a hit and ``None`` otherwise.  Every entry
        is reported as 'skip' when the policy forbids reading or no
        cache is hooked up; a failing ``get_multi`` is logged and
        treated as if nothing had been found.
        """
        if not (self._policy.should_read_cache() and self.cache):
            LOG.warn("memcache.skip.read", "Skipping cache",
                     **self._log_kwds())
            self._policy.add_cost('s')
            return [('skip', None, entry) for entry in self.cache_entries]

        self._policy.add_cost('r')

        with MemcacheChecker(self.cache):
            try:
                fetched = self.cache.get_multi(
                    [entry.get_key() for entry in self.cache_entries])
            except pylibmc.Error as err:
                fetched = {}
                LOG.error("memcache.error.get_multi",
                          "memcache get_multi failure",
                          error=err,
                          **self._log_kwds())

        assert isinstance(fetched, dict)

        # classify every entry, remembering which side it landed on
        result = []
        hit_entries = []
        miss_entries = []
        for entry in self.cache_entries:
            key = entry.get_key()
            stored = fetched.get(key)
            if entry.check_result(key, stored):
                result.append(
                    ('hit', entry._policy.extract_result(stored), entry))
                hit_entries.append(entry)
            else:
                result.append(('miss', None, entry))
                miss_entries.append(entry)

        miss_hashes = [entry.get_key() for entry in miss_entries]
        hit_hashes = [entry.get_key() for entry in hit_entries]

        # summary code, keyed on (any misses?, any hits?)
        code = {
            (True, True): "hits+misses",
            (True, False): "all miss",
            (False, True): "all hit",
            (False, False): "empty",
        }[(bool(miss_entries), bool(hit_entries))]

        LOG.notice("%s.cache.multiresult" % self._policy.tag,
                   "",
                   miss_hashes=miss_hashes,
                   hit_hashes=hit_hashes,
                   miss_count=len(miss_entries),
                   hit_count=len(hit_entries),
                   **self._log_kwds(code=code))
        return result