Example #1
  def get(self):
    namespace = self.request.get('namespace', 'default-gzip')
    digest = self.request.get('digest', '')
    content = None

    if digest and namespace:
      try:
        raw_data, entity = model.get_content(namespace, digest)
      except ValueError:
        self.abort(400, 'Invalid key')
      except LookupError:
        self.abort(404, 'Unable to retrieve the entry')

      if not raw_data:
        stream = gcs.read_file(config.settings().gs_bucket, entity.key.id())
      else:
        stream = [raw_data]
      content = ''.join(model.expand_content(namespace, stream))

      self.response.headers['X-Frame-Options'] = 'SAMEORIGIN'
      # Delete Content-Type before setting it to avoid ending up with two
      # (yes, two) Content-Type headers.
      del self.response.headers['Content-Type']
      # Apparently, setting the content type to text/plain encourages the
      # browser (Chrome, at least) to sniff the mime type and display
      # things like images.  Images are autowrapped in <img> and text is
      # wrapped in <pre>.
      self.response.headers['Content-Type'] = 'text/plain; charset=utf-8'
      self.response.headers['Content-Disposition'] = str('filename=%s' % digest)
      if content.startswith('{'):
        # Try to format as JSON.
        try:
          content = json.dumps(
              json.loads(content), sort_keys=True, indent=2,
              separators=(',', ': '))
          # If we don't wrap this in html, browsers will put content in a pre
          # tag which is also styled with monospace/pre-wrap.  We can't use
          # anchor tags in <pre>, so we force it to be a <div>, which happily
          # accepts links.
          content = (
            '<div style="font-family:monospace;white-space:pre-wrap;">%s</div>'
            % content)
          # Linkify things that look like hashes.
          content = re.sub(
            r'([0-9a-f]{40})',
            r'<a target="_blank" href="/browse?namespace=%s' % namespace +
              r'&digest=\1">\1</a>',
            content)
          self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
        except ValueError:
          pass

    self.response.write(content)
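
The linkify step above is a plain re.sub over the pretty-printed JSON. Below is a minimal standalone sketch of that substitution; the namespace value and the 40-hex-digit digest are made up purely for illustration.

import re

namespace = 'default-gzip'
# Hypothetical pretty-printed JSON containing one 40-hex-char digest.
content = '{\n  "isolated": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"\n}'

# Same substitution as in the handler: wrap anything that looks like a SHA-1
# digest in a link back to the /browse endpoint.
content = re.sub(
    r'([0-9a-f]{40})',
    r'<a target="_blank" href="/browse?namespace=%s' % namespace +
      r'&digest=\1">\1</a>',
    content)
print(content)
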
Example #2
def hash_content(content, namespace):
    """Decompresses and hashes given |content|.

    Returns tuple (hex digest, expanded size).

    Raises ValueError in case of errors.
    """
    expanded_size = 0
    digest = hashlib.sha1()
    try:
        for data in model.expand_content(namespace, [content]):
            expanded_size += len(data)
            digest.update(data)
            # Make sure the data is GC'ed.
            del data
        return digest.hexdigest(), expanded_size
    except zlib.error as e:
        raise ValueError('Data is corrupted: %s' % e)
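
For reference, here is a self-contained analogue of hash_content() that does not depend on the model module. It assumes the namespace compresses with raw zlib (as 'default-gzip' does in these examples) and hashes with SHA-1, streaming through a decompressobj much like model.expand_content() appears to; the hash_compressed name is invented for this sketch.

import hashlib
import zlib


def hash_compressed(compressed, chunk_size=64 * 1024):
  """Decompresses zlib data; returns (hex digest, expanded size)."""
  decompressor = zlib.decompressobj()
  digest = hashlib.sha1()
  expanded_size = 0
  for offset in range(0, len(compressed), chunk_size):
    data = decompressor.decompress(compressed[offset:offset + chunk_size])
    expanded_size += len(data)
    digest.update(data)
  data = decompressor.flush()
  expanded_size += len(data)
  digest.update(data)
  return digest.hexdigest(), expanded_size


# Round-trips to the SHA-1 of the original payload.
payload = b'x' * 1000000
assert hash_compressed(zlib.compress(payload)) == (
    hashlib.sha1(payload).hexdigest(), len(payload))
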
Example #3
def hash_content(content, namespace):
  """Decompresses and hashes given |content|.

  Returns tuple (hex digest, expanded size).

  Raises ValueError in case of errors.
  """
  expanded_size = 0
  digest = model.get_hash_algo(namespace)
  try:
    for data in model.expand_content(namespace, [content]):
      expanded_size += len(data)
      digest.update(data)
      # Make sure the data is GC'ed.
      del data
    return digest.hexdigest(), expanded_size
  except zlib.error as e:
    raise ValueError('Data is corrupted: %s' % e)
Example #4
  def post(self, namespace, hash_key):
    entry = model.entry_key(namespace, hash_key).get()
    if not entry:
      logging.error('Failed to find entity')
      return
    if entry.is_verified:
      logging.warning('Was already verified')
      return
    if entry.content is not None:
      logging.error('Should not be called with inline content')
      return

    # Get GS file size.
    gs_bucket = config.settings().gs_bucket
    gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

    # It's None if the file is missing.
    if not gs_file_info:
      # According to the docs, GS is read-after-write consistent, so a file is
      # missing only if it was never stored or it was deleted; in either case
      # it's not a valid ContentEntry.
      self.purge_entry(entry, 'No such GS file')
      return

    # Expected stored length and actual length should match.
    if gs_file_info.size != entry.compressed_size:
      self.purge_entry(entry,
          'Bad GS file: expected size is %d, actual size is %d',
          entry.compressed_size, gs_file_info.size)
      return

    save_to_memcache = (
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED and
        entry.is_isolated)
    expanded_size = 0
    digest = model.get_hash_algo(namespace)
    data = None

    try:
      # Read the file from GCS in blocks.
      stream = gcs.read_file(gs_bucket, entry.key.id())
      if save_to_memcache:
        # Wrap the stream with a generator that accumulates the data.
        stream = Accumulator(stream)

      for data in model.expand_content(namespace, stream):
        expanded_size += len(data)
        digest.update(data)
        # Make sure the data is GC'ed.
        del data

      # Hashes should match.
      if digest.hexdigest() != hash_key:
        self.purge_entry(entry,
            'SHA-1 does not match data (%d bytes, %d bytes expanded)',
            entry.compressed_size, expanded_size)
        return

    except gcs.NotFoundError:
      # Somebody deleted the file between get_file_info and read_file calls.
      self.purge_entry(entry, 'File was unexpectedly deleted')
      return
    except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
      # Misconfiguration in Google Storage ACLs. Don't delete the entry; it may
      # be fine. Maybe the ACL problems will be fixed before the next retry.
      logging.warning(
          'CloudStorage auth issues (%s): %s', e.__class__.__name__, e)
      # Abort so the job is retried automatically.
      return self.abort(500)
    except (gcs.FatalError, zlib.error, IOError) as e:
      # ForbiddenError and AuthorizationError inherit from FatalError, so this
      # except block must come after them.
      # The file is broken or unreadable.
      self.purge_entry(entry,
          'Failed to read the file (%s): %s', e.__class__.__name__, e)
      return

    # Verified. Data matches the hash.
    entry.expanded_size = expanded_size
    entry.is_verified = True
    future = entry.put_async()
    logging.info(
        '%d bytes (%d bytes expanded) verified',
        entry.compressed_size, expanded_size)
    if save_to_memcache:
      model.save_in_memcache(namespace, hash_key, ''.join(stream.accumulated))
    future.wait()
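
Examples #4 and #6 iterate over an Accumulator wrapper whose definition is not shown. Below is a minimal sketch of what such a wrapper might look like, assuming it only needs to be iterable and to retain the raw (still compressed) chunks so they can later be joined and written to memcache; this is an assumption for illustration, not the server's actual implementation.

class Accumulator(object):
  """Wraps a chunk iterator and remembers every chunk it yields."""

  def __init__(self, source):
    self.accumulated = []
    self._source = source

  def __iter__(self):
    for chunk in self._source:
      self.accumulated.append(chunk)
      yield chunk
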
Example #5
    def get(self):
        namespace = self.request.get('namespace', 'default-gzip')
        digest = self.request.get('digest', '')
        content = None
        if not digest:
            self.abort(400, 'Missing digest')
        if not namespace:
            self.abort(400, 'Missing namespace')

        try:
            raw_data, entity = model.get_content(namespace, digest)
        except ValueError:
            self.abort(400, 'Invalid key')
        except LookupError:
            self.abort(404, 'Unable to retrieve the entry')

        logging.info('%s', entity)
        if not raw_data:
            try:
                stream = gcs.read_file(config.settings().gs_bucket,
                                       entity.key.id())
                content = ''.join(model.expand_content(namespace, stream))
            except cloudstorage.NotFoundError:
                logging.error(
                    'Entity in DB but not in GCS: deleting entity in DB')
                entity.key.delete()
                self.abort(404, 'Unable to retrieve the file from GCS')
        else:
            content = ''.join(model.expand_content(namespace, [raw_data]))

        self.response.headers['X-Frame-Options'] = 'SAMEORIGIN'
        # Delete Content-Type before setting it to avoid ending up with two
        # (yes, two) Content-Type headers.
        del self.response.headers['Content-Type']

        # Apparently, setting the content type to text/plain encourages the
        # browser (Chrome, at least) to sniff the mime type and display
        # things like images.  Images are autowrapped in <img> and text is
        # wrapped in <pre>.
        self.response.headers['Content-Type'] = 'text/plain; charset=utf-8'

        # App Engine limits a response to 33554432 bytes, including headers.
        # Headers are ~150 bytes. If the content plus headers might exceed
        # that limit, we give the user a command-line workaround to download
        # their file.
        if len(content) > 33554000:
            host = modules.get_hostname(module='default', version='default')
            # host is something like default.default.myisolateserver.appspot.com
            host = host.replace('default.default.', '')
            sizeInMib = len(content) / (1024.0 * 1024.0)
            content = (
                'Sorry, your file is %1.1f MiB big, which exceeds the 32 MiB'
                ' App Engine limit.\nTo work around this, run the following command:\n'
                '    python isolateserver.py download -I %s --namespace %s -f %s %s'
                % (sizeInMib, host, namespace, digest, digest))
        else:
            self.response.headers['Content-Disposition'] = str(
                'filename=%s' % (self.request.get('as') or digest))
            try:
                json_data = json.loads(content)
                if self._is_isolated_format(json_data):
                    self.response.headers[
                        'Content-Type'] = 'text/html; charset=utf-8'
                    json_data['files'] = collections.OrderedDict(
                        sorted(json_data['files'].items(),
                               key=lambda (filepath, data): filepath))
                    params = {
                        'namespace': namespace,
                        'isolated': json_data,
                    }
                    content = template.render('isolate/isolated.html', params)
            except ValueError:
                pass

        self.response.write(content)
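
One detail worth calling out in Example #5: the Content-Disposition fallback needs the parentheses shown above, because % binds tighter than or. Without them an empty 'as' parameter would yield the truthy string 'filename=' and the digest would never be used. A quick standalone illustration (the as_param name and digest value are made up for the demo):

digest = 'a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'
as_param = ''  # what self.request.get('as') returns when 'as' is absent

# '%' binds tighter than 'or'; 'filename=' is truthy, so digest is skipped.
print('filename=%s' % as_param or digest)    # prints 'filename='
# Grouping the fallback first gives the intended behavior.
print('filename=%s' % (as_param or digest))  # prints 'filename=a94a8fe5...'
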
Example #6
    def post(self, namespace, hash_key):
        original_request = self.request.get('req')
        entry = model.get_entry_key(namespace, hash_key).get()
        if not entry:
            logging.error('Failed to find entity\n%s', original_request)
            return
        if entry.is_verified:
            logging.warning('Was already verified\n%s', original_request)
            return
        if entry.content is not None:
            logging.error('Should not be called with inline content\n%s',
                          original_request)
            return

        # Get GS file size.
        gs_bucket = config.settings().gs_bucket
        gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

        # It's None if the file is missing.
        if not gs_file_info:
            # According to the docs, GS is read-after-write consistent, so a
            # file is missing only if it was never stored or it was deleted;
            # in either case it's not a valid ContentEntry.
            self.purge_entry(entry, 'No such GS file\n%s', original_request)
            return

        # Expected stored length and actual length should match.
        if gs_file_info.size != entry.compressed_size:
            self.purge_entry(
                entry,
                'Bad GS file: expected size is %d, actual size is %d\n%s',
                entry.compressed_size, gs_file_info.size, original_request)
            return

        save_to_memcache = (
            entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED
            and entry.is_isolated)
        expanded_size = 0
        digest = hashlib.sha1()
        data = None

        try:
            # Read the file from GCS in blocks.
            stream = gcs.read_file(gs_bucket, entry.key.id())
            if save_to_memcache:
                # Wrap the stream with a generator that accumulates the data.
                stream = Accumulator(stream)

            for data in model.expand_content(namespace, stream):
                expanded_size += len(data)
                digest.update(data)
                # Make sure the data is GC'ed.
                del data

            # Hashes should match.
            if digest.hexdigest() != hash_key:
                self.purge_entry(
                    entry, 'SHA-1 does not match data\n'
                    '%d bytes, %d bytes expanded, expected %d bytes\n%s',
                    entry.compressed_size, expanded_size, entry.expanded_size,
                    original_request)
                return

        except gcs.NotFoundError:
            # Somebody deleted the file between get_file_info and read_file.
            self.purge_entry(entry, 'File was unexpectedly deleted\n%s',
                             original_request)
            return
        except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
            # Misconfiguration in Google Storage ACLs. Don't delete the entry;
            # it may be fine. Maybe the ACL problems will be fixed before the
            # next retry.
            logging.warning('CloudStorage auth issues (%s): %s',
                            e.__class__.__name__, e)
            # Abort so the job is retried automatically.
            return self.abort(500)
        except (gcs.FatalError, zlib.error, IOError) as e:
            # ForbiddenError and AuthorizationError inherit from FatalError,
            # so this except block must come after them.
            # The file is broken or unreadable.
            self.purge_entry(entry, 'Failed to read the file (%s): %s\n%s',
                             e.__class__.__name__, e, original_request)
            return

        # Verified. Data matches the hash.
        entry.expanded_size = expanded_size
        entry.is_verified = True
        future = entry.put_async()
        logging.info('%d bytes (%d bytes expanded) verified\n%s',
                     entry.compressed_size, expanded_size, original_request)
        if save_to_memcache:
            model.save_in_memcache(namespace, hash_key,
                                   ''.join(stream.accumulated))
        future.wait()
Example #7
    def get(self):
        namespace = self.request.get('namespace', 'default-gzip')
        digest = self.request.get('digest', '')
        content = None

        if digest and namespace:
            try:
                raw_data, entity = model.get_content(namespace, digest)
            except ValueError:
                self.abort(400, 'Invalid key')
            except LookupError:
                self.abort(404, 'Unable to retrieve the entry')

            if not raw_data:
                stream = gcs.read_file(config.settings().gs_bucket,
                                       entity.key.id())
            else:
                stream = [raw_data]
            content = ''.join(model.expand_content(namespace, stream))

            self.response.headers['X-Frame-Options'] = 'SAMEORIGIN'
            # Delete Content-Type before setting it to avoid ending up with
            # two (yes, two) Content-Type headers.
            del self.response.headers['Content-Type']

            # Apparently, setting the content type to text/plain encourages the
            # browser (Chrome, at least) to sniff the mime type and display
            # things like images.  Images are autowrapped in <img> and text is
            # wrapped in <pre>.
            self.response.headers['Content-Type'] = 'text/plain; charset=utf-8'

            # App Engine limits a response to 33554432 bytes, including
            # headers. Headers are ~150 bytes. If the content plus headers
            # might exceed that limit, we give the user a command-line
            # workaround to download their file.
            if len(content) > 33554000:
                host = modules.get_hostname(module='default',
                                            version='default')
                # host is something like default.default.myisolateserver.appspot.com
                host = host.replace('default.default.', '')
                sizeInMib = len(content) / (1024.0 * 1024.0)
                content = (
                    'Sorry, your file is %1.1f MiB big, which exceeds the 32 MiB'
                    ' App Engine limit.\nTo work around this, run the following command:\n'
                    '    python isolateserver.py download -I %s --namespace %s -f %s %s'
                    % (sizeInMib, host, namespace, digest, digest))
            else:
                self.response.headers['Content-Disposition'] = str(
                    'filename=%s' % digest)
                if content.startswith('{'):
                    # Try to format as JSON.
                    try:
                        content = json.dumps(json.loads(content),
                                             sort_keys=True,
                                             indent=2,
                                             separators=(',', ': '))
                        # If we don't wrap this in html, browsers will put content in a pre
                        # tag which is also styled with monospace/pre-wrap.  We can't use
                        # anchor tags in <pre>, so we force it to be a <div>, which happily
                        # accepts links.
                        content = (
                            '<div style="font-family:monospace;white-space:pre-wrap;">%s'
                            '</div>' % content)
                        # Linkify things that look like hashes
                        content = re.sub(
                            r'([0-9a-f]{40})',
                            r'<a target="_blank" href="/browse?namespace=%s' %
                            namespace + r'&digest=\1">\1</a>', content)
                        self.response.headers[
                            'Content-Type'] = 'text/html; charset=utf-8'
                    except ValueError:
                        pass

        self.response.write(content)