Ejemplo n.º 1
0
 async def metadata(self):
     """Describe the file that ``self.url`` points at.

     The name and extension are split from the last segment of the URL path,
     the content type is guessed from the URL with ``mimetypes``, and the
     unique key is the SHA-256 hex digest of the UTF-8 encoded URL.

     Returns a ``provider.ProviderMetadata`` built from those values plus the
     URL itself.
     """
     url = self.url
     # Only the final path segment carries the file name.
     _, filename = os.path.split(urlparse(url).path)
     stem, suffix = os.path.splitext(filename)
     guessed_type = mimetypes.guess_type(url)[0]
     url_digest = hashlib.sha256(url.encode('utf-8')).hexdigest()
     return provider.ProviderMetadata(stem, suffix, guessed_type, url_digest, url)
Ejemplo n.º 2
0
 def metadata(self):
     """Fetch file metadata from the ``/data?`` counterpart of the download url.

     This is a generator-based coroutine (it uses ``yield from``).  It
     resolves the download url, requests the url obtained by replacing the
     first ``/file?`` with ``/data?``, and reads the JSON body of the response.

     Returns a ``provider.ProviderMetadata`` built from the file name,
     extension, content type, unique key and download url.
     """
     file_url = yield from self._fetch_download_url()
     response = yield from self._make_request(
         'GET', file_url.replace('/file?', '/data?', 1))
     payload = yield from response.json()
     # Example payload:
     # {'data': {
     #     'name': 'blah.png',
     #     'contentType': 'image/png',
     #     'etag': 'ABCD123456...',
     #     'extra': {
     #         ...
     #     },
     # }}
     info = payload['data']
     filename = info['name']
     name, ext = os.path.splitext(filename)

     # Fall back to guessing from the file name when no content type is given.
     content_type = info['contentType']
     if not content_type:
         content_type = mimetypes.guess_type(filename)[0]

     # Drop the parameters listed in UNNEEDED_URL_PARAMS before the url is
     # folded into the unique key.
     stripped_url = furl.furl(file_url)
     for param in OsfProvider.UNNEEDED_URL_PARAMS:
         stripped_url.args.pop(param, None)

     key_material = (info['etag'] + stripped_url.url).encode('utf-8')
     unique_key = hashlib.sha256(key_material).hexdigest()
     return provider.ProviderMetadata(name, ext, content_type, unique_key,
                                      file_url)
Ejemplo n.º 3
0
 def metadata(self):
     """Fetch file metadata for either flavour of download url.

     This is a generator-based coroutine (it uses ``yield from``).  When the
     download url contains ``/file?`` the matching ``/data?`` url is requested
     with ``GET`` and its JSON body is used.  Otherwise a ``HEAD`` request is
     made against the download url and the metadata is decoded from the
     ``x-waterbutler-metadata`` response header.

     Returns a ``provider.ProviderMetadata`` built from the file name,
     extension, content type, unique key and download url.
     """
     file_url = yield from self._fetch_download_url()
     if '/file?' not in file_url:
         head_response = yield from self._make_request('HEAD', file_url)
         header_doc = json.loads(
             head_response.headers['x-waterbutler-metadata'])
         # Nest the attributes under 'data' so both branches yield the same
         # shape and the code below stays unchanged.
         payload = {'data': header_doc['attributes']}
     else:
         # TODO Remove this when API v0 is officially deprecated
         data_url = file_url.replace('/file?', '/data?', 1)
         data_response = yield from self._make_request('GET', data_url)
         payload = yield from data_response.json()
     # Example payload:
     # {'data': {
     #     'name': 'blah.png',
     #     'contentType': 'image/png',
     #     'etag': 'ABCD123456...',
     #     'extra': {
     #         ...
     #     },
     # }}
     info = payload['data']
     filename = info['name']
     name, ext = os.path.splitext(filename)

     # Fall back to guessing from the file name when no content type is given.
     content_type = info['contentType']
     if not content_type:
         content_type = mimetypes.guess_type(filename)[0]

     # Drop the parameters listed in UNNEEDED_URL_PARAMS before the url is
     # folded into the unique key.
     stripped_url = furl.furl(file_url)
     for param in OsfProvider.UNNEEDED_URL_PARAMS:
         stripped_url.args.pop(param, None)

     key_material = (info['etag'] + stripped_url.url).encode('utf-8')
     unique_key = hashlib.sha256(key_material).hexdigest()
     return provider.ProviderMetadata(name, ext, content_type, unique_key,
                                      file_url)
Ejemplo n.º 4
0
    async def metadata(self):
        """Fetch metadata about the file from WaterButler.

        V0 and V1 urls must be handled differently.  A download url containing
        ``/file?`` is treated as v0: its ``/data?`` counterpart is requested
        with ``GET`` and the JSON body is used.  Any other url is treated as
        v1: a ``HEAD`` request is made and the metadata is decoded from the
        ``x-waterbutler-metadata`` response header.

        Returns a ``provider.ProviderMetadata`` built from the file name,
        extension, content type, unique key, download url and stable id.

        Raises ``exceptions.MetadataError`` when the v1 ``HEAD`` request does
        not answer 200 or a ``ContentEncodingError`` occurs while reading the
        metadata header, and ``TooBigToRenderError`` when the reported size
        exceeds the limit that ``MAX_FILE_SIZE_TO_RENDER`` holds for the
        file's extension.
        """
        download_url = await self._fetch_download_url()
        logger.debug('download_url::%s', download_url)
        if '/file?' in download_url:
            # URL is for WaterButler v0 API
            # TODO Remove this when API v0 is officially deprecated
            self.metrics.add('metadata.wb_api', 'v0')
            metadata_url = download_url.replace('/file?', '/data?', 1)
            metadata_response = await self._make_request('GET', metadata_url)
            metadata = await metadata_response.json()
        else:
            # URL is for WaterButler v1 API
            self.metrics.add('metadata.wb_api', 'v1')
            metadata_response = await self._make_request(
                'HEAD',
                download_url,
                headers={settings.MFR_ACTION_HEADER: self.action or ''}
            )
            # Copy what is needed off the response before releasing it.
            response_code = metadata_response.status
            response_reason = metadata_response.reason
            response_headers = metadata_response.headers
            await metadata_response.release()
            if response_code != 200:
                # The two literals are concatenated into ONE message; they
                # used to be separated by a comma, which passed the status
                # text as a second positional argument instead.
                raise exceptions.MetadataError(
                    'Failed to fetch file metadata from WaterButler. Received response: '
                    'code {} {}'.format(response_code, response_reason),
                    metadata_url=download_url,
                    response=response_reason,
                    provider=self.NAME,
                    code=400
                )

            try:
                metadata = {'data': json.loads(response_headers['x-waterbutler-metadata'])['attributes']}
            except ContentEncodingError as exc:
                # Original note: aiohttp tries to unzip empty body when
                # Content-Encoding is set.  Swallowing the error left
                # ``metadata`` unbound and the next statement failed with an
                # UnboundLocalError, so report the failure explicitly.
                raise exceptions.MetadataError(
                    'Failed to read file metadata from the WaterButler response headers.',
                    metadata_url=download_url,
                    response=response_reason,
                    provider=self.NAME,
                    code=400
                ) from exc

        self.metrics.add('metadata.raw', metadata)

        # e.g.,
        # metadata = {'data': {
        #     'name': 'blah.png',
        #     'contentType': 'image/png',
        #     'etag': 'ABCD123456...',
        #     'extra': {
        #         ...
        #     },
        # }}

        name, ext = os.path.splitext(metadata['data']['name'])
        size = metadata['data']['size']

        # Only extensions present in MAX_FILE_SIZE_TO_RENDER are limited, and
        # a missing/falsy size skips the check.
        max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext)
        if max_file_size and size and int(size) > max_file_size:
            raise TooBigToRenderError(
                "This file with extension '{ext}' exceeds the size limit of {max_size} and will not "
                "be rendered. To view this file download it and view it "
                "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)),
                requested_size=int(size), maximum_size=max_file_size,
            )

        # Fall back to guessing from the file name when no content type is given.
        content_type = metadata['data']['contentType'] or mimetypes.guess_type(metadata['data']['name'])[0]

        # Drop the parameters listed in UNNEEDED_URL_PARAMS before the url is
        # folded into the unique key.
        cleaned_url = furl.furl(download_url)
        for unneeded in OsfProvider.UNNEEDED_URL_PARAMS:
            cleaned_url.args.pop(unneeded, None)
        self.metrics.add('metadata.clean_url_args', str(cleaned_url))

        meta = metadata['data']
        # unique_key hashes the etag together with the cleaned url; stable_id
        # hashes only '/<resource>/<provider><path>'.
        unique_key = hashlib.sha256((meta['etag'] + cleaned_url.url).encode('utf-8')).hexdigest()
        stable_str = '/{}/{}{}'.format(meta['resource'], meta['provider'], meta['path'])
        stable_id = hashlib.sha256(stable_str.encode('utf-8')).hexdigest()
        logger.debug('stable_identifier: str(%s) hash(%s)', stable_str, stable_id)

        return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url, stable_id)