Esempio n. 1
0
    def _serialize(self, obj):
        """Turn a raw entity dict into its API representation.

        ``obj`` is modified in place and returned: ``collection_id`` is
        replaced by the resolved ``collection``, values of entity-typed
        properties are swapped for their resolved entities (the raw value
        is kept when nothing resolves), and ``links``, ``latinized``,
        ``writeable`` and ``shallow`` are filled in.
        """
        entity_id = obj.get("id")
        coll_id = obj.pop("collection_id", None)
        obj["collection"] = self.resolve(Collection, coll_id, CollectionSerializer)
        proxy = model.get_proxy(obj)
        props = obj.get("properties", {})
        for prop in proxy.iterprops():
            if prop.type == registry.entity:
                raw_values = ensure_list(props.get(prop.name))
                props[prop.name] = [
                    self.resolve(Entity, raw, EntitySerializer) or raw
                    for raw in raw_values
                ]

        links = {
            "self": url_for("entities_api.view", entity_id=entity_id),
            "references": url_for("entities_api.references", entity_id=entity_id),
            "tags": url_for("entities_api.tags", entity_id=entity_id),
            "ui": entity_url(entity_id),
        }
        if proxy.schema.is_a(Document.SCHEMA):
            # An archive link is only emitted for a hash that is actually set.
            content_hash = first(props.get("contentHash"))
            if content_hash:
                links["file"] = archive_url(
                    content_hash,
                    file_name=entity_filename(proxy),
                    mime_type=first(props.get("mimeType")),
                    expire=request.authz.expire,
                )

            pdf_hash = first(props.get("pdfHash"))
            if pdf_hash:
                links["pdf"] = archive_url(
                    pdf_hash,
                    file_name=entity_filename(proxy, extension="pdf"),
                    mime_type=PDF,
                    expire=request.authz.expire,
                )

            csv_hash = first(props.get("csvHash"))
            if csv_hash:
                links["csv"] = archive_url(
                    csv_hash,
                    file_name=entity_filename(proxy, extension="csv"),
                    mime_type=CSV,
                    expire=request.authz.expire,
                )

        obj["links"] = links
        obj["latinized"] = transliterate_values(proxy)
        obj["writeable"] = check_write_entity(obj, request.authz)
        # Keep an explicit "shallow" flag; default to True when absent.
        obj.setdefault("shallow", True)
        return obj
Esempio n. 2
0
    def _serialize(self, obj):
        """Turn a raw entity dict into its API representation.

        The ``id`` is stringified, ``collection_id`` is replaced by the
        resolved ``collection``, the schema names are exposed as
        ``schemata``, and values of entity-typed properties are replaced
        by whatever ``self.resolve`` returns for them. Links (including
        archive links for documents) and the ``writeable`` flag are added
        before the result is passed through ``self._clean_response``.
        """
        entity_id = obj.get('id')
        obj['id'] = str(entity_id)
        authz = request.authz
        collection_id = obj.pop('collection_id', None)
        obj['collection'] = self.resolve(
            Collection, collection_id, CollectionSerializer
        )
        proxy = model.get_proxy(obj)
        obj['schemata'] = proxy.schema.names
        props = obj.get('properties', {})
        for prop in proxy.iterprops():
            if prop.type == registry.entity:
                raw_values = ensure_list(props.get(prop.name))
                # Unresolvable references end up as whatever resolve() gives.
                props[prop.name] = [
                    self.resolve(Entity, raw, EntitySerializer)
                    for raw in raw_values
                ]

        links = {
            'self': url_for('entities_api.view', entity_id=entity_id),
            'references': url_for('entities_api.references',
                                  entity_id=entity_id),
            'tags': url_for('entities_api.tags', entity_id=entity_id),
            'ui': entity_url(entity_id)
        }
        if proxy.schema.is_a(Document.SCHEMA):
            links['content'] = url_for('entities_api.content',
                                       entity_id=entity_id)
            content_hash = first(props.get('contentHash'))
            if content_hash:
                file_name = entity_filename(proxy)
                mime_type = first(props.get('mimeType'))
                links['file'] = archive_url(
                    authz.id, content_hash,
                    file_name=file_name, mime_type=mime_type,
                )

            pdf_hash = first(props.get('pdfHash'))
            if pdf_hash:
                file_name = entity_filename(proxy, extension='pdf')
                links['pdf'] = archive_url(
                    authz.id, pdf_hash,
                    file_name=file_name, mime_type=PDF,
                )

            csv_hash = first(props.get('csvHash'))
            if csv_hash:
                file_name = entity_filename(proxy, extension='csv')
                links['csv'] = archive_url(
                    authz.id, csv_hash,
                    file_name=file_name, mime_type=CSV,
                )

        obj['links'] = links
        obj['writeable'] = authz.can(collection_id, authz.WRITE)
        obj.pop('_index', None)
        return self._clean_response(obj)
Esempio n. 3
0
 def _document_to_pdf(self, file_path, entity):
     """Converts an office document to PDF.

     The file at ``file_path`` is uploaded to the conversion service at
     ``CONVERT_URL`` and the response body is streamed into a work file.
     The upload is retried without an upper bound (with a backoff between
     attempts) for as long as the service is unreachable or answers with
     an HTTP error other than 400.

     Returns the path of the written PDF work file.

     Raises ``ProcessingException`` when the service rejects the document
     with HTTP 400, or when 50 bytes or fewer were received.
     """
     file_name = entity_filename(entity)
     mime_type = entity.first('mimeType')
     log.info('Converting [%s] to PDF...', file_name)
     for attempt in count(1):
         try:
             with open(file_path, 'rb') as fh:
                 files = {'file': (file_name, fh, mime_type)}
                 res = requests.post(CONVERT_URL,
                                     params={'timeout': CONVERT_TIMEOUT},
                                     files=files,
                                     timeout=CONVERT_TIMEOUT + 10,
                                     stream=True)
             res.raise_for_status()
             out_path = self.make_work_file('out.pdf')
             bytes_written = 0
             with open(out_path, 'wb') as fh:
                 for chunk in res.iter_content(chunk_size=None):
                     bytes_written += len(chunk)
                     fh.write(chunk)
             # A near-empty body is treated as a failed conversion.
             if bytes_written > 50:
                 return out_path
             raise ProcessingException("Could not be converted to PDF.")
         except HTTPError as exc:
             if exc.response.status_code == 400:
                 # Read the message from the response attached to the
                 # exception rather than relying on the loop-local ``res``.
                 raise ProcessingException(exc.response.text)
             msg = "Converter not available: %s (attempt: %s)"
             log.info(msg, exc, attempt)
             backoff(failures=math.sqrt(attempt))
         except RequestException as exc:
             msg = "Converter not available: %s (attempt: %s)"
             log.error(msg, exc, attempt)
             backoff(failures=math.sqrt(attempt))
Esempio n. 4
0
    def test_entity_filename(self):
        """Check the file name derived for a Document under various properties."""
        # Each case pairs the entity properties (None means the key is left
        # out entirely) with the expected name per set of keyword arguments.
        cases = [
            (None, [({}, "banana")]),
            ({"extension": [".doc"]}, [({}, "banana.doc")]),
            ({"mimeType": ["application/pdf"]}, [({}, "banana.pdf")]),
            (
                {"fileName": ["bla.doc"]},
                [({}, "bla.doc"), ({"extension": "pdf"}, "bla.pdf")],
            ),
        ]
        for properties, expectations in cases:
            data = {"id": "banana", "schema": "Document"}
            if properties is not None:
                data["properties"] = properties
            proxy = model.get_proxy(data)
            for kwargs, expected in expectations:
                file_name = entity_filename(proxy, **kwargs)
                assert expected == file_name, file_name
Esempio n. 5
0
 def ingest_entity(self, entity):
     """Ingest the first loadable file of ``entity``, else finalize it.

     Each ``contentHash`` value is tried in order; the first one whose
     file can be loaded and exists is handed to ``self.ingest`` and the
     method returns. If no hash yields a file, ``self.finalize`` is
     called with the entity instead.
     """
     for content_hash in entity.get("contentHash", quiet=True):
         name = entity_filename(entity)
         local_path = self.load(content_hash, file_name=name)
         if local_path is not None and local_path.exists():
             self.ingest(local_path, entity)
             return
     self.finalize(entity)
Esempio n. 6
0
def write_document(export_dir, zf, collection, entity):
    """Add the source file of ``entity`` to the archive ``zf``.

    Nothing happens when the entity has no ``contentHash``. Otherwise the
    file is loaded from the archive into ``export_dir`` and, if it exists
    locally, written to ``zf`` as ``<collection label>/<entity id>-<file
    name>``. The loaded file is cleaned up afterwards in every case.
    """
    content_hash = entity.first("contentHash", quiet=True)
    if content_hash is None:
        return
    base_name = entity_filename(entity)
    member = "{0}-{1}".format(entity.id, base_name)
    member = os.path.join(collection.get("label"), member)
    try:
        local_path = archive.load_file(content_hash, temp_path=export_dir)
        if local_path is None or not os.path.exists(local_path):
            # Nothing to add; the finally clause still runs the cleanup.
            return
        zf.write(local_path, arcname=member)
    finally:
        archive.cleanup_file(content_hash, temp_path=export_dir)
Esempio n. 7
0
    def test_entity_filename(self):
        """Check the file name derived for a Document under various properties."""
        def document(**properties):
            # Build a Document proxy; "properties" is omitted when empty.
            data = {'id': 'banana', 'schema': 'Document'}
            if properties:
                data['properties'] = properties
            return model.get_proxy(data)

        # Without any hints the entity ID is the file name.
        file_name = entity_filename(document())
        assert 'banana' == file_name, file_name

        # An explicit extension is appended to the ID.
        file_name = entity_filename(document(extension=['.doc']))
        assert 'banana.doc' == file_name, file_name

        # The extension can also be derived from the MIME type.
        file_name = entity_filename(document(mimeType=['application/pdf']))
        assert 'banana.pdf' == file_name, file_name

        # A stored file name wins, and its extension can be overridden.
        proxy = document(fileName=['bla.doc'])
        file_name = entity_filename(proxy)
        assert 'bla.doc' == file_name, file_name
        file_name = entity_filename(proxy, extension='pdf')
        assert 'bla.pdf' == file_name, file_name
Esempio n. 8
0
    def document_to_pdf(self, file_path, entity):
        """Return a PDF version of the document, reusing a cached one if any.

        The cache is keyed on the entity's ``contentHash``. On a hit whose
        file can still be loaded, that file is returned. Otherwise the
        document is converted, the result is stored, and its hash is
        cached. In both cases ``pdfHash`` is set on the entity. Returns
        whatever ``self._document_to_pdf`` returned when converting.
        """
        cache_key = self.cache_key('pdf', entity.first('contentHash'))
        cached_hash = self.get_cache_value(cache_key)
        if cached_hash is not None:
            pdf_name = entity_filename(entity, extension='pdf')
            cached_path = self.manager.load(cached_hash, file_name=pdf_name)
            if cached_path is not None:
                log.info("Using PDF cache: %s", pdf_name)
                entity.set('pdfHash', cached_hash)
                return cached_path

        converted = self._document_to_pdf(file_path, entity)
        if converted is None:
            return converted
        stored_hash = self.manager.store(converted)
        entity.set('pdfHash', stored_hash)
        self.set_cache_value(cache_key, stored_hash)
        return converted
Esempio n. 9
0
    def document_to_pdf(self, file_path, entity):
        """Return a PDF rendition of the document, preferring a tagged one.

        ``self.tags`` maps a key built from the entity's ``contentHash``
        to the hash of an earlier conversion. When that file can still be
        loaded it is reused; otherwise the document is converted, stored,
        and the new hash is recorded under the key. ``pdfHash`` is set on
        the entity whenever a PDF is available.
        """
        tag_key = self.cache_key("pdf", entity.first("contentHash"))
        known_hash = self.tags.get(tag_key)
        if known_hash is not None:
            pdf_name = entity_filename(entity, extension="pdf")
            known_path = self.manager.load(known_hash, file_name=pdf_name)
            if known_path is not None:
                log.info("Using PDF cache: %s", pdf_name)
                entity.set("pdfHash", known_hash)
                return known_path

        fresh_pdf = self._document_to_pdf(file_path, entity)
        if fresh_pdf is not None:
            # Record the new conversion so later runs can skip it.
            fresh_hash = self.manager.store(fresh_pdf)
            entity.set("pdfHash", fresh_hash)
            self.tags.set(tag_key, fresh_hash)
        return fresh_pdf
Esempio n. 10
0
    def _document_to_pdf(self, file_path, entity):
        """Converts an office document to PDF.

        The file at ``file_path`` is uploaded to the conversion service at
        ``CONVERT_URL`` and the response body is streamed into a work file.
        The conversion timeout is guessed from the file size: 15 seconds
        per megabyte, clamped to the range 20..600 seconds. One upload is
        made per attempt yielded by ``service_retries()``, with a backoff
        after each failed request.

        Returns the path of the written PDF work file.

        Raises ``ProcessingException`` when the input is smaller than 100
        bytes, when the service rejects it with HTTP 400, when 50 bytes or
        fewer were received, or when all attempts are used up.
        """
        file_size = file_path.stat().st_size
        if file_size < 100:
            # This must be raised: returning the exception instance would
            # hand it to the caller as if it were the path of a PDF.
            raise ProcessingException("Document too small.")
        # Attempt to guess an appropriate time for processing
        # Guessed: 15s per MB of data, max.
        megabytes = (file_size / 1024) / 1024
        timeout = int(min(600, max(20, megabytes * 15)))

        file_name = entity_filename(entity)
        mime_type = entity.first('mimeType')
        log.info('Converting [%s] to PDF (%ds timeout)...',
                 file_name, timeout)
        failed = ProcessingException("Document could not be converted to PDF.")
        for attempt in service_retries():
            try:
                with open(file_path, 'rb') as fh:
                    files = {'file': (file_name, fh, mime_type)}
                    res = requests.post(CONVERT_URL,
                                        params={'timeout': timeout},
                                        files=files,
                                        timeout=timeout + 3,
                                        stream=True)
                res.raise_for_status()
                out_path = self.make_work_file('out.pdf')
                with open(out_path, 'wb') as fh:
                    bytes_written = 0
                    for chunk in res.iter_content(chunk_size=None):
                        bytes_written += len(chunk)
                        fh.write(chunk)
                    # A near-empty body is treated as a failed conversion.
                    if bytes_written > 50:
                        return out_path
                raise failed
            except RequestException as exc:
                if isinstance(exc, HTTPError) and \
                        exc.response.status_code == 400:
                    # Read the message from the response attached to the
                    # exception rather than relying on the local ``res``.
                    raise ProcessingException(exc.response.text)
                log.error("Conversion failed: %s", exc)
                backoff(failures=math.sqrt(attempt))
        raise failed
Esempio n. 11
0
 def _document_to_pdf(self, file_path, entity):
     """Converts an office document to PDF.

     Uploads the file to the conversion service at ``CONVERT_URL`` and
     streams the answer into a work file, whose path is returned. The
     upload is repeated without an upper bound, with a backoff, while
     the service is unreachable or answers with an HTTP error other than
     400 or 500.

     Raises ``ProcessingException`` on HTTP 400 or 500, or when 50 bytes
     or fewer were received.
     """
     upload_name = entity_filename(entity)
     upload_mime = entity.first("mimeType")
     for attempt in count(1):
         log.debug("Converting [%s] to PDF (attempt %d)...", entity,
                   attempt)
         try:
             with open(file_path, "rb") as source:
                 res = requests.post(
                     CONVERT_URL,
                     params={"timeout": CONVERT_TIMEOUT},
                     files={"file": (upload_name, source, upload_mime)},
                     timeout=CONVERT_TIMEOUT + 10,
                     stream=True,
                 )
             res.raise_for_status()
             target = self.make_work_file("out.pdf")
             received = 0
             with open(target, "wb") as sink:
                 for chunk in res.iter_content(chunk_size=None):
                     received += len(chunk)
                     sink.write(chunk)
             if received <= 50:
                 raise ProcessingException("Could not be converted to PDF.")
             return target
         except HTTPError as exc:
             if exc.response.status_code in (400, 500):
                 # For error 500, this might also be a temporary error
                 # in the conversion service. But all attempts to tell
                 # these phenomena apart have failed so far.
                 raise ProcessingException(res.text)
             log.info("Converter not available: %s (attempt: %s)",
                      exc, attempt)
             backoff(failures=math.sqrt(attempt))
         except RequestException as exc:
             log.error("Converter not available: %s (attempt: %s)",
                       exc, attempt)
             backoff(failures=math.sqrt(attempt))
Esempio n. 12
0
    def _serialize(self, obj):
        """Turn a raw entity dict into its API representation.

        Properties are rebuilt from the proxy's values, with entity-typed
        values swapped for their resolved entities where available. Links
        (including archive links for documents), the resolved collection
        and role, and the ``latinized``, ``writeable`` and ``shallow``
        fields are added, and ``created_at`` / ``updated_at`` are each
        reduced to a single value.

        Returns ``None`` when the request is not allowed to read the
        entity's collection, otherwise the modified ``obj``.
        """
        entity_id = obj.get("id")
        proxy = model.get_proxy(obj)
        props = {}
        for prop, value in proxy.itervalues():
            bucket = props.setdefault(prop.name, [])
            if prop.type == registry.entity:
                value = self.resolve(Entity, value, EntitySerializer) or value
            if value is None:
                continue
            bucket.append(value)
        obj["properties"] = props

        links = {
            "self": url_for("entities_api.view", entity_id=entity_id),
            "expand": url_for("entities_api.expand", entity_id=entity_id),
            "tags": url_for("entities_api.tags", entity_id=entity_id),
            "ui": entity_url(entity_id),
        }
        if proxy.schema.is_a(Document.SCHEMA):
            # An archive link is only emitted for a hash that is actually set.
            content_hash = proxy.first("contentHash", quiet=True)
            if content_hash:
                links["file"] = archive_url(
                    content_hash,
                    file_name=entity_filename(proxy),
                    mime_type=proxy.first("mimeType", quiet=True),
                )

            pdf_hash = proxy.first("pdfHash", quiet=True)
            if pdf_hash:
                links["pdf"] = archive_url(
                    pdf_hash,
                    file_name=entity_filename(proxy, extension="pdf"),
                    mime_type=PDF,
                )

            csv_hash = proxy.first("csvHash", quiet=True)
            if csv_hash:
                links["csv"] = archive_url(
                    csv_hash,
                    file_name=entity_filename(proxy, extension="csv"),
                    mime_type=CSV,
                )

        nested_collection = obj.get("collection") or {}
        coll_id = obj.pop("collection_id", nested_collection.get("id"))
        # This is a last resort catcher for entities nested in other
        # entities that get resolved without regard for authz.
        authz = request.authz
        if not authz.can(coll_id, authz.READ):
            return None
        obj["collection"] = self.resolve(
            Collection, coll_id, CollectionSerializer
        )
        obj["role"] = self.resolve(
            Role, obj.pop("role_id", None), RoleSerializer
        )
        obj["links"] = links
        obj["latinized"] = transliterate_values(proxy)
        obj["writeable"] = check_write_entity(obj, request.authz)
        obj.setdefault("shallow", True)
        # Phasing out multi-values here (2021-01):
        created = ensure_list(obj.get("created_at"))
        obj["created_at"] = min(created, default=None)
        updated = ensure_list(obj.get("updated_at"))
        obj["updated_at"] = max(updated, default=None)
        return obj