Example 1
    def _serialize(self, obj):
        pk = obj.get('id')
        obj['id'] = str(pk)
        authz = request.authz
        collection_id = obj.pop('collection_id', None)
        obj['collection'] = self.resolve(Collection, collection_id,
                                         CollectionSerializer)
        schema = model.get(obj.get('schema'))
        if schema is None:
            return None
        obj['schemata'] = schema.names
        properties = obj.get('properties', {})
        for prop in schema.properties.values():
            if prop.type != registry.entity:
                continue
            values = ensure_list(properties.get(prop.name))
            properties[prop.name] = []
            for value in values:
                entity = self.resolve(Entity, value, EntitySerializer)
                properties[prop.name].append(entity)

        links = {
            'self': url_for('entities_api.view', entity_id=pk),
            'references': url_for('entities_api.references', entity_id=pk),
            'tags': url_for('entities_api.tags', entity_id=pk),
            'ui': entity_url(pk)
        }
        if schema.is_a(Document.SCHEMA):
            links['content'] = url_for('entities_api.content', entity_id=pk)
            file_name = first(properties.get('fileName'))
            content_hash = first(properties.get('contentHash'))
            if content_hash:
                mime_type = first(properties.get('mimeType'))
                name = safe_filename(file_name, default=pk)
                links['file'] = archive_url(request.authz.id,
                                            content_hash,
                                            file_name=name,
                                            mime_type=mime_type)

            pdf_hash = first(properties.get('pdfHash'))
            if pdf_hash:
                name = safe_filename(file_name, default=pk, extension='.pdf')
                links['pdf'] = archive_url(request.authz.id,
                                           pdf_hash,
                                           file_name=name,
                                           mime_type=PDF)
            csv_hash = first(properties.get('csvHash'))
            if csv_hash:
                name = safe_filename(file_name, default=pk, extension='.csv')
                links['csv'] = archive_url(request.authz.id,
                                           csv_hash,
                                           file_name=name,
                                           mime_type=CSV)

        obj['links'] = links
        obj['writeable'] = authz.can(collection_id, authz.WRITE)
        obj.pop('_index', None)
        return self._clean_response(obj)
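For a document entity, the links block assembled above serializes to a set of API and archive URLs. A sketch of the shape (routes and placeholders are illustrative, not taken from the source):

# Illustrative output shape only; '<id>' stands in for the entity id.
"links": {
    "self": "/api/2/entities/<id>",
    "references": "/api/2/entities/<id>/references",
    "tags": "/api/2/entities/<id>/tags",
    "ui": "<UI URL for the entity>",
    "content": "/api/2/entities/<id>/content",   # documents only
    "file": "<archive URL for contentHash>",     # if a content hash exists
    "pdf": "<archive URL for pdfHash>",          # if a PDF alternative exists
    "csv": "<archive URL for csvHash>"           # if a CSV alternative exists
}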
Example 2
 def make_filename(self, entity):
     """Some of the file importers actually care about the file
     extension, so this is trying to make sure we use a temporary
     file name that has an appropriate extension."""
     for file_name in entity.get('fileName', quiet=True):
         _, extension = os.path.splitext(file_name)
         if len(extension):
             return safe_filename(file_name)
     extension = first(entity.get('extension', quiet=True))
     if extension is None:
         mime_type = first(entity.get('mimeType', quiet=True))
         if mime_type is not None:
             extension = guess_extension(mime_type)
     extension = extension or 'bin'
     return safe_filename('data', extension=extension)
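A sketch of the fallback chain's expected results (the entities and the exact slug rules of safe_filename are assumptions for illustration):

# {'fileName': ['Quarterly Report.PDF']}          -> e.g. 'quarterly_report.pdf'
# {'fileName': ['report'], 'extension': ['csv']}  -> 'data.csv'
# {'mimeType': ['application/pdf']}               -> 'data.pdf'
# {}                                              -> 'data.bin'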
Example 3
def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
Example 4
def directory(context, data):
    """Store the collected files to a given directory."""
    with context.http.rehash(data) as result:
        if not result.ok:
            return

        content_hash = data.get("content_hash")
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return

        path = _get_directory_path(context)
        file_name = data.get("file_name", result.file_name)
        mime_type = normalize_mimetype(
            CaseInsensitiveDict(data.get("headers", {})).get("content-type")
        )
        extension = _get_file_extension(file_name, mime_type)
        file_name = file_name or "data"
        file_name = safe_filename(file_name, extension=extension)
        file_name = "%s.%s" % (content_hash, file_name)
        data["_file_name"] = file_name
        file_path = os.path.join(path, file_name)
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)

        context.log.info("Store [directory]: %s", file_name)
        meta_path = os.path.join(path, "%s.json" % content_hash)
        with open(meta_path, "w") as fh:
            json.dump(data, fh)
Example 5
def directory(context, data):
    """Store the collected files to a given directory."""
    with context.http.rehash(data) as result:
        if not result.ok:
            return

        content_hash = data.get('content_hash')
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return

        path = _get_directory_path(context)
        file_name = data.get('file_name', result.file_name)
        mime_type = normalize_mimetype(data.get('headers', {}).get('Content-Type'))
        extension = _get_file_extension(file_name, mime_type)
        file_name = file_name or 'data'
        file_name = safe_filename(file_name, extension=extension)
        file_name = '%s.%s' % (content_hash, file_name)
        data['_file_name'] = file_name
        file_path = os.path.join(path, file_name)
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)

        context.log.info("Store [directory]: %s", file_name)
        meta_path = os.path.join(path, '%s.json' % content_hash)
        with open(meta_path, 'w') as fh:
            json.dump(data, fh)
Example 6
def directory(context, data):
    """Store the collected files to a given directory."""
    with context.http.rehash(data) as result:
        if not result.ok:
            return

        content_hash = data.get('content_hash')
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return

        path = _get_directory_path(context)
        file_name = data.get('file_name', result.file_name)
        file_name = safe_filename(file_name, default='raw')
        file_name = '%s.%s' % (content_hash, file_name)
        data['_file_name'] = file_name
        file_path = os.path.join(path, file_name)
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)

        context.log.info("Store [directory]: %s", file_name)

        meta_path = os.path.join(path, '%s.json' % content_hash)
        with open(meta_path, 'w') as fh:
            json.dump(data, fh)
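All three variants of this store produce the same on-disk layout: two sibling files per document, both keyed by content hash (names illustrative):

# <directory>/
#     d41d8cd98f00b204e9800998ecf8427e.report.pdf   # payload: '<hash>.<safe name>'
#     d41d8cd98f00b204e9800998ecf8427e.json         # crawl metadata for that payload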
Example 7
def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document,
                        path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    # Make sure collection counts are always accurate.
    update_document(document, shallow=True, sync=sync)
    return jsonify({'status': 'ok', 'id': stringify(document.id)}, status=201)
Example 8
    def ingest_attachment(self, name, mime_type, body):
        has_body = body is not None and len(body)
        if safe_string(name) is None and not has_body:
            # Hello, Outlook.
            return

        file_name = safe_filename(name, default='attachment')
        name = safe_string(name) or file_name
        foreign_id = join_path(self.result.id, name)

        file_path = join_path(self.work_path, file_name)
        with open(file_path, 'wb') as fh:
            if isinstance(body, str):
                body = body.encode('utf-8')
            if body is not None:
                fh.write(body)

        if isinstance(mime_type, bytes):
            mime_type = mime_type.decode('utf-8')

        self.manager.handle_child(self.result,
                                  file_path,
                                  id=foreign_id,
                                  file_name=name,
                                  mime_type=mime_type)
Example 9
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({'status': 'ok', 'id': document_id}, status=201)
Example 10
def complete_export(export_id, file_path):
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(file_path,
                             content_hash=export.content_hash,
                             mime_type=export.mime_type)
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)

    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)
Example 11
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        db.session.commit()
        proxy = document.to_proxy()
        ingest_entity(collection, proxy)
    finally:
        shutil.rmtree(upload_dir)

    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
Example 12
 def _local_path(self, content_hash, file_name, temp_path):
     path = self._get_local_prefix(content_hash, temp_path=temp_path)
     try:
         os.makedirs(path)
     except Exception:
         pass
     file_name = safe_filename(file_name, default='data')
     return os.path.join(path, file_name)
Example 13
 def dump_table(self, file_path, table_name, temp_dir):
     out_file = safe_filename(table_name, extension='csv')
     out_file = join_path(temp_dir, out_file)
     mdb_export = self.find_command('mdb-export')
     args = [mdb_export, '-b', 'strip', file_path, table_name]
     with open(out_file, 'w') as fh:
         self.subprocess.call(args, stdout=fh)
     return out_file
Example 14
 def set_filepath(self, file_path):
     file_path = ensure_path(file_path)
     file_name = safe_filename(file_path)
     file_size = file_path.stat().st_size
     self.file_name = file_name
     self.file_size = file_size
     self._file_path = file_path
     self.content_hash = checksum(file_path)
Example 15
 def extract_file(self, zipf, name, temp_dir):
     base_name = safe_filename(os.path.basename(name))
     out_file = os.path.join(temp_dir, base_name)
     with open(out_file, 'w+b') as outfh:
         try:
             with zipf.open(name) as infh:
                 shutil.copyfileobj(infh, outfh)
         except KeyError:
             log.warning("Cannot load zip member: %s", name)
     return out_file
Example 16
File: export.py Project: sunu/aleph
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break

            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Example 17
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    refresh_index(index=entities_index())
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
Example 18
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection_id=collection.id,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.schema = Document.SCHEMA
            ingest_document(document,
                            path,
                            role_id=request.authz.id,
                            content_hash=content_hash)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection_id=collection.id,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.update(meta)
            document.schema = Document.SCHEMA_FOLDER
            ingest_document(document, None, role_id=request.authz.id)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    # Make sure collection counts are always accurate.
    if get_flag('sync'):
        for document in documents:
            update_document(document, shallow=True, sync=True)

    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
Example 19
 def extract_file(self, zipf, name):
     """Extract a message file from the OLM zip archive"""
     path = pathlib.Path(name)
     base_name = safe_filename(path.name)
     out_file = self.make_work_file(base_name)
     with open(out_file, "w+b") as outfh:
         try:
             with zipf.open(name) as infh:
                 shutil.copyfileobj(infh, outfh)
         except KeyError:
             log.warning("Cannot load zip member: %s", name)
     return out_file
Example 20
 def load_file(self, content_hash, file_name=None):
     key = self._locate_key(content_hash)
     if key is not None:
         path = self._get_local_prefix(content_hash)
         try:
             os.makedirs(path)
          except Exception:
             pass
         file_name = safe_filename(file_name, default='data')
         path = os.path.join(path, file_name)
         self.client.download_file(self.bucket, key, path)
         return path
Example 21
 def ingest(self, file_path):
     self.result.flag(self.result.FLAG_WORKBOOK)
     for table_name in self.get_tables(file_path):
         csv_name = safe_filename(table_name, extension='csv')
         csv_path = join_path(self.work_path, csv_name)
         self.dump_table(file_path, table_name, csv_path)
         child_id = join_path(self.result.id, table_name)
         self.manager.handle_child(self.result,
                                   csv_path,
                                   id=child_id,
                                   title=table_name,
                                   file_name=csv_name,
                                   mime_type='text/csv')
Example 22
 def ingest_attachment(self, name, mime_type, body, temp_dir):
     file_name = safe_filename(name, default='attachment')
     name = stringify(name) or file_name
     file_path = join_path(temp_dir, file_name)
      with open(file_path, 'wb') as fh:
         if isinstance(body, six.text_type):
             body = body.encode('utf-8')
         if body is not None:
             fh.write(body)
     self.manager.handle_child(self.result, file_path,
                               id=join_path(self.result.id, name),
                               file_name=name,
                               mime_type=mime_type)
Example 23
def ingest_upload(id):
    collection = obj_or_404(Collection.by_id(id))
    require(request.authz.can_write(collection.id))
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp()
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename)
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.mime_type = storage.mimetype
            if storage.filename:
                document.file_name = os.path.basename(storage.filename)
            document.update(meta)
            ingest_document(document, path,
                            role_id=request.authz.id)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.update(meta)
            ingest_document(document, upload_dir,
                            role_id=request.authz.id)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.delay(parent_id)

    return jsonify({
        'status': 'ok',
        'documents': [DocumentSchema().dump(d).data for d in documents]
    })
Example 24
def entity_filename(proxy, base_name=None, extension=None):
    """Derive a safe filename for the given entity."""
    if proxy.schema.is_a("Document"):
        for extension_ in proxy.get("extension", quiet=True):
            if extension is not None:
                break
            extension = extension_
        for file_name in proxy.get("fileName", quiet=True):
            base_name_, extension_ = splitext(file_name)
            if base_name is None and len(base_name_):
                base_name = base_name_
            if extension is None and len(extension_):
                extension = extension_
        for mime_type in proxy.get("mimeType", quiet=True):
            if extension is not None:
                break
            extension = guess_extension(mime_type)
    base_name = base_name or proxy.id
    return safe_filename(base_name, extension=extension)
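A minimal usage sketch, assuming followthemoney's model API and the slugging behaviour of safe_filename (both are assumptions, not shown above):

from followthemoney import model

proxy = model.make_entity('Document')
proxy.make_id('sample-doc')                   # hypothetical id seed
proxy.add('fileName', 'Annual Accounts.XLSX')
entity_filename(proxy)                        # -> e.g. 'annual_accounts.xlsx'
entity_filename(proxy, extension='csv')       # explicit extension takes precedence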
Example 25
    def archive_file(self, file_path, content_hash=None, mime_type=None):
        """Import the given file into the archive."""
        if content_hash is None:
            content_hash = checksum(file_path)

        if content_hash is None:
            return

        if self._locate_key(content_hash):
            return content_hash

        archive_prefix = self._get_prefix(content_hash)
        archive_path = self.path.joinpath(archive_prefix)
        archive_path.mkdir(parents=True, exist_ok=True)
        file_name = safe_filename(file_path, default='data')
        archive_path = archive_path.joinpath(file_name)
        with open(file_path, 'rb') as fin:
            with open(archive_path, 'wb') as fout:
                shutil.copyfileobj(fin, fout, BUF_SIZE)
        return content_hash
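The archive is content-addressed: a sketch of a call and the resulting location (hash and prefix scheme are illustrative):

# Hypothetical call against a file-system archive instance:
content_hash = archive.archive_file(Path('report.pdf'))
# -> e.g. 'd41d8cd9...'; the payload is copied to
#    <archive root>/<prefix derived from content_hash>/report.pdf
# Re-archiving the same bytes is a no-op: _locate_key finds the existing
# copy and the hash is returned without copying again.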
Example 26
    def csv_child_iter(self, iter, name):
        with self.create_temp_dir() as temp_dir:
            out_name = safe_filename(name, extension='csv')
            out_path = join_path(temp_dir, out_name)
            row_count = 0
            with io.open(out_path, 'w', newline='', encoding='utf-8') as fh:
                writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
                for row in iter:
                    writer.writerow(row)
                    row_count += 1

            log.info("Generated [%s]: %s, %s rows", name, out_name, row_count)

            child_id = join_path(self.result.id, name)
            self.manager.handle_child(self.result,
                                      out_path,
                                      id=child_id,
                                      title=name,
                                      file_name=out_name,
                                      mime_type='text/csv')
Example 27
 def extract_attachment(self, zipf, message, attachment, temp_dir):
     url = attachment.get('OPFAttachmentURL')
     name = attachment.get('OPFAttachmentName')
     name = name or attachment.get('OPFAttachmentContentID')
     mime_type = attachment.get('OPFAttachmentContentType')
     if url is None and name is None:
         return
     if url is not None:
         foreign_id = os.path.join(self.result.id, url)
         file_path = self.extract_file(zipf, url, temp_dir)
     else:
         foreign_id = os.path.join(message.id, name)
         file_path = os.path.join(temp_dir, safe_filename(name))
          # The attachment has a name but no stored payload; create an
          # empty placeholder file for it.
          open(file_path, 'w').close()
     self.manager.handle_child(message,
                               file_path,
                               id=foreign_id,
                               file_name=name,
                               mime_type=mime_type)
Example 28
    def archive_file(self, file_path, content_hash=None):
        """Import the given file into the archive."""
        if content_hash is None:
            content_hash = checksum(file_path)

        if self._locate_key(content_hash):
            return content_hash

        archive_prefix = self._get_prefix(content_hash)
        archive_path = os.path.join(self.path, archive_prefix)
        try:
            os.makedirs(archive_path)
        except Exception:
            return content_hash

        file_name = safe_filename(file_path, default='data')
        archive_path = os.path.join(archive_path, file_name)
        with open(file_path, 'rb') as fin:
            with open(archive_path, 'wb') as fout:
                shutil.copyfileobj(fin, fout, BUF_SIZE)
        return content_hash
Example 29
    def ingest_attachment(self, entity, name, mime_type, body):
        has_body = body is not None and len(body)
        if stringify(name) is None and not has_body:
            # Hello, Outlook.
            return

        file_name = safe_filename(name, default='attachment')
        file_path = self.make_work_file(file_name)
        with open(file_path, 'wb') as fh:
            if isinstance(body, str):
                body = body.encode('utf-8')
            if body is not None:
                fh.write(body)

        checksum = self.manager.store(file_path, mime_type=mime_type)
        file_path.unlink()

        child = self.manager.make_entity('Document', parent=entity)
        child.make_id(name, checksum)
        child.add('contentHash', checksum)
        child.add('fileName', name)
        child.add('mimeType', mime_type)
        self.manager.queue_entity(child)
Example 30
    def csv_child_iter(self, iter, name):
        out_name = safe_filename(name, default='sheet.csv', extension='csv')
        out_path = join_path(self.work_path, out_name)
        row_count = 0
        with io.open(out_path, 'w', newline='', encoding='utf-8') as fh:
            writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
            for row in iter:
                writer.writerow(row)
                row_count += 1

        name = stringify(name) or 'sheet'
        if row_count == 0:
            log.warning("Skip [%s]: no rows", name)
            return

        log.info("Generated [%s]: %s, %s rows", name, out_name, row_count)

        child_id = join_path(self.result.id, name)
        self.manager.handle_child(self.result,
                                  out_path,
                                  id=child_id,
                                  title=name,
                                  file_name=out_name,
                                  mime_type='text/csv')
Example 31
 def safe_file_name(self):
     """File name is a slugified version of the file title that is safe to
     use as part of a file system path."""
     return safe_filename(self.file_name, default='data')
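Taken together, the call sites above suggest a consistent contract for safe_filename; a hedged summary (slug rules inferred from usage, not from documentation):

safe_filename('Quarterly Report.PDF')         # -> e.g. 'quarterly_report.pdf'
safe_filename(None, default='upload')         # -> 'upload' when no usable name
safe_filename('data', extension='csv')        # -> 'data.csv' (leading dot optional)
safe_filename('notes.txt', extension='pdf')   # -> 'notes.pdf'; extension replaces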