def _serialize(self, obj):
    pk = obj.get('id')
    obj['id'] = str(pk)
    authz = request.authz
    collection_id = obj.pop('collection_id', None)
    obj['collection'] = self.resolve(Collection, collection_id,
                                     CollectionSerializer)
    schema = model.get(obj.get('schema'))
    if schema is None:
        return None
    obj['schemata'] = schema.names
    properties = obj.get('properties', {})
    for prop in schema.properties.values():
        if prop.type != registry.entity:
            continue
        values = ensure_list(properties.get(prop.name))
        properties[prop.name] = []
        for value in values:
            entity = self.resolve(Entity, value, EntitySerializer)
            properties[prop.name].append(entity)
    links = {
        'self': url_for('entities_api.view', entity_id=pk),
        'references': url_for('entities_api.references', entity_id=pk),
        'tags': url_for('entities_api.tags', entity_id=pk),
        'ui': entity_url(pk)
    }
    if schema.is_a(Document.SCHEMA):
        links['content'] = url_for('entities_api.content', entity_id=pk)
        file_name = first(properties.get('fileName'))
        content_hash = first(properties.get('contentHash'))
        if content_hash:
            mime_type = first(properties.get('mimeType'))
            name = safe_filename(file_name, default=pk)
            links['file'] = archive_url(request.authz.id, content_hash,
                                        file_name=name,
                                        mime_type=mime_type)
        pdf_hash = first(properties.get('pdfHash'))
        if pdf_hash:
            name = safe_filename(file_name, default=pk, extension='.pdf')
            links['pdf'] = archive_url(request.authz.id, pdf_hash,
                                       file_name=name, mime_type=PDF)
        csv_hash = first(properties.get('csvHash'))
        if csv_hash:
            name = safe_filename(file_name, default=pk, extension='.csv')
            links['csv'] = archive_url(request.authz.id, csv_hash,
                                       file_name=name, mime_type=CSV)
    obj['links'] = links
    obj['writeable'] = authz.can(collection_id, authz.WRITE)
    obj.pop('_index', None)
    return self._clean_response(obj)
def make_filename(self, entity):
    """Some of the file importers actually care about the file extension,
    so this is trying to make sure we use a temporary file name that has
    an appropriate extension."""
    for file_name in entity.get('fileName', quiet=True):
        _, extension = os.path.splitext(file_name)
        if len(extension):
            return safe_filename(file_name)
    extension = first(entity.get('extension', quiet=True))
    if extension is None:
        mime_type = first(entity.get('mimeType', quiet=True))
        if mime_type is not None:
            extension = guess_extension(mime_type)
    extension = extension or 'bin'
    return safe_filename('data', extension=extension)
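# A minimal, self-contained sketch of the same fallback chain as make_filename()
# above, assuming a plain list of file names in place of the FtM entity proxy and
# mimetypes.guess_extension in place of the project's own helper. The function
# name and its behaviour are illustrative only.
import os
from mimetypes import guess_extension as std_guess_extension


def pick_temp_name(file_names, explicit_extension=None, mime_type=None):
    # Prefer an uploaded file name that already carries an extension.
    for file_name in file_names:
        _, extension = os.path.splitext(file_name)
        if extension:
            return file_name
    # Otherwise fall back to an explicit extension, then to the MIME type.
    extension = explicit_extension
    if extension is None and mime_type is not None:
        guessed = std_guess_extension(mime_type)
        extension = guessed.lstrip('.') if guessed else None
    return 'data.%s' % (extension or 'bin')


assert pick_temp_name(['report.pdf']) == 'report.pdf'
assert pick_temp_name([], explicit_extension='csv') == 'data.csv'
assert pick_temp_name([]) == 'data.bin'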
def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def directory(context, data):
    """Store the collected files to a given directory."""
    with context.http.rehash(data) as result:
        if not result.ok:
            return
        content_hash = data.get("content_hash")
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return
        path = _get_directory_path(context)
        file_name = data.get("file_name", result.file_name)
        mime_type = normalize_mimetype(
            CaseInsensitiveDict(data.get("headers", {})).get("content-type")
        )
        extension = _get_file_extension(file_name, mime_type)
        file_name = file_name or "data"
        file_name = safe_filename(file_name, extension=extension)
        file_name = "%s.%s" % (content_hash, file_name)
        data["_file_name"] = file_name
        file_path = os.path.join(path, file_name)
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)
        context.log.info("Store [directory]: %s", file_name)
        meta_path = os.path.join(path, "%s.json" % content_hash)
        with open(meta_path, "w") as fh:
            json.dump(data, fh)
def directory(context, data):
    """Store the collected files to a given directory."""
    with context.http.rehash(data) as result:
        if not result.ok:
            return
        content_hash = data.get('content_hash')
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return
        path = _get_directory_path(context)
        file_name = data.get('file_name', result.file_name)
        mime_type = normalize_mimetype(data.get('headers', {}).get('Content-Type'))
        extension = _get_file_extension(file_name, mime_type)
        file_name = file_name or 'data'
        file_name = safe_filename(file_name, extension=extension)
        file_name = '%s.%s' % (content_hash, file_name)
        data['_file_name'] = file_name
        file_path = os.path.join(path, file_name)
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)
        context.log.info("Store [directory]: %s", file_name)
        meta_path = os.path.join(path, '%s.json' % content_hash)
        with open(meta_path, 'w') as fh:
            json.dump(data, fh)
def directory(context, data):
    """Store the collected files to a given directory."""
    with context.http.rehash(data) as result:
        if not result.ok:
            return
        content_hash = data.get('content_hash')
        if content_hash is None:
            context.emit_warning("No content hash in data.")
            return
        path = _get_directory_path(context)
        file_name = data.get('file_name', result.file_name)
        file_name = safe_filename(file_name, default='raw')
        file_name = '%s.%s' % (content_hash, file_name)
        data['_file_name'] = file_name
        file_path = os.path.join(path, file_name)
        if not os.path.exists(file_path):
            shutil.copyfile(result.file_path, file_path)
        context.log.info("Store [directory]: %s", file_name)
        meta_path = os.path.join(path, '%s.json' % content_hash)
        with open(meta_path, 'w') as fh:
            json.dump(data, fh)
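# Illustrative sketch (not the memorious API): the directory store variants above
# all write two entries per crawled file, keyed by content hash — "<hash>.<safe
# name>" for the payload and "<hash>.json" for the crawl metadata sidecar. The
# helper below is a hypothetical stand-alone version of that layout.
import json
import os
import shutil


def sketch_store(path, content_hash, safe_name, source_path, meta):
    payload = os.path.join(path, "%s.%s" % (content_hash, safe_name))
    if not os.path.exists(payload):
        shutil.copyfile(source_path, payload)
    meta_path = os.path.join(path, "%s.json" % content_hash)
    with open(meta_path, "w") as fh:
        json.dump(meta, fh)
    return payload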
def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    # Make sure collection counts are always accurate.
    update_document(document, shallow=True, sync=sync)
    return jsonify({'status': 'ok', 'id': stringify(document.id)}, status=201)
def ingest_attachment(self, name, mime_type, body):
    has_body = body is not None and len(body)
    if safe_string(name) is None and not has_body:
        # Hello, Outlook.
        return
    file_name = safe_filename(name, default='attachment')
    name = safe_string(name) or file_name
    foreign_id = join_path(self.result.id, name)
    file_path = join_path(self.work_path, file_name)
    with open(file_path, 'wb') as fh:
        if isinstance(body, str):
            body = body.encode('utf-8')
        if body is not None:
            fh.write(body)
    if isinstance(mime_type, bytes):
        mime_type = mime_type.decode('utf-8')
    self.manager.handle_child(self.result, file_path,
                              id=foreign_id,
                              file_name=name,
                              mime_type=mime_type)
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({'status': 'ok', 'id': document_id}, status=201)
def complete_export(export_id, file_path):
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(file_path,
                             content_hash=export.content_hash,
                             mime_type=export.mime_type)
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)
    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        db.session.commit()
        proxy = document.to_proxy()
        ingest_entity(collection, proxy)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def _local_path(self, content_hash, file_name, temp_path):
    path = self._get_local_prefix(content_hash, temp_path=temp_path)
    try:
        os.makedirs(path)
    except Exception:
        pass
    file_name = safe_filename(file_name, default='data')
    return os.path.join(path, file_name)
def dump_table(self, file_path, table_name, temp_dir):
    out_file = safe_filename(table_name, extension='csv')
    out_file = join_path(temp_dir, out_file)
    mdb_export = self.find_command('mdb-export')
    args = [mdb_export, '-b', 'strip', file_path, table_name]
    with open(out_file, 'w') as fh:
        self.subprocess.call(args, stdout=fh)
    return out_file
def set_filepath(self, file_path):
    file_path = ensure_path(file_path)
    file_name = safe_filename(file_path)
    file_size = file_path.stat().st_size
    self.file_name = file_name
    self.file_size = file_size
    self._file_path = file_path
    self.content_hash = checksum(file_path)
def extract_file(self, zipf, name, temp_dir):
    base_name = safe_filename(os.path.basename(name))
    out_file = os.path.join(temp_dir, base_name)
    with open(out_file, 'w+b') as outfh:
        try:
            with zipf.open(name) as infh:
                shutil.copyfileobj(infh, outfh)
        except KeyError:
            log.warning("Cannot load zip member: %s", name)
    return out_file
def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        if not len(request.files):
            # If there are no files uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it into
            # the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    refresh_index(index=entities_index())
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection_id=collection.id,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.schema = Document.SCHEMA
            ingest_document(document, path,
                            role_id=request.authz.id,
                            content_hash=content_hash)
            documents.append(document)

        if not len(request.files):
            # If there are no files uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it into
            # the API.
            document = Document.by_keys(collection_id=collection.id,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.update(meta)
            document.schema = Document.SCHEMA_FOLDER
            ingest_document(document, None, role_id=request.authz.id)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    # Make sure collection counts are always accurate.
    if get_flag('sync'):
        for document in documents:
            update_document(document, shallow=True, sync=True)

    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
def extract_file(self, zipf, name):
    """Extract a message file from the OLM zip archive."""
    path = pathlib.Path(name)
    base_name = safe_filename(path.name)
    out_file = self.make_work_file(base_name)
    with open(out_file, "w+b") as outfh:
        try:
            with zipf.open(name) as infh:
                shutil.copyfileobj(infh, outfh)
        except KeyError:
            log.warning("Cannot load zip member: %s", name)
    return out_file
def load_file(self, content_hash, file_name=None):
    key = self._locate_key(content_hash)
    if key is not None:
        path = self._get_local_prefix(content_hash)
        try:
            os.makedirs(path)
        except Exception:
            pass
        file_name = safe_filename(file_name, default='data')
        path = os.path.join(path, file_name)
        self.client.download_file(self.bucket, key, path)
        return path
def ingest(self, file_path):
    self.result.flag(self.result.FLAG_WORKBOOK)
    for table_name in self.get_tables(file_path):
        csv_name = safe_filename(table_name, extension='csv')
        csv_path = join_path(self.work_path, csv_name)
        self.dump_table(file_path, table_name, csv_path)
        child_id = join_path(self.result.id, table_name)
        self.manager.handle_child(self.result, csv_path,
                                  id=child_id,
                                  title=table_name,
                                  file_name=csv_name,
                                  mime_type='text/csv')
def ingest_attachment(self, name, mime_type, body, temp_dir):
    file_name = safe_filename(name, default='attachment')
    name = stringify(name) or file_name
    file_path = join_path(temp_dir, file_name)
    # Open in binary mode: the body is written as UTF-8 bytes below.
    with open(file_path, 'wb') as fh:
        if isinstance(body, six.text_type):
            body = body.encode('utf-8')
        if body is not None:
            fh.write(body)
    self.manager.handle_child(self.result, file_path,
                              id=join_path(self.result.id, name),
                              file_name=name,
                              mime_type=mime_type)
def ingest_upload(id):
    collection = obj_or_404(Collection.by_id(id))
    require(request.authz.can_write(collection.id))
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp()
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename)
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.mime_type = storage.mimetype
            if storage.filename:
                document.file_name = os.path.basename(storage.filename)
            document.update(meta)
            ingest_document(document, path, role_id=request.authz.id)
            documents.append(document)

        if not len(request.files):
            # If there are no files uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it into
            # the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.update(meta)
            ingest_document(document, upload_dir, role_id=request.authz.id)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.delay(parent_id)

    return jsonify({
        'status': 'ok',
        'documents': [DocumentSchema().dump(d).data for d in documents]
    })
def entity_filename(proxy, base_name=None, extension=None):
    """Derive a safe filename for the given entity."""
    if proxy.schema.is_a("Document"):
        for extension_ in proxy.get("extension", quiet=True):
            if extension is not None:
                break
            extension = extension_
        for file_name in proxy.get("fileName", quiet=True):
            base_name_, extension_ = splitext(file_name)
            if base_name is None and len(base_name_):
                base_name = base_name_
            if extension is None and len(extension_):
                extension = extension_
        for mime_type in proxy.get("mimeType", quiet=True):
            if extension is not None:
                break
            extension = guess_extension(mime_type)
    base_name = base_name or proxy.id
    return safe_filename(base_name, extension=extension)
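# Hedged usage sketch for entity_filename() above: assumes followthemoney is
# installed and that the helpers the function relies on (splitext,
# guess_extension, safe_filename) are imported in the same module, as in the
# snippet. The entity values are made up for illustration.
from followthemoney import model

proxy = model.make_entity("Document")
proxy.add("fileName", "Quarterly Report 2020.PDF")
proxy.make_id("example-document")

# Expected to return a slug-safe name that keeps the .PDF extension, e.g.
# something like "quarterly_report_2020.pdf" (the exact slugging is up to
# safe_filename). Entities without a fileName fall back to the entity id.
print(entity_filename(proxy))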
def archive_file(self, file_path, content_hash=None, mime_type=None):
    """Import the given file into the archive."""
    if content_hash is None:
        content_hash = checksum(file_path)
    if content_hash is None:
        return
    if self._locate_key(content_hash):
        return content_hash
    archive_prefix = self._get_prefix(content_hash)
    archive_path = self.path.joinpath(archive_prefix)
    archive_path.mkdir(parents=True, exist_ok=True)
    file_name = safe_filename(file_path, default='data')
    archive_path = archive_path.joinpath(file_name)
    with open(file_path, 'rb') as fin:
        with open(archive_path, 'wb') as fout:
            shutil.copyfileobj(fin, fout, BUF_SIZE)
    return content_hash
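# A minimal, self-contained sketch of the content-addressed layout that
# archive_file() above relies on. The prefix scheme (sharding by the leading
# characters of the hash) is an assumption here; the real _get_prefix() may
# differ, and the hash function used by checksum() is not specified in this
# snippet.
import hashlib
import pathlib
import shutil


def sketch_archive_file(archive_root, file_path):
    # Hash the file contents in chunks to avoid loading it into memory.
    digest = hashlib.sha1()
    with open(file_path, "rb") as fh:
        for chunk in iter(lambda: fh.read(8192), b""):
            digest.update(chunk)
    content_hash = digest.hexdigest()
    # Shard into nested directories so no single directory grows too large.
    prefix = pathlib.Path(content_hash[:2]) / content_hash[2:4] / content_hash
    target_dir = pathlib.Path(archive_root) / prefix
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / pathlib.Path(file_path).name
    if not target.exists():
        shutil.copyfile(file_path, target)
    return content_hash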
def csv_child_iter(self, iter, name):
    with self.create_temp_dir() as temp_dir:
        out_name = safe_filename(name, extension='csv')
        out_path = join_path(temp_dir, out_name)
        row_count = 0
        with io.open(out_path, 'w', newline='', encoding='utf-8') as fh:
            writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
            for row in iter:
                writer.writerow(row)
                row_count += 1
        log.info("Generated [%s]: %s, %s rows", name, out_name, row_count)
        child_id = join_path(self.result.id, name)
        self.manager.handle_child(self.result, out_path,
                                  id=child_id,
                                  title=name,
                                  file_name=out_name,
                                  mime_type='text/csv')
def extract_attachment(self, zipf, message, attachment, temp_dir):
    url = attachment.get('OPFAttachmentURL')
    name = attachment.get('OPFAttachmentName')
    name = name or attachment.get('OPFAttachmentContentID')
    mime_type = attachment.get('OPFAttachmentContentType')
    if url is None and name is None:
        return
    if url is not None:
        foreign_id = os.path.join(self.result.id, url)
        file_path = self.extract_file(zipf, url, temp_dir)
    else:
        foreign_id = os.path.join(message.id, name)
        file_path = os.path.join(temp_dir, safe_filename(name))
        # No payload to extract from the archive; create an empty
        # placeholder file for the attachment.
        fh = open(file_path, 'w')
        fh.close()
    self.manager.handle_child(message, file_path,
                              id=foreign_id,
                              file_name=name,
                              mime_type=mime_type)
def archive_file(self, file_path, content_hash=None):
    """Import the given file into the archive."""
    if content_hash is None:
        content_hash = checksum(file_path)
    if self._locate_key(content_hash):
        return content_hash
    archive_prefix = self._get_prefix(content_hash)
    archive_path = os.path.join(self.path, archive_prefix)
    try:
        os.makedirs(archive_path)
    except Exception:
        return content_hash
    file_name = safe_filename(file_path, default='data')
    archive_path = os.path.join(archive_path, file_name)
    with open(file_path, 'rb') as fin:
        with open(archive_path, 'wb') as fout:
            shutil.copyfileobj(fin, fout, BUF_SIZE)
    return content_hash
def ingest_attachment(self, entity, name, mime_type, body):
    has_body = body is not None and len(body)
    if stringify(name) is None and not has_body:
        # Hello, Outlook.
        return
    file_name = safe_filename(name, default='attachment')
    file_path = self.make_work_file(file_name)
    with open(file_path, 'wb') as fh:
        if isinstance(body, str):
            body = body.encode('utf-8')
        if body is not None:
            fh.write(body)
    checksum = self.manager.store(file_path, mime_type=mime_type)
    file_path.unlink()
    child = self.manager.make_entity('Document', parent=entity)
    child.make_id(name, checksum)
    child.add('contentHash', checksum)
    child.add('fileName', name)
    child.add('mimeType', mime_type)
    self.manager.queue_entity(child)
def csv_child_iter(self, iter, name):
    out_name = safe_filename(name, default='sheet.csv', extension='csv')
    out_path = join_path(self.work_path, out_name)
    row_count = 0
    with io.open(out_path, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
        for row in iter:
            writer.writerow(row)
            row_count += 1
    name = stringify(name) or 'sheet'
    if row_count == 0:
        log.warning("Skip [%s]: no rows", name)
        return
    log.info("Generated [%s]: %s, %s rows", name, out_name, row_count)
    child_id = join_path(self.result.id, name)
    self.manager.handle_child(self.result, out_path,
                              id=child_id,
                              title=name,
                              file_name=out_name,
                              mime_type='text/csv')
def safe_file_name(self):
    """File name is a slugified version of the file title that is safe
    to use as part of a file system path."""
    return safe_filename(self.file_name, default='data')
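# Hedged usage sketch: in these projects safe_filename() typically comes from
# the `normality` package, and the default= and extension= keywords used
# throughout the snippets above are shown here. The expected outputs in the
# comments are illustrative; the exact slugging rules belong to normality.
from normality import safe_filename

print(safe_filename("Quarterly Report (final).PDF"))  # e.g. quarterly_report_final.pdf
print(safe_filename(None, default="data"))            # falls back to the default
print(safe_filename("notes", extension="csv"))        # appends the extension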