def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    # Make sure collection counts are always accurate.
    update_document(document, shallow=True, sync=sync)
    return jsonify({'status': 'ok', 'id': stringify(document.id)}, status=201)

def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)

    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)

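# A minimal usage sketch for the single-document upload handler above,
# written against Flask's test client. The route
# '/api/2/collections/<collection_id>/ingest' and the shape of the auth
# headers are assumptions for illustration; they are not shown in this
# excerpt.
import io
import json

def upload_single_file(client, collection_id, auth_headers):
    data = {
        # Parsed by _load_metadata(); 'file_name' feeds safe_filename().
        'meta': json.dumps({'file_name': 'demo.pdf'}),
        # One multipart file field per uploaded document.
        'file': (io.BytesIO(b'%PDF-1.4 fake content'), 'demo.pdf'),
    }
    res = client.post('/api/2/collections/%s/ingest' % collection_id,
                      data=data,
                      headers=auth_headers,
                      content_type='multipart/form-data')
    assert res.status_code == 201, res.status_code
    return res.json['id']
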
def test_load_sample_directory(self):
    samples_path = self.get_fixture_path('samples')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='samples')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, samples_path)
    assert Document.all().count() == 5, Document.all().count()

def test_load_pdf_file(self):
    pdf_path = self.get_fixture_path('demo.pdf')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, pdf_path)
    assert Document.all().count() == 1, Document.all().count()

def test_load_sample_directory(self):
    samples_path = self.get_fixture_path('samples')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='samples')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, samples_path)
    assert Document.all().count() == 5, Document.all().count()

def test_load_pdf_file(self):
    pdf_path = self.get_fixture_path('demo.pdf')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, pdf_path)
    assert Document.all().count() == 1, Document.all().count()

def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, create an empty directory
            # instead. Maybe this should be more explicit, but it
            # seemed like the simplest way of fitting it into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)
    refresh_index(index=entities_index())
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })

def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection_id=collection.id,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.schema = Document.SCHEMA
            ingest_document(document, path,
                            role_id=request.authz.id,
                            content_hash=content_hash)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, create an empty directory
            # instead. Maybe this should be more explicit, but it
            # seemed like the simplest way of fitting it into the API.
            document = Document.by_keys(collection_id=collection.id,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.update(meta)
            document.schema = Document.SCHEMA_FOLDER
            ingest_document(document, None, role_id=request.authz.id)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    # Make sure collection counts are always accurate.
    if get_flag('sync'):
        for document in documents:
            update_document(document, shallow=True, sync=True)
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })

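# The no-files branch above doubles as a folder-creation API: POSTing
# only metadata, with no multipart file parts, yields a SCHEMA_FOLDER
# document. A hedged sketch using the same assumed route and headers as
# the earlier upload example; the 'parent' metadata key is also an
# assumption, since _load_parent()'s exact contract is not shown in
# this excerpt.
import json

def create_folder(client, collection_id, auth_headers, title,
                  parent_id=None):
    meta = {'file_name': title}
    if parent_id is not None:
        # Assumed key read by _load_parent() to nest the new folder.
        meta['parent'] = {'id': parent_id}
    res = client.post('/api/2/collections/%s/ingest' % collection_id,
                      data={'meta': json.dumps(meta)},
                      headers=auth_headers)
    # This version of the view returns a plain 200 with the documents.
    assert res.status_code == 200, res.status_code
    return res.json['documents'][0]
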
def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='experts.csv')
    document.file_name = 'experts.csv'
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='experts.csv')
    document.file_name = 'experts.csv'
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def crawldir(path, language=None, foreign_id=None):
    """Crawl the given directory."""
    path = decode_path(os.path.abspath(os.path.normpath(path)))
    if path is None or not os.path.exists(path):
        log.error("Invalid path: %r", path)
        return
    path_name = os.path.basename(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)
    collection = create_collection(foreign_id, {
        'label': path_name,
        'languages': language
    })
    log.info('Crawling %s to %s...', path, foreign_id)
    document = Document.by_keys(collection=collection, foreign_id=path)
    document.file_name = path_name
    db.session.commit()
    ingest_document(document, path)

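# A minimal driver for crawldir, assuming it is registered as a manage.py
# command (the CLI decorator is not shown in this excerpt). create_app()
# and the explicit application context are also assumptions based on a
# conventional Flask setup:
def crawl_sample_directory():
    app = create_app()
    with app.app_context():
        # Re-runs reuse the same collection, since foreign_id defaults
        # to 'directory:<slug-of-absolute-path>'.
        crawldir('/data/samples', language='en')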