def ingest_upload(collection_id):
    """Upload a file into a collection, or create an empty folder.

    Saves the uploaded payload to a temporary directory, upserts a
    ``Document`` keyed on (collection, parent, foreign_id, content_hash)
    and hands it to the ingest pipeline.  When the request carries no
    file, a folder document is created instead.
    """
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    # Scratch directory for the uploaded payload; removed in the finally
    # clause so failed ingests do not leak temporary files.
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        # NOTE(review): if multiple files are posted, only the last one is
        # ingested -- path/content_hash are overwritten on each pass.
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            # No file was uploaded: treat this request as folder creation.
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path,
                        role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)
    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def ingest_upload(collection_id):
    """Upload a set of files into a collection (legacy variant).

    Validates the ``meta`` form field against the metadata schema, stores
    each file into the upload folder, upserts a ``Document`` keyed by
    content hash and queues it for ingest.
    """
    collection = obj_or_404(Collection.by_id(collection_id))
    request.authz.require(request.authz.collection_write(collection.id))
    log_event(request)
    # Tag all documents from this request with a shared crawler-run ID.
    crawler_run = make_textid()
    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(unicode(ex))
    documents = []
    for storage in request.files.values():
        sec_fn = os.path.join(upload_folder,
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        content_hash = checksum(sec_fn)
        document = Document.by_keys(collection=collection,
                                    content_hash=content_hash)
        document.crawler = 'user_upload:%s' % request.authz.role.id
        document.crawler_run = crawler_run
        document.mime_type = storage.mimetype
        document.file_name = storage.filename
        # NOTE(review): 'meta' is parsed again for every file, shadowing
        # the parse above; the early parse acts only as request validation.
        try:
            meta = json.loads(request.form.get('meta', '{}'))
            validate(meta, 'metadata.json#')
            document.meta.update(meta)
        except Exception as ex:
            raise BadRequest(unicode(ex))
        ingest_document(document, sec_fn, user_queue=True)
        # Drop the temporary upload -- presumably ingest has taken its own
        # copy of the file by now; verify against ingest_document.
        os.unlink(sec_fn)
        documents.append(document)
    return jsonify({'status': 'ok', 'documents': documents})
def crawldir(path, language=None, country=None, foreign_id=None):
    """Crawl the given directory into a (possibly new) collection."""
    path = decode_path(path)
    if path is None or not os.path.exists(path):
        log.error("Invalid path: %r", path)
        return
    path = os.path.abspath(os.path.normpath(path))
    base_name = os.path.basename(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)
    # Re-use an existing collection for this directory, or make one.
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        collection = Collection.create({
            'foreign_id': foreign_id,
            'label': base_name,
            'managed': True
        })
    if language is not None:
        collection.languages = [language]
    if country is not None:
        collection.countries = [country]
    db.session.commit()
    update_collection(collection)
    log.info('Crawling %r to %r...', path, collection.foreign_id)
    root = Document.by_keys(collection=collection, foreign_id=path)
    root.file_name = base_name
    ingest_document(root, path)
def ingest_upload(id):
    """Upload one or more files into a collection, or create a folder.

    Each file is saved to a temporary directory, upserted as a
    ``Document`` and passed to the ingest pipeline.  A request without
    files creates an empty folder document instead.
    """
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    # Scratch directory for uploads; removed in the finally clause.
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            ingest_document(document, path,
                            role_id=request.authz.id)
            documents.append(document)
        if not len(request.files):
            # If there is no files uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the most simple way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            ingest_document(document, None,
                            role_id=request.authz.id,
                            shallow=True)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)
    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
def test_load_sample_directory(self):
    """Ingesting the samples directory yields one document per entry."""
    fixture_dir = self.get_fixture_path('samples')
    doc = Document.by_keys(collection_id=self.collection.id,
                           foreign_id='samples')
    db.session.commit()
    db.session.refresh(doc)
    ingest_document(doc, fixture_dir)
    count = Document.all().count()
    assert count == 5, count
def test_load_pdf_file(self):
    """A single PDF fixture should produce exactly one document."""
    fixture = self.get_fixture_path('demo.pdf')
    doc = Document.by_keys(collection_id=self.collection.id,
                           foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(doc)
    ingest_document(doc, fixture)
    count = Document.all().count()
    assert count == 1, count
def test_load_sample_directory(self):
    """Ingesting the samples directory yields one document per entry."""
    fixture_dir = self.get_fixture_path('samples')
    doc = Document.by_keys(collection=self.collection,
                           foreign_id='samples')
    db.session.commit()
    db.session.refresh(doc)
    ingest_document(doc, fixture_dir)
    count = Document.all().count()
    assert count == 5, count
def test_load_pdf_file(self):
    """A single PDF fixture should produce exactly one document."""
    fixture = self.get_fixture_path('demo.pdf')
    doc = Document.by_keys(collection=self.collection,
                           foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(doc)
    ingest_document(doc, fixture)
    count = Document.all().count()
    assert count == 1, count
def create_document(self, foreign_id=None, content_hash=None):
    """Upsert a pending Document tagged with this crawler's run ID."""
    doc = Document.by_keys(collection=self.collection,
                           foreign_id=foreign_id,
                           content_hash=content_hash)
    doc.crawler = self.get_id()
    doc.crawler_run = self.crawler_run
    doc.status = Document.STATUS_PENDING
    return doc
def handle_child(self, parent, file_path, title=None, mime_type=None,
                 id=None, file_name=None):
    """Register a child document extracted from *parent* and ingest it."""
    file_path = decode_path(file_path)
    assert id is not None, (parent, file_path)
    parent_doc = parent.document
    child = Document.by_keys(parent_id=parent_doc.id,
                             collection=parent_doc.collection,
                             foreign_id=id)
    # Prefer explicit arguments, falling back to previously stored meta.
    child.title = title or child.meta.get('title')
    child.file_name = file_name or child.meta.get('file_name')
    child.mime_type = mime_type or child.meta.get('mime_type')
    # Function-scope import, presumably to avoid a circular dependency.
    from aleph.logic.documents.ingest import ingest_document
    ingest_document(child, file_path, role_id=parent_doc.uploader_id)
    return DocumentResult(self, child, file_path=file_path)
def handle_child(self, parent, file_path, title=None, mime_type=None,
                 id=None, file_name=None):
    """Create a child document of *parent* and pass it to ingest."""
    file_path = decode_path(file_path)
    file_name = decode_path(file_name) or os.path.basename(file_path)
    # Directories carry no content hash; regular files are checksummed.
    content_hash = None
    if not os.path.isdir(file_path):
        content_hash = checksum(file_path)
    child = Document.by_keys(parent_id=parent.document.id,
                             collection=parent.document.collection,
                             foreign_id=id,
                             content_hash=content_hash)
    # Prefer explicit arguments, falling back to previously stored meta.
    child.title = title or child.meta.get('title')
    child.file_name = file_name or child.meta.get('file_name')
    child.mime_type = mime_type or child.meta.get('mime_type')
    # Function-scope import, presumably to avoid a circular dependency.
    from aleph.ingest import ingest_document
    ingest_document(child, file_path, user_queue=parent.user_queue)
def test_load_csv_file(self):
    """CSV ingest produces one document with 14 records, deletable."""
    csv_path = self.get_fixture_path('experts.csv')
    doc = Document.by_keys(collection=self.collection,
                           foreign_id='experts.csv')
    db.session.commit()
    db.session.refresh(doc)
    ingest_document(doc, csv_path)
    count = Document.all().count()
    assert count == 1, count
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    first = records[0]
    assert str(first.id) in repr(first), repr(first)
    assert 'nationality' in first.data, first.data
    assert 'name' in first.data, first.data
    first.document.delete_records()
    remaining = db.session.query(DocumentRecord).all()
    assert len(remaining) == 0, len(remaining)
def test_load_csv_file(self):
    """CSV ingest produces one document with 14 records, deletable."""
    csv_path = self.get_fixture_path('experts.csv')
    doc = Document.by_keys(collection_id=self.collection.id,
                           foreign_id='experts.csv')
    doc.file_name = 'experts.csv'
    db.session.commit()
    db.session.refresh(doc)
    ingest_document(doc, csv_path)
    count = Document.all().count()
    assert count == 1, count
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    first = records[0]
    assert str(first.id) in repr(first), repr(first)
    assert 'nationality' in first.data, first.data
    assert 'name' in first.data, first.data
    first.document.delete_records()
    remaining = db.session.query(DocumentRecord).all()
    assert len(remaining) == 0, len(remaining)
def crawldir(path, language=None, foreign_id=None):
    """Crawl the given directory.

    :param path: file-system path of the directory to ingest.
    :param language: optional language hint stored on the collection.
    :param foreign_id: optional collection foreign ID; derived from the
        slugified path when not given.
    """
    # BUG FIX: decode and validate the path BEFORE normalising it.  The
    # original called os.path.normpath(path) first, so a None path raised
    # a TypeError instead of reaching the "Invalid path" error below.
    # This ordering also matches the sibling crawldir implementation.
    path = decode_path(path)
    if path is None or not os.path.exists(path):
        log.error("Invalid path: %r", path)
        return
    path = os.path.abspath(os.path.normpath(path))
    path_name = os.path.basename(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)
    collection = create_collection(foreign_id, {
        'label': path_name,
        'languages': language
    })
    log.info('Crawling %s to %s...', path, foreign_id)
    document = Document.by_keys(collection=collection, foreign_id=path)
    document.file_name = path_name
    db.session.commit()
    ingest_document(document, path)
def _load_parent(collection, meta):
    """Determine the parent document for the document to be ingested.

    The parent may be specified as a plain document ID, or as a mapping
    carrying an 'id' or a 'foreign_id' key (the latter because the
    document ID may not be as easily accessible to the client).
    Returns the parent's ID, or None when no parent was requested.
    """
    if 'parent' not in meta:
        return
    spec = meta.get('parent')
    parent = None
    if not is_mapping(spec):
        parent = Document.by_id(spec, collection_id=collection.id)
    elif 'id' in spec:
        parent = Document.by_id(spec.get('id'),
                                collection_id=collection.id)
    elif 'foreign_id' in spec:
        parent = Document.by_keys(collection=collection,
                                  foreign_id=spec.get('foreign_id'))
    if parent is None:
        raise BadRequest(response=jsonify({
            'status': 'error',
            'message': 'Cannot load parent document'
        }, status=400))
    return parent.id