def ingest(document_id, file_path=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)

    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        manager.ingest(file_path, result=result, work_path=work_path)
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        db.session.commit()
        process_document(document)
    except Exception:
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)

    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        manager.ingest(file_path, result=result, work_path=work_path)
        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        if document.collection.casefile and not refresh:
            params = {'collection': document.collection,
                      'document': document}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)
        db.session.commit()
    except Exception:
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)

def analyze_document_id(document_id):
    """Analyze a document after looking it up by ID."""
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    analyze_document(document)

def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                       body=data, id=document.id)
        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)

def get_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    readable = [c for c in document.collection_ids
                if authz.collection_read(c)]
    authz.require(len(readable))
    return document

def test_upload_csv_doc(self):
    _, headers = self.login(is_admin=True)
    meta = {
        "countries": ["de", "usa"],
        "languages": ["en"],
        "mime_type": "text/csv",
        "source_url": "http://pudo.org/experts.csv",
    }
    csv_path = self.get_fixture_path("experts.csv")
    data = {
        "meta": json.dumps(meta),
        "foo": open(csv_path, "rb"),
    }
    res = self.client.post(self.url, data=data, headers=headers)
    assert res.status_code == 201, (res, res.data)
    assert "id" in res.json, res.json
    db_id, _ = res.json.get("id").split(".", 1)
    doc = Document.by_id(db_id)
    assert doc.schema == Document.SCHEMA, doc.schema
    assert doc.meta["countries"] == ["de", "us"], doc.meta
    assert doc.meta["languages"] == ["eng"], doc.meta
    status = get_status(self.col)
    assert status.get("pending") == 1, status
    job = status.get("jobs")[0]
    assert job.get("pending") == 1, job
    stage = job.get("stages")[0]
    assert stage.get("stage") == OP_INGEST, stage
    assert stage.get("pending") == 1, stage

def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT,
             body=data, id=document.id)
    clear_children(document)
    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)

def index_document_id(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    index_document(document)
    index_records(document)

def prune_entity(collection, entity_id=None, job_id=None):
    """Prune handles the full deletion of an entity outside of the HTTP
    request cycle. This involves cleaning up adjacent entities like xref
    results, notifications and so on."""
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    log.info("[%s] Prune entity: %s", collection, entity_id)
    for adjacent in index.iter_adjacent(collection.id, entity_id):
        log.warning("Recursive delete: %s", adjacent.get("id"))
        delete_entity(collection, adjacent, job_id=job_id)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id)
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=entity_id)
    refresh_entity(collection, entity_id)
    collection.touch()
    db.session.commit()

def get_document(document_id, action=Authz.READ):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    collections = request.authz.collections.get(action)
    request.authz.require(document.collection_id in collections)
    return document

def test_upload_csv_doc(self):
    _, headers = self.login(is_admin=True)
    meta = {
        'countries': ['de', 'us'],
        'languages': ['en'],
        'mime_type': 'text/csv',
        'source_url': 'http://pudo.org/experts.csv'
    }
    csv_path = self.get_fixture_path('experts.csv')
    data = {
        'meta': json.dumps(meta),
        'foo': open(csv_path, 'rb'),
    }
    res = self.client.post(self.url, data=data, headers=headers)
    assert res.status_code == 201, (res, res.data)
    assert 'id' in res.json, res.json
    doc = Document.by_id(res.json.get('id'))
    assert doc.schema == Document.SCHEMA, doc.schema
    status = get_status(self.col)
    assert status.get('pending') == 1, status
    op = status.get('operations')[0]
    assert op.get('pending') == 1, op
    assert op.get('operation') == OP_INGEST, op

def test_upload_csv_doc(self):
    _, headers = self.login(is_admin=True)
    meta = {
        'countries': ['de', 'usa'],
        'languages': ['en'],
        'mime_type': 'text/csv',
        'source_url': 'http://pudo.org/experts.csv'
    }
    csv_path = self.get_fixture_path('experts.csv')
    data = {
        'meta': json.dumps(meta),
        'foo': open(csv_path, 'rb'),
    }
    res = self.client.post(self.url, data=data, headers=headers)
    assert res.status_code == 201, (res, res.data)
    assert 'id' in res.json, res.json
    db_id, _ = res.json.get('id').split('.', 1)
    doc = Document.by_id(db_id)
    assert doc.schema == Document.SCHEMA, doc.schema
    assert doc.meta['countries'] == ['de', 'us'], doc.meta
    assert doc.meta['languages'] == ['eng'], doc.meta
    status = get_status(self.col)
    assert status.get('pending') == 1, status
    job = status.get('jobs')[0]
    assert job.get('pending') == 1, job
    stage = job.get('stages')[0]
    assert stage.get('stage') == OP_INGEST, stage
    assert stage.get('pending') == 1, stage

def ingest(document_id, user_queue=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    get_manager().ingest_document(document, user_queue=user_queue)

def analyze_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        cls().analyze(document, document.meta)
    index_document(document_id)

def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for cls in get_analyzers():
            cls().analyze(document, document.meta)
    except Exception as ex:
        log.exception(ex)
    index_document(document_id)

def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    get_manager().ingest_document(document, role_id=role_id)
    pending = Document.pending_count(collection_id=document.collection.id)
    if pending == 0:
        ingest_complete(document.collection, role_id=role_id)

def _load_parent(collection_id, meta):
    """Determine the parent document for the document that is to be
    ingested."""
    parent_id = meta.get('parent_id')
    if parent_id is None:
        return
    parent = Document.by_id(parent_id, collection_id=collection_id)
    if parent is None:
        raise BadRequest(response=jsonify({
            'status': 'error',
            'message': 'Cannot load parent document'
        }, status=400))
    return parent_id

def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)

def _load_parent(collection, meta):
    # Determine the parent document for the document that is to be
    # ingested. This can either be specified using a document ID,
    # or using a foreign ID (because the document ID may not be as
    # easily accessible to the client).
    if 'parent' not in meta:
        return
    data = meta.get('parent')
    parent = None
    if not is_mapping(data):
        parent = Document.by_id(data, collection_id=collection.id)
    elif 'id' in data:
        parent = Document.by_id(data.get('id'),
                                collection_id=collection.id)
    elif 'foreign_id' in data:
        parent = Document.by_keys(collection=collection,
                                  foreign_id=data.get('foreign_id'))
    if parent is None:
        raise BadRequest(response=jsonify({
            'status': 'error',
            'message': 'Cannot load parent document'
        }, status=400))
    return parent.id

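# Illustrative only: the shapes of meta['parent'] that the variant above
# accepts, a bare document ID, a mapping with an 'id', or a mapping with a
# 'foreign_id'. The values below are made up for the example.
parent_by_bare_id = {'parent': 42}
parent_by_id = {'parent': {'id': 42}}
parent_by_foreign_id = {'parent': {'foreign_id': 'archive.zip/report.pdf'}}
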
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document)
    bulk_op(generate_records(document))

def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document)
    bulk(get_es(), generate_records(document), stats_only=True,
         chunk_size=2000, request_timeout=60.0)

def _load_parent(collection, meta):
    """Determine the parent document for the document that is to be
    ingested."""
    parent = ensure_dict(meta.get("parent"))
    parent_id = meta.get("parent_id", parent.get("id"))
    if parent_id is None:
        return
    parent = Document.by_id(parent_id, collection=collection)
    if parent is None:
        raise BadRequest(
            response=jsonify(
                {"status": "error", "message": "Cannot load parent document"},
                status=400,
            )
        )
    return parent

def delete_entity(entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity.get('id'), clazz=Entity)
    obj = Entity.by_id(entity.get('id'))
    if obj is not None:
        obj.delete(deleted_at=deleted_at)
    doc = Document.by_id(entity.get('id'))
    if doc is not None:
        doc.delete(deleted_at=deleted_at)
    index.delete_entity(entity.get('id'), sync=sync)
    refresh_entity(entity)

def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    get_manager().ingest_document(document, role_id=role_id)

    # is this too often?
    from aleph.logic.collections import update_collection
    update_collection(document.collection)

    from aleph.logic.notifications import publish
    params = {'document': document,
              'collection': document.collection}
    publish(Events.INGEST_DOCUMENT, actor_id=role_id, params=params)

def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)

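# Illustrative only: how the per-entity regex built above behaves on
# normalized text. The terms and the sample mentions are hypothetical;
# in the function above they come from entity.regex_terms and
# scan_entity_mentions respectively.
import re
from collections import defaultdict

regex_terms = ['acme corp', 'acme holding']
rex = re.compile('( |^)(%s)( |$)' % '|'.join(regex_terms))

mentions = {1: 'payment to acme corp in 2014',
            2: 'unrelated text'}
documents = defaultdict(int)
for document_id, text in mentions.items():
    for match in rex.finditer(text):
        documents[document_id] += 1
# documents is now {1: 1}; each count becomes the weight of a Reference row.
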
def ingest_url(self, document_id, url):
    """Load the given URL into the document specified by document_id."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    tmp_path = make_tempfile(document.file_name, suffix=document.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 500:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
            return
        if res.status_code >= 400:
            document.status = Document.STATUS_FAIL
            document.error_message = "HTTP %s: %s" % (res.status_code, url)
            db.session.commit()
            return
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not document.has_meta('source_url'):
            document.source_url = res.url
        if not document.foreign_id:
            document.foreign_id = res.url
        document.headers = res.headers
        document.content_hash = archive.archive_file(tmp_path)
        db.session.commit()
        get_manager().ingest_document(document)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        document.status = Document.STATUS_FAIL
        document.error_type = type(ex).__name__
        document.error_message = six.text_type(ex)
        db.session.commit()
        log.exception(ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)

def ingest(document_id, role_id=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    get_manager().ingest_document(document, role_id=role_id)

    if document.collection.casefile:
        index_collection(document.collection)
        params = {
            'document': document,
            'collection': document.collection
        }
        publish(Events.INGEST_DOCUMENT, actor_id=role_id, params=params)

def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)

    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)

def generate_entity_references(entity):
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return

    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('(%s)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = match_form(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception as ex:
        log.exception(ex)

    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    entity.delete_references(origin='regex')
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = doc.id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity)

def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)

    analyzers = []
    meta = document.meta
    for cls in get_analyzers():
        try:
            analyzer = cls(document, meta)
            analyzer.prepare()
            analyzers.append(analyzer)
        except Exception as ex:
            log.exception(ex)

    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            for analyzer in analyzers:
                analyzer.on_page(page)
            for text in page.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            for analyzer in analyzers:
                analyzer.on_record(record)
            for text in record.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    for analyzer in analyzers:
        try:
            analyzer.finalize()
        except Exception as ex:
            log.exception(ex)

    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)

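# Sketch of the analyzer interface the streaming variant above appears to
# expect. The method names are taken from the calls in analyze_document;
# the class itself is hypothetical and does nothing.
class NoopAnalyzer(object):

    def __init__(self, document, meta):
        self.document = document
        self.meta = meta

    def prepare(self):
        # Set up any per-document state before pages/records are streamed.
        pass

    def on_page(self, page):
        pass

    def on_record(self, record):
        pass

    def on_text(self, text):
        pass

    def finalize(self):
        # Persist whatever the analyzer collected; called once per document.
        pass
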
def delete_entity(collection, entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    entity_id = collection.ns.sign(entity.get("id"))
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(collection, adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    index.delete_entity(entity_id, sync=sync)
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id, sync=sync)
    delete_aggregator_entity(collection, entity_id)
    refresh_entity(collection, entity_id)

def index_document_id(document_id, index_records=True):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    index_document(document)

def analyze_document_id(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    analyze_document(document)

def process_document_id(document_id):
    """Perform post-ingest tasks like analysis and indexing."""
    analyze_document(Document.by_id(document_id))

def get_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    authz.require(authz.source_read(document.source_id))
    return document

def get_db_document(document_id, action=Authz.READ):
    document = obj_or_404(Document.by_id(document_id))
    require(request.authz.can(document.collection_id, action))
    return document