def index_document(document):
    # Build the search-index representation of a document and push it.
    # Documents still pending ingest are removed from the index instead.
    if document.status == Document.STATUS_PENDING:
        delete_entity(document.id)
        return
    name = document.name
    log.info("Index document [%s]: %s", document.id, name)
    # Flat mapping of the document's metadata for the index payload.
    data = {
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'title': document.title,
        'name': name,
        'summary': document.summary,
        'author': document.author,
        'generator': document.generator,
        'file_size': document.file_size,
        'file_name': document.file_name,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'date': document.date,
        'authored_at': document.authored_at,
        'modified_at': document.modified_at,
        'published_at': document.published_at,
        'retrieved_at': document.retrieved_at,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        'ancestors': document.ancestors,
        # Only the count of children is indexed, not the children themselves.
        'children': document.children.count()
    }
    # Full-text material: extracted texts plus tabular column values.
    texts = list(document.texts)
    texts.extend(document.columns)
    if document.parent_id is not None:
        # Make the parent's title searchable from the child as well.
        texts.append(document.parent.title)
        data['parent'] = {
            'id': document.parent_id,
            'schema': document.parent.schema,
            'title': document.parent.title,
        }
    # Merge extracted tag fields into the payload; tag values are also
    # added to the full-text material.
    # NOTE(review): `values` is iterated twice when the field already
    # exists in `data` — assumes generate_tags yields reusable sequences,
    # not one-shot generators; confirm.
    for (field, values) in generate_tags(document):
        if field not in data:
            data[field] = list(values)
        else:
            data[field].extend(values)
        texts.extend(values)
    return index_single(document, data, texts)
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents.

    :param document_id: primary key of the document to ingest.
    :param file_path: optional local path to the payload; when None the
        file is loaded from the archive via its content hash.
    :param refresh: when True, suppress the INGEST_DOCUMENT notification
        (used for re-processing existing documents).
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        # Fix: re-use the manager acquired above instead of calling
        # get_manager() a second time.
        manager.ingest(file_path, result=result, work_path=work_path)
        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        if document.collection.casefile and not refresh:
            params = {'collection': document.collection,
                      'document': document}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)
        db.session.commit()
    except Exception:
        db.session.rollback()
        # The session was rolled back; re-fetch the document before
        # marking it as failed.
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)
        extract_document_tags(document)
        delete_entity(document.id, exclude=document.schema)
        index_document(document)
        refresh_entity(document)
def delete_entity(entity, deleted_at=None, sync=False):
    """Delete an entity and, recursively, everything that references it."""
    # Adjacent items in the index — typically child documents, or
    # directorships pointing at a person — are removed first, which makes
    # this a recursive and fairly dangerous operation.
    for neighbour in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", neighbour)
        delete_entity(neighbour, deleted_at=deleted_at, sync=sync)
    entity_id = entity.get('id')
    flush_notifications(entity_id, clazz=Entity)
    # The id may belong to either model; soft-delete whichever exists.
    for model in (Entity, Document):
        record = model.by_id(entity_id)
        if record is not None:
            record.delete(deleted_at=deleted_at)
    index.delete_entity(entity_id, sync=sync)
    refresh_entity(entity)
def delete_entity(collection, entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    # NOTE(review): `deleted_at` is only forwarded to the recursive calls
    # and never passed to obj.delete()/doc.delete() — confirm intended.
    entity_id = collection.ns.sign(entity.get("id"))
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(collection, adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity_id, clazz=Entity)
    # The id may belong to either an Entity or a Document row.
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete()
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete()
    # Purge all derived data: index entry, entity-set memberships,
    # mappings, cross-reference results and the aggregator copy.
    index.delete_entity(entity_id, sync=sync)
    EntitySetItem.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id, sync=sync)
    delete_aggregator_entity(collection, entity_id)
    refresh_entity(collection, entity_id)
def delete_document(document_id, sync=False):
    """Remove a document's records and then the document entity itself."""
    # NOTE(review): records are always deleted without sync, regardless of
    # the `sync` argument — presumably deliberate; confirm.
    delete_records(document_id=document_id, sync=False)
    delete_entity(document_id, sync=sync)
def delete_document(document_id):
    """Drop a document's extracted records, then its entity."""
    clear_records(document_id)
    delete_entity(document_id)
def delete_entity(entity, deleted_at=None):
    """Soft-delete an entity and purge it from the search index."""
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    index.delete_entity(entity.id)
def delete_entity(collection, entity, sync=False, job_id=None):
    """Delete entity from index and redis, queue full prune."""
    signed_id = collection.ns.sign(entity.get("id"))
    # Remove the visible copy immediately; the heavy cleanup (database
    # rows, mappings, cross-references, ...) runs as a background task.
    index.delete_entity(signed_id, sync=sync)
    refresh_entity(collection, signed_id)
    queue_task(collection, OP_PRUNE_ENTITY,
               job_id=job_id, entity_id=signed_id)
def delete_entity(entity, deleted_at=None, sync=False):
    """Soft-delete an entity, refresh its cache, drop its index entry."""
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    refresh_entity(entity)
    index.delete_entity(entity.id, sync=sync)
def delete_entity(entity, deleted_at=None, sync=False):
    """Soft-delete an entity and refresh its parent collection."""
    flush_notifications(entity)
    # Capture the collection before deleting, while the relationship on
    # the entity is still available.
    parent_collection = entity.collection
    entity.delete(deleted_at=deleted_at)
    index.delete_entity(entity.id, sync=sync)
    refresh_collection(parent_collection, sync=sync)
def delete_document(document_id):
    """Drop a document's records and entity, then refresh the records index."""
    clear_records(document_id)
    delete_entity(document_id)
    refresh_index(index=records_index())
def delete_entity(entity, deleted_at=None, sync=False):
    """Remove an entity: notifications, database row, cache, index."""
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    refresh_entity(entity)
    index.delete_entity(entity.id, sync=sync)
def delete_entity(entity, deleted_at=None, sync=False):
    """Soft-delete an entity and remove it from the search index."""
    flush_notifications(entity)
    entity.delete(deleted_at=deleted_at)
    index.delete_entity(entity.id, sync=sync)
    # TODO: implement recursion?
    refresh_entity(entity)