def xref_collection(stage, collection):
    """Queue a cross-reference task for each matchable entity in a collection.

    ``index.delete_xref(collection)`` is called first (presumably to clear
    earlier results -- confirm against the index module). After that, one
    ``OP_XREF_ITEM`` task is queued per entity whose schema is flagged as
    matchable, tagged with the job id of ``stage``.
    """
    index.delete_xref(collection)

    # Only schemata flagged as matchable take part in cross-referencing.
    schema_names = []
    for schema in model:
        if schema.matchable:
            schema_names.append(schema.name)

    for item in iter_entities(collection_id=collection.id,
                              schemata=schema_names):
        task_payload = {'entity_id': item.get('id')}
        queue_task(collection, OP_XREF_ITEM,
                   job_id=stage.job.id,
                   payload=task_payload)
def xref_collection(collection_id, other_id=None):
    """Cross-reference every matchable entity of a collection, inline.

    Each entity found in ``collection_id`` is handed to ``_xref_item`` with
    ``other_id`` as its ``collection_id`` argument (presumably the collection
    to match against; ``None`` likely means no restriction -- confirm against
    ``_xref_item``).
    """
    schema_names = []
    for schema in model:
        if schema.matchable:
            schema_names.append(schema.name)

    # These fields are left out of the fetched entity documents.
    skipped_fields = ['text', 'roles', 'properties.*']
    candidates = iter_entities(collection_id=collection_id,
                               schemata=schema_names,
                               excludes=skipped_fields)
    for item in candidates:
        _xref_item(item, collection_id=other_id)
def xref_collection(stage, collection, against_collection_ids=None):
    """Queue a cross-reference task for each matchable entity in a collection.

    Every queued ``OP_XREF_ITEM`` task carries the entity id together with
    ``against_collection_ids``, which is forwarded unchanged (``None`` by
    default) for the task handler to interpret.
    """
    schema_names = []
    for schema in model:
        if schema.matchable:
            schema_names.append(schema.name)

    for item in iter_entities(collection_id=collection.id,
                              schemata=schema_names):
        queue_task(
            collection,
            OP_XREF_ITEM,
            job_id=stage.job.id,
            payload={
                'entity_id': item.get('id'),
                'against_collection_ids': against_collection_ids
            },
        )
def entities(collection_id=None):
    """Stream entities as JSON, optionally restricted to one collection.

    The caller must hold the export permission. The ``schema`` query
    arguments select schemata, and the ``include`` query arguments select
    fields to return; any requested field that is also in the exclude list
    is dropped.

    When ``collection_id`` is given, read access to that collection is
    checked and the access is recorded in the audit log before streaming.
    """
    require(request.authz.can_export())
    schemata = ensure_list(request.args.getlist('schema'))
    excludes = ['text', 'roles', 'fingerprints']
    includes = ensure_list(request.args.getlist('include'))
    # An explicit include must not re-enable an excluded field.
    includes = [f for f in includes if f not in excludes]
    if collection_id is not None:
        # Pass the collection_id argument here. The bare name `id` resolves
        # to the builtin function, so the check and the audit record would
        # not refer to the requested collection.
        get_db_collection(collection_id, request.authz.READ)
        record_audit(Audit.ACT_COLLECTION, id=collection_id)
    results = iter_entities(authz=request.authz,
                            collection_id=collection_id,
                            schemata=schemata,
                            excludes=excludes,
                            includes=includes)
    return stream_ijson(results)
def generate_sitemap(collection_id):
    """Yield ``(url, updated_at)`` pairs for a collection's sitemap.xml.

    Entities are fetched with the authorization of ``Authz.from_role(None)``.
    The date is the part of ``updated_at`` before the first ``'T'``. Entities
    whose ``schemata`` contain ``Document.SCHEMA`` get a document URL; all
    others get an entity URL.
    """
    # cf. https://www.sitemaps.org/protocol.html
    anonymous = Authz.from_role(None)
    things = iter_entities(authz=anonymous,
                           collection_id=collection_id,
                           schemata=[Entity.THING],
                           includes=['schemata', 'updated_at'])
    # strictly, the limit for sitemap.xml is 50,000
    for item in islice(things, 49500):
        date_part = item.get('updated_at', '').split('T', 1)[0]
        is_document = Document.SCHEMA in item.get('schemata', [])
        make_url = document_url if is_document else entity_url
        yield (make_url(item.get('id')), date_part)
def export_collection(collection):
    """Yield the serialized RDF lines describing a collection.

    The lines for the collection's own statements (type, label, identifier
    and category) come first. They are followed, entity by entity, by the
    lines of each graph that ``export_entity`` produces for the collection's
    entities, which are fetched without their ``text`` field.
    """
    collection_uri = URIRef(ui_url('collections', collection.id))

    # Statements about the collection itself.
    header = Graph()
    statements = (
        (RDF.type, DCMI.Collection),
        (RDFS.label, Literal(collection.label)),
        (DCMI.identifier, Literal(collection.foreign_id)),
        (ALEPH.category, ALEPH[collection.category]),
    )
    for predicate, value in statements:
        header.add((collection_uri, predicate, value))
    for line in itergraph(header):
        yield line

    # One graph per entity, each linked to the collection URI.
    for item in iter_entities(collection_id=collection.id, excludes=['text']):
        entity_graph = export_entity(item, collection_uri)
        for line in itergraph(entity_graph):
            yield line
def entities(collection_id=None):
    """Stream entities as JSON, optionally restricted to one collection.

    The caller must hold the stream permission. The ``schema`` and
    ``include`` query arguments select schemata and returned fields;
    requested fields that are on the exclude list are ignored. When a
    collection is given, read access is checked and the access is recorded
    in the audit log.
    """
    require(request.authz.can_stream())
    log.debug("Stream entities [%r] begins... (coll: %s)",
              request.authz, collection_id)

    schemata = ensure_list(request.args.getlist('schema'))
    excludes = ['text', 'roles', 'fingerprints']

    # Keep only the requested fields that are not excluded.
    includes = []
    for field in ensure_list(request.args.getlist('include')):
        if field not in excludes:
            includes.append(field)

    if collection_id is not None:
        get_db_collection(collection_id, request.authz.READ)
        record_audit(Audit.ACT_COLLECTION, id=collection_id)

    return stream_ijson(iter_entities(authz=request.authz,
                                      collection_id=collection_id,
                                      schemata=schemata,
                                      excludes=excludes,
                                      includes=includes))
def entities(collection_id=None):
    """
    ---
    get:
      summary: Stream collection entities.
      description: >
        Stream a JSON form of each entity in the given collection, or
        throughout the entire database.
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      responses:
        '200':
          description: OK
          content:
            application/x-ndjson:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    # NOTE: the docstring above is an OpenAPI fragment (presumably parsed at
    # runtime to build the API spec), so explanatory notes live in comments.
    log.debug("Stream entities [%r] begins... (coll: %s)",
              request.authz, collection_id)

    schemata = ensure_list(request.args.getlist("schema"))

    # Fall back to PROXY_INCLUDES when the client requested no fields.
    requested = ensure_list(request.args.getlist("include"))
    includes = requested if requested else PROXY_INCLUDES

    # Streaming the whole database is limited to admins; streaming a single
    # collection requires write access to it.
    if collection_id is None:
        require(request.authz.is_admin)
    else:
        get_db_collection(collection_id, request.authz.WRITE)

    return stream_ijson(
        iter_entities(
            authz=request.authz,
            collection_id=collection_id,
            schemata=schemata,
            includes=includes,
        )
    )
def entities(collection_id=None):
    """
    ---
    get:
      summary: Stream collection entities.
      description: >
        Stream a JSON form of each entity in the given collection, or
        throughout the entire database.
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      responses:
        '200':
          description: OK
          content:
            application/x-ndjson:
              schema:
                type: array
                items:
                  $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    # NOTE: the docstring above is an OpenAPI fragment (presumably parsed at
    # runtime to build the API spec), so explanatory notes live in comments.
    require(request.authz.can_stream())
    log.debug("Stream entities [%r] begins... (coll: %s)",
              request.authz, collection_id)

    schemata = ensure_list(request.args.getlist('schema'))
    excludes = ['text', 'fingerprints']

    # Keep only the requested fields that are not excluded.
    includes = []
    for field in ensure_list(request.args.getlist('include')):
        if field not in excludes:
            includes.append(field)

    # A specific collection requires read access to it.
    if collection_id is not None:
        get_db_collection(collection_id, request.authz.READ)

    return stream_ijson(iter_entities(authz=request.authz,
                                      collection_id=collection_id,
                                      schemata=schemata,
                                      excludes=excludes,
                                      includes=includes))
def generate_sitemap(collection_id):
    """Yield ``(url, updated_at)`` pairs for a collection's sitemap.xml.

    Entities are fetched with the authorization of ``Authz.from_role(None)``.
    The date is the part of ``updated_at`` before the first ``'T'``, raised
    to ``settings.SITEMAP_FLOOR`` when it compares lower. Entities whose
    schema is not known to the model are skipped. Schemata derived from the
    document schema get a document URL; all others get an entity URL.
    """
    # cf. https://www.sitemaps.org/protocol.html
    document_schema = model.get(Document.SCHEMA)
    anonymous = Authz.from_role(None)
    things = iter_entities(authz=anonymous,
                           collection_id=collection_id,
                           schemata=[Entity.THING],
                           includes=['schema', 'updated_at'])
    # strictly, the limit for sitemap.xml is 50,000
    for item in islice(things, 49500):
        date_part = item.get('updated_at', '').split('T', 1)[0]
        # String comparison: never report a date below the configured floor.
        date_part = max(settings.SITEMAP_FLOOR, date_part)
        item_schema = model.get(item.get('schema'))
        if item_schema is not None:
            if item_schema.is_a(document_schema):
                make_url = document_url
            else:
                make_url = entity_url
            yield (make_url(item.get('id')), date_part)