def generate_sitemap(collection_id):
    """Yield ``(url, updated_at)`` pairs for a collection's sitemap.xml.

    Scans the entities index for hits that match the given collection,
    the ``Entity.THING`` schema and the filter produced by
    ``authz_query(Authz.from_role(None))`` (presumably the anonymous,
    publicly visible view -- confirm against ``Authz``).

    Each yielded tuple holds the UI URL of the hit and the part of its
    ``updated_at`` value that precedes the first ``'T'``.

    cf. https://www.sitemaps.org/protocol.html
    """
    filters = [
        {'term': {'collection_id': collection_id}},
        {'term': {'schemata': Entity.THING}},
        authz_query(Authz.from_role(None)),
    ]
    body = {
        'query': {'bool': {'filter': filters}},
        # Only the fields needed below are fetched.
        '_source': {'includes': ['schemata', 'updated_at']},
    }
    hits = scan(es, index=entities_index(), query=body)
    # Strictly, the limit for sitemap.xml is 50,000; stay a bit below it.
    for hit in islice(hits, 49500):
        fields = hit.get('_source', {})
        stamp = fields.get('updated_at', '')
        day = stamp.split('T', 1)[0]
        is_document = Document.SCHEMA in fields.get('schemata', [])
        build_url = document_url if is_document else entity_url
        yield (build_url(hit.get('_id')), day)
def _resp_canonical(resp, document_id):
    """Set a ``Link: <...>; rel="canonical"`` header on ``resp``.

    EXPERIMENTAL HACK: the idea here is to tell search engines that they
    should not index source documents, but instead go for the UI version
    of the site.

    The header target is ``document_url(document_id)``. The same response
    object is returned after its headers have been modified in place.
    """
    canonical = document_url(document_id)
    resp.headers['Link'] = '<%s>; rel="canonical"' % canonical
    return resp
def generate_sitemap(collection_id):
    """Yield ``(url, updated_at)`` pairs for a collection's sitemap.xml.

    Entities come from ``iter_entities``, restricted to the given
    collection, the ``Entity.THING`` schema and the authorization object
    ``Authz.from_role(None)`` (presumably the anonymous role -- confirm
    against ``Authz``).

    Each yielded tuple holds the UI URL of the entity and the part of its
    ``updated_at`` value that precedes the first ``'T'``.

    cf. https://www.sitemaps.org/protocol.html
    """
    found = iter_entities(
        authz=Authz.from_role(None),
        collection_id=collection_id,
        schemata=[Entity.THING],
        includes=['schemata', 'updated_at'],
    )
    # Strictly, the limit for sitemap.xml is 50,000; stay a bit below it.
    for item in islice(found, 49500):
        stamp = item.get('updated_at', '')
        day = stamp.split('T', 1)[0]
        is_document = Document.SCHEMA in item.get('schemata', [])
        build_url = document_url if is_document else entity_url
        yield (build_url(item.get('id')), day)
def document_links(self, data, pk, schemata):
    """Build the mapping of link names to URLs for a document.

    ``self``, ``tags`` and ``ui`` are always present. The remaining keys
    are added conditionally:

    * ``file`` -- when ``data`` has a truthy ``content_hash``.
    * ``pdf`` -- when ``schemata`` contains ``Document.SCHEMA_PDF``.
    * ``records`` -- when ``schemata`` contains ``Document.SCHEMA_PDF``
      or ``Document.SCHEMA_TABLE``.
    * ``children`` -- when ``schemata`` contains
      ``Document.SCHEMA_FOLDER``; the URL filters the document index on
      ``parent.id``.

    ``schemata`` must offer an ``intersection`` method (e.g. a ``set``).
    ``self`` is not used by this method.
    """
    def has_any(*candidates):
        # True when at least one candidate schema is in ``schemata``.
        return bool(schemata.intersection(list(candidates)))

    links = {}
    links['self'] = url_for('documents_api.view', document_id=pk)
    links['tags'] = url_for('entities_api.tags', id=pk)
    links['ui'] = document_url(pk)

    if data.get('content_hash'):
        links['file'] = url_for('documents_api.file',
                                document_id=pk,
                                _authorize=True)

    if has_any(Document.SCHEMA_PDF):
        links['pdf'] = url_for('documents_api.pdf',
                               document_id=pk,
                               _authorize=True)

    if has_any(Document.SCHEMA_PDF, Document.SCHEMA_TABLE):
        links['records'] = url_for('documents_api.records', document_id=pk)

    if has_any(Document.SCHEMA_FOLDER):
        parent_filter = (('filter:parent.id', pk),)
        links['children'] = url_for('documents_api.index',
                                    _query=parent_filter)

    return links
def generate_sitemap(collection_id):
    """Yield ``(url, updated_at)`` pairs for a collection's sitemap.xml.

    Entities come from ``iter_entities``, restricted to the given
    collection, the ``Entity.THING`` schema and the authorization object
    ``Authz.from_role(None)`` (presumably the anonymous role -- confirm
    against ``Authz``).

    For each entity the ``updated_at`` value is cut at the first ``'T'``
    and then raised to at least ``settings.SITEMAP_FLOOR`` (string
    comparison via ``max``). Entities whose ``schema`` is unknown to
    ``model`` are skipped. Those whose schema ``is_a`` document schema
    get a document URL, all others an entity URL.

    cf. https://www.sitemaps.org/protocol.html
    """
    document_schema = model.get(Document.SCHEMA)
    found = iter_entities(
        authz=Authz.from_role(None),
        collection_id=collection_id,
        schemata=[Entity.THING],
        includes=['schema', 'updated_at'],
    )
    # Strictly, the limit for sitemap.xml is 50,000; stay a bit below it.
    for item in islice(found, 49500):
        stamp = item.get('updated_at', '')
        day = stamp.split('T', 1)[0]
        day = max(settings.SITEMAP_FLOOR, day)
        schema = model.get(item.get('schema'))
        if schema is not None:
            if schema.is_a(document_schema):
                build_url = document_url
            else:
                build_url = entity_url
            yield (build_url(item.get('id')), day)
def resolve_id(object_id, clazz):
    """Resolve an object ID into a label and link for a notification.

    From an object ID and class type, generate a human-readable label and
    a link that can be rendered into the notification.

    Args:
        object_id: identifier of the referenced object.
        clazz: one of ``Role``, ``Alert``, ``Collection``, ``Document`` or
            ``Entity``; selects how ``object_id`` is looked up.

    Returns:
        A ``(label, link)`` tuple. ``link`` is ``None`` for roles and
        alerts. ``(None, None)`` is returned when ``clazz`` is not
        handled or when the lookup finds nothing.
    """
    if clazz == Role:
        role = Role.by_id(object_id)
        # Guard a failed lookup like the Collection branch does, so a
        # stale ID yields (None, None) instead of an AttributeError.
        if role is not None:
            return role.name, None
    elif clazz == Alert:
        alert = Alert.by_id(object_id)
        if alert is not None:
            return alert.query, None
    elif clazz == Collection:
        collection = Collection.by_id(object_id)
        if collection is not None:
            return collection.label, collection_url(object_id)
    elif clazz in [Document, Entity]:
        entity = get_entity(object_id)
        if entity is not None:
            # ``schemata`` may be absent or null; treat both as empty so
            # the membership test cannot raise a TypeError.
            schemata = entity.get('schemata') or []
            if Document.SCHEMA in schemata:
                title = entity.get('title', entity.get('file_name'))
                return title, document_url(object_id)
            return entity.get('name'), entity_url(object_id)
    return None, None
def sitemap(id):
    """Render the sitemap.xml document for one collection.

    The collection is loaded through ``get_db_collection`` with
    ``request.authz.READ`` (presumably enforcing read access -- confirm
    against that helper). Each entity from ``get_sitemap_entities``
    becomes a ``(url, updated_at)`` entry: ``updated_at`` is cut at the
    first ``'T'`` and raised to at least ``settings.SITEMAP_FLOOR``.
    Entities whose ``schema`` is unknown to ``model`` are skipped.

    The template also receives the collection's own URL and the ISO date
    of its ``updated_at``.

    cf. https://www.sitemaps.org/protocol.html
    """
    collection = get_db_collection(id, request.authz.READ)
    document_schema = model.get(Document.SCHEMA)

    def iter_entries():
        # Produce one (url, date) pair per entity with a known schema.
        for item in get_sitemap_entities(id):
            stamp = item.get('updated_at', '')
            day = stamp.split('T', 1)[0]
            day = max(settings.SITEMAP_FLOOR, day)
            schema = model.get(item.get('schema'))
            if schema is None:
                continue
            if schema.is_a(document_schema):
                build_url = document_url
            else:
                build_url = entity_url
            yield (build_url(item.get('id')), day)

    # Materialise all entries before building the collection-level values.
    entries = list(iter_entries())
    url = collection_url(collection_id=collection.id)
    updated_at = collection.updated_at.date().isoformat()
    return render_xml('sitemap.xml',
                      url=url,
                      updated_at=updated_at,
                      entries=entries)