def view(entity_id):
    """
    ---
    get:
      summary: Get an entity
      description: Return the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    enable_cache()
    # Look the entity up in the index, passing the READ action and
    # excluding the 'text' and 'numeric.*' fields from the result.
    data = get_index_entity(
        entity_id,
        request.authz.READ,
        excludes=['text', 'numeric.*'],
    )
    tag_request(collection_id=data.get('collection_id'))
    proxy = model.get_proxy(data)
    # The HTML body is passed through sanitize_html() together with its
    # source URL and encoding; the result is attached as 'safeHtml'.
    data['safeHtml'] = sanitize_html(
        proxy.first('bodyHtml', quiet=True),
        proxy.first('sourceUrl', quiet=True),
        encoding=proxy.first('encoding', quiet=True),
    )
    data['shallow'] = False
    return EntitySerializer.jsonify(data)
def content(document_id):
    """Return a document's headers, text and sanitized HTML as JSON.

    The document is loaded via ``get_db_document`` and the access is
    recorded with ``record_audit`` before the response is built.
    """
    enable_cache()
    doc = get_db_document(document_id)
    record_audit(Audit.ACT_ENTITY, id=document_id)
    payload = {
        'headers': doc.headers,
        'text': doc.body_text,
        # The raw body is sanitized relative to the document's source URL.
        'html': sanitize_html(doc.body_raw, doc.source_url),
    }
    return jsonify(payload)
def view(document_id):
    """Return the indexed document data extended with headers and content.

    The index record is combined with the database record's headers;
    'html' and 'text' keys are added only when the matching schema name
    is present on the document's model.
    """
    enable_cache()
    payload = get_index_document(document_id)
    record = get_db_document(document_id)
    payload['headers'] = record.headers
    # TODO: should this be its own API? Probably so, but then it is unclear
    # whether the content should be JSON-wrapped or served plain with the
    # correct MIME type.
    if Document.SCHEMA_HTML in record.model.names:
        payload['html'] = sanitize_html(record.body_raw, record.source_url)
    if Document.SCHEMA_TEXT in record.model.names:
        payload['text'] = record.body_text
    return jsonify(payload, schema=CombinedSchema)
def content(entity_id):
    """
    ---
    get:
      summary: Get the content of an entity
      description: >
        Return the text and/or html content of the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      responses:
        '200':
          content:
            application/json:
              schema:
                properties:
                  headers:
                    type: object
                  html:
                    type: string
                  text:
                    type: string
                type: object
          description: OK
        '404':
          description: Not Found
      tags:
      - Entity
    """
    enable_cache()
    indexed = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=indexed.get('collection_id'))
    # Fetch the entity again by id, restricted to its schema and without
    # the 'text' field.
    matches = entities_by_ids(
        [entity_id],
        schemata=indexed.get('schema'),
        excludes=['text'],
    )
    for found in matches:
        # Only the first result is used: the loop body always returns.
        proxy = model.get_proxy(found)
        safe_html = sanitize_html(
            proxy.first('bodyHtml', quiet=True),
            proxy.first('sourceUrl', quiet=True),
            encoding=proxy.first('encoding', quiet=True),
        )
        headers = registry.json.unpack(proxy.first('headers', quiet=True))
        return jsonify({
            'headers': headers,
            'text': proxy.first('bodyText', quiet=True),
            'html': safe_html,
        })
    # No entity was yielded: respond with an empty 404.
    return ('', 404)
def test_sanitize_html(self):
    """Check that sanitize_html strips unsafe markup and rewrites links.

    The fixture page contains a stylesheet, a script, an image with a
    ``javascript:`` source, an inline event handler and a video element.
    After sanitizing, those must be gone, the article must survive, and
    the relative link must be absolute with ``target`` and ``rel`` set.
    """
    # Adjacent literals concatenate to the exact single-line fixture.
    html_str = (
        '<!doctype html><html><head><title>Article</title>'
        '<style type="text/css">body { }</style>'
        '<script>alert("We love Angular")</script>'
        '<link rel="stylesheet" href="http://xss.rocks/xss.css"></head>'
        '<body><article id="story">'
        '<h1>We welcome our new React overlords</h1>'
        '<img src=" javascript:alert(\'XSS\');" alt="" />'
        '<p>Published on <time onmouseover="alert(\'XSS\')">'
        '1 January 2018</time></p>'
        '<p>Really the only thing better than the '
        '<a href="/blockchain">blockchain</a> is ReactJS.</p></article>'
        '<video> <source onerror = "javascript: alert (XSS)"></video>'
        '</body></html>'
    )
    processed = sanitize_html(html_str, 'https://example.org/welcome-react')
    html = document_fromstring(processed)

    # Elements that must be removed entirely.
    assert html.find('.//img') is None, html
    assert html.find('.//video') is None, html
    assert html.find('.//style') is None, html
    assert html.find('.//script') is None, html

    # Content that must be kept, minus the inline event handler.
    assert len(html.findall('.//article')) == 1, html
    # Identity comparison is the correct check against the None singleton.
    assert html.find('.//time').get('onmouseover') is None, html

    # The relative link is resolved against the source URL and hardened.
    link = html.find('.//a')
    assert link.get('href') == 'https://example.org/blockchain', html
    assert link.get('target') == '_blank', html
    assert 'nofollow' in link.get('rel'), html
def test_sanitize_html(self):
    """Verify sanitize_html output for a page with several unsafe parts.

    Expectations: ``img``, ``video``, ``style`` and ``script`` elements
    are absent, exactly one ``article`` remains, the ``onmouseover``
    attribute is dropped, and the anchor is absolute, opens in a new
    tab and carries ``nofollow``.
    """
    base_url = 'https://example.org/welcome-react'
    # The pieces below join into the same one-line document as before.
    html_str = (
        '<!doctype html><html><head><title>Article</title>'
        '<style type="text/css">body { }</style>'
        '<script>alert("We love Angular")</script>'
        '<link rel="stylesheet" href="http://xss.rocks/xss.css"></head>'
        '<body><article id="story">'
        '<h1>We welcome our new React overlords</h1>'
        '<img src=" javascript:alert(\'XSS\');" alt="" />'
        '<p>Published on <time onmouseover="alert(\'XSS\')">'
        '1 January 2018</time></p>'
        '<p>Really the only thing better than the '
        '<a href="/blockchain">blockchain</a> is ReactJS.</p></article>'
        '<video> <source onerror = "javascript: alert (XSS)"></video>'
        '</body></html>'
    )
    html = document_fromstring(sanitize_html(html_str, base_url))

    for removed in ('.//img', './/video', './/style', './/script'):
        assert html.find(removed) is None, html

    assert len(html.findall('.//article')) == 1, html
    # Use `is None` rather than `== None` when testing for a missing
    # attribute.
    assert html.find('.//time').get('onmouseover') is None, html

    anchor = html.find('.//a')
    assert anchor.get('href') == 'https://example.org/blockchain', html
    assert anchor.get('target') == '_blank', html
    assert 'nofollow' in anchor.get('rel'), html
def content(entity_id):
    """Return the headers, text and sanitized HTML of an entity as JSON.

    Responds with an empty body and status 404 when the follow-up lookup
    by id yields no entity.
    """
    enable_cache()
    indexed = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=indexed.get('collection_id'))
    candidates = entities_by_ids(
        [entity_id],
        schemata=indexed.get('schema'),
        excludes=['text'],
    )
    for candidate in candidates:
        # The first yielded entity is served; the loop never iterates twice.
        proxy = model.get_proxy(candidate)
        raw_html = proxy.first('bodyHtml', quiet=True)
        source_url = proxy.first('sourceUrl', quiet=True)
        safe_html = sanitize_html(raw_html, source_url)
        packed_headers = proxy.first('headers', quiet=True)
        body = {
            'headers': registry.json.unpack(packed_headers),
            'text': proxy.first('bodyText', quiet=True),
            'html': safe_html,
        }
        return jsonify(body)
    return ('', 404)
def content(entity_id):
    """Return an entity's headers, text and sanitized HTML as JSON.

    The access is recorded with ``record_audit`` once an entity has been
    found. When the lookup by id yields nothing, an empty 404 response is
    returned and no audit record is written.
    """
    enable_cache()
    indexed = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=indexed.get('collection_id'))
    results = entities_by_ids(
        [entity_id],
        schemata=indexed.get('schema'),
        excludes=['text'],
    )
    for result in results:
        # Only the first result is handled: the body ends in a return.
        proxy = model.get_proxy(result)
        record_audit(Audit.ACT_ENTITY, id=entity_id)
        body_html = proxy.first('bodyHtml', quiet=True)
        source_url = proxy.first('sourceUrl', quiet=True)
        safe_html = sanitize_html(body_html, source_url)
        headers = registry.json.unpack(proxy.first('headers', quiet=True))
        response = {
            'headers': headers,
            'text': proxy.first('bodyText', quiet=True),
            'html': safe_html,
        }
        return jsonify(response)
    return ('', 404)