Esempio n. 1
0
def view(entity_id):
    """
    ---
    get:
      summary: Get an entity
      description: Return the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    # NOTE(review): the docstring above is left verbatim on purpose. Its body
    # after `---` looks like an OpenAPI operation description and is
    # presumably parsed by a spec generator at runtime -- confirm before
    # rewording it.
    enable_cache()
    # Fetch the entity under READ authorization, asking the lookup to leave
    # out the 'text' and 'numeric.*' fields.
    entity = get_index_entity(
        entity_id,
        request.authz.READ,
        excludes=['text', 'numeric.*'],
    )
    tag_request(collection_id=entity.get('collection_id'))
    proxy = model.get_proxy(entity)
    # The body HTML, its source URL and its encoding are read from the proxy
    # (first value each) and handed to sanitize_html; the result is attached
    # to the entity under 'safeHtml'.
    entity['safeHtml'] = sanitize_html(
        proxy.first('bodyHtml', quiet=True),
        proxy.first('sourceUrl', quiet=True),
        encoding=proxy.first('encoding', quiet=True),
    )
    # Mark the payload as a full (non-shallow) entity before serializing.
    entity['shallow'] = False
    return EntitySerializer.jsonify(entity)
Esempio n. 2
0
def content(document_id):
    """Return a document's headers, text and sanitized HTML as JSON.

    The document is loaded with ``get_db_document`` and the access is
    recorded through ``record_audit`` before the response is built.

    :param document_id: identifier passed to ``get_db_document``.
    :returns: the result of ``jsonify`` for a mapping with the keys
        ``headers``, ``text`` and ``html``.
    """
    enable_cache()
    doc = get_db_document(document_id)
    record_audit(Audit.ACT_ENTITY, id=document_id)
    # Gather the three response fields in the same order they appear in the
    # payload; only the raw body goes through sanitize_html.
    headers = doc.headers
    text = doc.body_text
    safe_html = sanitize_html(doc.body_raw, doc.source_url)
    return jsonify({'headers': headers, 'text': text, 'html': safe_html})
Esempio n. 3
0
def view(document_id):
    """Serialize a document together with its headers and body content.

    The index representation from ``get_index_document`` is extended with
    the database document's headers and, depending on which schema names the
    document's model lists, its sanitized HTML and/or its plain text.

    :param document_id: identifier used for both the index and DB lookups.
    :returns: the result of ``jsonify`` using ``CombinedSchema``.
    """
    enable_cache()
    payload = get_index_document(document_id)
    doc = get_db_document(document_id)
    payload['headers'] = doc.headers
    # TODO: should this be its own API? Probably so, but for that it would
    # be unclear if we should JSON wrap it, or serve plain with the correct
    # MIME type?
    schema_names = doc.model.names
    if Document.SCHEMA_HTML in schema_names:
        # HTML bodies are never returned raw; they go through sanitize_html.
        payload['html'] = sanitize_html(doc.body_raw, doc.source_url)
    if Document.SCHEMA_TEXT in schema_names:
        payload['text'] = doc.body_text
    return jsonify(payload, schema=CombinedSchema)
Esempio n. 4
0
def content(entity_id):
    """
    ---
    get:
      summary: Get the content of an entity
      description: >
        Return the text and/or html content of the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      responses:
        '200':
          content:
            application/json:
              schema:
                properties:
                  headers:
                    type: object
                  html:
                    type: string
                  text:
                    type: string
                type: object
          description: OK
        '404':
          description: Not Found
      tags:
      - Entity
    """
    # NOTE(review): the docstring above is left verbatim on purpose. Its body
    # after `---` looks like an OpenAPI operation description and is
    # presumably parsed by a spec generator at runtime -- confirm before
    # rewording it.
    enable_cache()
    # First lookup: READ-authorized fetch, used for request tagging and to
    # learn the entity's schema.
    indexed = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=indexed.get('collection_id'))
    # Second lookup: same id restricted to that schema, without 'text'.
    candidates = entities_by_ids([entity_id],
                                 schemata=indexed.get('schema'),
                                 excludes=['text'])
    # Only the first hit is ever used: the loop body returns immediately.
    for hit in candidates:
        proxy = model.get_proxy(hit)
        safe_html = sanitize_html(
            proxy.first('bodyHtml', quiet=True),
            proxy.first('sourceUrl', quiet=True),
            encoding=proxy.first('encoding', quiet=True),
        )
        raw_headers = proxy.first('headers', quiet=True)
        return jsonify({
            'headers': registry.json.unpack(raw_headers),
            'text': proxy.first('bodyText', quiet=True),
            'html': safe_html,
        })
    # The second lookup yielded nothing.
    return ('', 404)
Esempio n. 5
0
 def test_sanitize_html(self):
     """Check that sanitize_html strips active content and rewrites links.

     A hostile article page (inline style and script, a ``javascript:``
     image source, an ``onmouseover`` handler, a ``<video>`` with an
     ``onerror`` source) is sanitized against the base URL
     ``https://example.org/welcome-react``. The parsed result must contain
     no img/video/style/script elements, exactly one article, no
     ``onmouseover`` attribute on ``<time>``, and a link made absolute with
     ``target="_blank"`` and a ``rel`` containing ``nofollow``.
     """
     html_str = '<!doctype html><html><head><title>Article</title><style type="text/css">body { }</style><script>alert("We love Angular")</script><link rel="stylesheet" href="http://xss.rocks/xss.css"></head><body><article id="story"><h1>We welcome our new React overlords</h1><img src="&#14;  javascript:alert(\'XSS\');" alt="" /><p>Published on <time onmouseover="alert(\'XSS\')">1 January 2018</time></p><p>Really the only thing better than the <a href="/blockchain">blockchain</a> is ReactJS.</p></article><video> <source onerror = "javascript: alert (XSS)"></video></body></html>'
     processed = sanitize_html(
         html_str, 'https://example.org/welcome-react')
     html = document_fromstring(processed)
     # Failure messages carry the sanitized markup itself rather than the
     # parsed element, whose repr says nothing about what went wrong.
     assert html.find('.//img') is None, processed
     assert html.find('.//video') is None, processed
     assert html.find('.//style') is None, processed
     assert html.find('.//script') is None, processed
     assert len(html.findall('.//article')) == 1, processed
     # Identity comparison is the correct way to test for None.
     assert html.find('.//time').get('onmouseover') is None, processed
     # Look the anchor up once; all link assertions inspect the same element.
     link = html.find('.//a')
     assert link.get('href') == 'https://example.org/blockchain', processed
     assert link.get('target') == '_blank', processed
     assert 'nofollow' in link.get('rel'), processed
 def test_sanitize_html(self):
     """Verify sanitize_html on a page full of script-injection vectors.

     The input mixes legitimate article markup with a stylesheet, a script,
     an image whose source is a ``javascript:`` URL, an inline event handler
     and a ``<video>`` element. After sanitizing with
     ``https://example.org/welcome-react`` as the source URL, the test
     expects the dangerous elements and the handler attribute to be gone,
     the single ``<article>`` to survive, and the relative link to become
     absolute, open in a new tab and carry ``nofollow``.
     """
     html_str = '<!doctype html><html><head><title>Article</title><style type="text/css">body { }</style><script>alert("We love Angular")</script><link rel="stylesheet" href="http://xss.rocks/xss.css"></head><body><article id="story"><h1>We welcome our new React overlords</h1><img src="&#14;  javascript:alert(\'XSS\');" alt="" /><p>Published on <time onmouseover="alert(\'XSS\')">1 January 2018</time></p><p>Really the only thing better than the <a href="/blockchain">blockchain</a> is ReactJS.</p></article><video> <source onerror = "javascript: alert (XSS)"></video></body></html>'
     processed = sanitize_html(html_str,
                               'https://example.org/welcome-react')
     html = document_fromstring(processed)
     # Elements that must not survive sanitizing. The sanitized markup is
     # used as the assertion message so a failure shows the actual output.
     assert html.find('.//img') is None, processed
     assert html.find('.//video') is None, processed
     assert html.find('.//style') is None, processed
     assert html.find('.//script') is None, processed
     # Legitimate structure must be preserved.
     assert len(html.findall('.//article')) == 1, processed
     # `is None` replaces the former `== None` equality comparison.
     time_element = html.find('.//time')
     assert time_element.get('onmouseover') is None, processed
     # The relative href is expected to be resolved against the source URL.
     anchor = html.find('.//a')
     assert anchor.get('href') == 'https://example.org/blockchain', processed
     assert anchor.get('target') == '_blank', processed
     assert 'nofollow' in anchor.get('rel'), processed
Esempio n. 7
0
def content(entity_id):
    """Return the headers, text and sanitized HTML of an entity as JSON.

    The entity is first fetched with READ authorization, then looked up
    again by id restricted to its schema; the first result of that second
    lookup provides the response.

    :param entity_id: identifier of the entity to read.
    :returns: the result of ``jsonify`` for a mapping with the keys
        ``headers``, ``text`` and ``html``, or ``('', 404)`` when the second
        lookup yields nothing.
    """
    enable_cache()
    indexed = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=indexed.get('collection_id'))
    results = entities_by_ids([entity_id],
                              schemata=indexed.get('schema'),
                              excludes=['text'])
    # The loop body returns on its first pass, so at most one result is used.
    for found in results:
        proxy = model.get_proxy(found)
        body_html = proxy.first('bodyHtml', quiet=True)
        source_url = proxy.first('sourceUrl', quiet=True)
        safe_html = sanitize_html(body_html, source_url)
        packed_headers = proxy.first('headers', quiet=True)
        headers = registry.json.unpack(packed_headers)
        body_text = proxy.first('bodyText', quiet=True)
        return jsonify({
            'headers': headers,
            'text': body_text,
            'html': safe_html,
        })
    return ('', 404)
Esempio n. 8
0
def content(entity_id):
    """Return an entity's headers, text and sanitized HTML, auditing access.

    The entity is first fetched with READ authorization, then looked up
    again by id restricted to its schema. For the first result of that
    second lookup an audit record is written and the response is built.

    :param entity_id: identifier of the entity to read.
    :returns: the result of ``jsonify`` for a mapping with the keys
        ``headers``, ``text`` and ``html``, or ``('', 404)`` when the second
        lookup yields nothing (in which case no audit record is written).
    """
    enable_cache()
    indexed = get_index_entity(entity_id, request.authz.READ)
    tag_request(collection_id=indexed.get('collection_id'))
    results = entities_by_ids([entity_id],
                              schemata=indexed.get('schema'),
                              excludes=['text'])
    # The loop body returns on its first pass, so at most one result is used.
    for found in results:
        proxy = model.get_proxy(found)
        # Audit only once there is actual content to hand out.
        record_audit(Audit.ACT_ENTITY, id=entity_id)
        body_html = proxy.first('bodyHtml', quiet=True)
        source_url = proxy.first('sourceUrl', quiet=True)
        safe_html = sanitize_html(body_html, source_url)
        packed_headers = proxy.first('headers', quiet=True)
        headers = registry.json.unpack(packed_headers)
        body_text = proxy.first('bodyText', quiet=True)
        return jsonify({
            'headers': headers,
            'text': body_text,
            'html': safe_html,
        })
    return ('', 404)