Example #1
def extract_bleach(text):
    """Extract tags from text in a form suitable for bleach."""
    extractor = MarkupExtractor()
    parser = HTMLParser(collect_ids=False, target=extractor)
    parser.feed(text)
    return {
        "tags": extractor.found_tags,
        "attributes": extractor.found_attributes
    }
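The MarkupExtractor target used above is not shown in this example. As a minimal sketch only, assuming lxml.etree's parser target interface (start/end/data/close callbacks) and the found_tags / found_attributes attribute names read by the caller, such a target could look like this:

class MarkupExtractor:
    """Hypothetical parser target: lxml invokes these callbacks while parsing."""

    def __init__(self):
        self.found_tags = set()
        self.found_attributes = set()

    def start(self, tag, attrib):
        # Record every element name and attribute name encountered.
        self.found_tags.add(tag)
        self.found_attributes.update(attrib)

    def end(self, tag):
        pass

    def data(self, text):
        pass

    def close(self):
        # With target=..., parser.close() returns this value; extract_bleach
        # reads the collected sets directly from the extractor instead.
        return None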
Example #2
def parse_html(name, html):
    """
    Parse the provided html to find a dataset
    """
    parser = HTMLParser(encoding="UTF-8")
    try:
        parser.feed(html)
        dom = parser.close()
    except Exception:
        # Any parse failure yields an empty dataset.
        return {}

    dataset = {
        u'name': name,
        u'title': get_title(dom),
        u'notes': get_notes(dom),
        u'owner_org': get_owner_org(dom),
        u'resources': parse_resources(dom),
    }

    dataset.update(get_extra_meta(dom))

    return dataset
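The helpers get_title, get_notes, get_owner_org, parse_resources and get_extra_meta are defined elsewhere in that project. Purely as an illustration of how they would read the parsed lxml tree, a hypothetical get_title might look like this:

def get_title(dom):
    # Hypothetical helper: take the text of the first <title> element,
    # falling back to an empty string when the page has none.
    node = dom.find('.//title')
    if node is not None and node.text:
        return node.text.strip()
    return u''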
Example #3
def dm_to_html_v2(dmtext, klass=DMWalkState, tag=None):
    mark = str(uuid.uuid4())
    dmtext, has_blinds = fix_tags(dmtext)
    p = HTMLParser()
    p.feed(f"<div id='{mark}'>".encode("ascii"))
    p.feed(dmtext)
    p.feed(b"</div>")

    doc = p.close()
    root = doc.find(f".//div[@id = '{mark}']")

    synth_root = klass(for_region=tag)
    synth_root.ingest_element(root)

    return synth_root
Example #4
def dm_to_html(dmtext, tag=None):
    mark = str(uuid.uuid4())
    dmtext, has_blinds = fix_tags(dmtext)
    p = HTMLParser()
    p.feed(f"<div id='{mark}'>".encode("ascii"))
    p.feed(dmtext)
    p.feed(b"</div>")

    doc = p.close()
    root = doc.find(f".//div[@id = '{mark}']")

    synth_root = DMWalkState(for_region=tag)
    synth_root.ingest_element(root)

    text = synth_root.get_html()
    if has_blinds:
        text = FIX_BLIND_TIMESTAMP_CODES.sub(fix_blind_ts, text)

    return (
        text,
        list(synth_root.card_master_references) or None,
        list(synth_root.image_references) or None,
    )
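Both dm_to_html variants rely on the same trick: wrap the fragment in a <div> whose id is a fresh UUID, parse it through lxml's feed interface, then locate that div again to get a single root element for the fragment. A stripped-down sketch of just that part, assuming the fragment arrives as bytes:

import uuid
from lxml.etree import HTMLParser

def parse_fragment(fragment_bytes):
    # Wrap the fragment in a uniquely-id'd <div> so it can be found again
    # after lxml has normalized the surrounding document structure.
    mark = str(uuid.uuid4())
    p = HTMLParser()
    p.feed(f"<div id='{mark}'>".encode("ascii"))
    p.feed(fragment_bytes)
    p.feed(b"</div>")
    doc = p.close()
    return doc.find(f".//div[@id = '{mark}']")

For example, parse_fragment(b"<p>Hello <b>world</b></p>") returns the wrapper div element whose children are the fragment's top-level nodes, which is what DMWalkState.ingest_element walks in the functions above.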