Ejemplo n.º 1
0
def extract(source, filters):
    """Extract text and markup from an HTML document using CSS selectors.

    Args:
        source: Either a URL (as decided by ``is_url``) or a local file path.
        filters: Iterable of ``"key=selector"`` strings. ``selector`` is passed
            to ``doc.cssselect``. If ``key`` contains a dot (``"name.attr"``),
            the value of attribute ``attr`` is collected from every matched
            element instead of its text/markup.

    Returns:
        A dict with two entries per filter: ``result[name]`` holds the list of
        extracted texts and ``result["_" + name]`` the list of serialized
        elements. For attribute filters both entries refer to the same list of
        attribute values (``""`` where the attribute is missing).

    Raises:
        ValueError: If a filter string contains no ``"="``.
    """
    # Split on the first "=" only: CSS selectors may themselves contain "="
    # (e.g. ``a[href=x]``), which would otherwise break the key/selector pair.
    filters = dict(item.split("=", 1) for item in filters)

    if is_url(source):
        s = fetch_url(source)[1]
    else:
        # Context manager so the file handle is closed deterministically.
        with open(source, "r") as f:
            s = f.read()
    doc = parse_html(s)

    result = {}
    for k, v in filters.items():
        es = doc.cssselect(v)
        # Split on the first "." only so a key with several dots does not
        # raise on unpacking; ``sep`` is empty when the key has no dot.
        name, sep, attr = k.partition(".")
        if sep:
            texts = htmls = [e.attrib.get(attr, "") for e in es]
        else:
            htmls = [doc_to_str(e) for e in es]
            texts = [doc_to_text(e) for e in es]

        result["_{0:s}".format(name)] = htmls
        result[name] = texts
    return result
Ejemplo n.º 2
0
def doc_to_text(doc):
    """Convert a parsed document/element to text.

    The element is serialized with ``doc_to_str`` and the result is passed,
    in order, through ``unescape``, ``html_to_text`` and ``unichar_to_text``.
    """
    markup = doc_to_str(doc)
    unescaped = unescape(markup)
    plain = html_to_text(unescaped)
    return unichar_to_text(plain)