def write_html(raw_html, filename):
    """Parse *raw_html* with BeautifulSoup and write the result to *filename*.

    The markup is parsed with the ``'html.parser'`` backend and the
    serialized document is written as UTF-8 text, replacing any existing
    content of *filename*.

    If parsing *raw_html* raises, a placeholder document containing the
    text ``404 Not Found!`` is written instead (best-effort behavior).

    Args:
        raw_html: Markup handed to ``BeautifulSoup``.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If *filename* cannot be opened or written.
    """
    # Parse before opening the file so that a failure here cannot leave a
    # truncated, empty file behind.
    try:
        html = BeautifulSoup(raw_html, 'html.parser')
    except Exception:
        # Best-effort fallback; ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit still propagate.
        html = BeautifulSoup("404 Not Found!", 'html.parser')

    # The context manager closes the handle even if the write fails.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(str(html))
# Example #2
# 0
def soup_it(text, settings):
    """Parse *text* and return the soup together with its serialized form.

    The parser backend is taken from ``settings['parser']`` and defaults to
    ``'lxml'``. When the serialized document is wrapped in an
    ``<html><body>`` ... ``</body></html>`` envelope, that envelope is
    stripped from the returned string. The envelope is only removed when
    both the opening and the closing part are present, so a document with
    anything after ``</html>`` is returned unmodified rather than having
    real content sliced off.

    Args:
        text: Markup handed to ``BeautifulSoup``.
        settings: Mapping providing an optional ``'parser'`` entry.

    Returns:
        A ``(soup, u_soup)`` tuple: the parsed ``BeautifulSoup`` object and
        its string serialization with the wrapper removed where applicable.
    """
    soup = BeautifulSoup(text, settings.get('parser', 'lxml'))

    # Clean up the automated augmentation. Prettifying is switched off for
    # now: it rewrites the input into an indented structure, which is nicer
    # to read but may be too much.
    prettify = False
    if prettify:
        u_soup = soup.prettify()
        # Same lengths as the former hard-coded [15:-16] slice.
        prefix, suffix = '<html>\n <body>\n', ' </body>\n</html>'
    else:
        # Runs of whitespace / indentation from the input are not restored
        # in this representation.
        u_soup = str(soup)
        prefix, suffix = '<html><body>', '</body></html>'

    # Require BOTH ends of the wrapper before slicing; checking only the
    # prefix would cut len(suffix) characters of content whenever the
    # document does not end with the expected closing tags.
    if u_soup.startswith(prefix) and u_soup.endswith(suffix):
        u_soup = u_soup[len(prefix):-len(suffix)]
    return soup, u_soup