def write_html(raw_html, filename):
    """Parse *raw_html* and write the serialized document to *filename*.

    The markup is parsed with Beautiful Soup's ``'html.parser'`` and the
    resulting document is written as UTF-8 text, replacing any existing
    file content.

    Args:
        raw_html: Markup handed to ``BeautifulSoup``.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Parse before touching the file so the target is only truncated once
    # there is something to write.
    try:
        html = BeautifulSoup(raw_html, 'html.parser')
    except Exception:
        # Best-effort fallback: input that cannot be parsed is replaced by a
        # placeholder page.  ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        html = BeautifulSoup("404 Not Found!", 'html.parser')

    # The context manager closes the handle even when the write fails.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(str(html))
def _strip_wrapper(markup, opening, closing):
    """Return *markup* without *opening* and *closing* when it has both."""
    if (
        len(markup) >= len(opening) + len(closing)
        and markup.startswith(opening)
        and markup.endswith(closing)
    ):
        return markup[len(opening):len(markup) - len(closing)]
    # Only one side (or neither) matched: slicing blindly would cut into the
    # real content, so the markup is returned untouched.
    return markup


def soup_it(text, settings):
    """Parse *text* and return the soup together with its unwrapped markup.

    Args:
        text: Markup to parse.
        settings: Mapping whose ``'parser'`` entry names the Beautiful Soup
            parser to use; ``'lxml'`` is used when the key is absent.

    Returns:
        A ``(soup, u_soup)`` tuple. ``soup`` is the ``BeautifulSoup`` object;
        ``u_soup`` is its string serialization with a leading
        ``<html><body>`` and trailing ``</body></html>`` removed when both
        are present, and unchanged otherwise.
    """
    soup = BeautifulSoup(text, settings.get('parser', 'lxml'))

    # Maintainer switch, currently off: prettify rewrites the whole input as
    # an indented structure, which is nicer to read but may change too much.
    prettify = False

    # Both branches clean up the automatically added <html>/<body> wrapper
    # (presumably inserted by the parser around fragments).
    if prettify:
        # NOTE(review): the wrapper strings mirror the offsets the original
        # code sliced by (15 leading, 16 trailing characters); confirm them
        # against the installed Beautiful Soup before enabling this branch.
        u_soup = _strip_wrapper(
            soup.prettify(), '<html>\n <body>\n', ' </body>\n</html>'
        )
    else:
        # Original author's note: in this form multiple whitespace is lost
        # and any indentation present in the input is gone.
        u_soup = _strip_wrapper(str(soup), '<html><body>', '</body></html>')
    return soup, u_soup