Esempio n. 1
0
def clean(file_name, directory="."):
    """Extract the article from an HTML file and write a cleaned copy.

    The main content is obtained from ``Extractor``, every ``<h2>`` is
    promoted to ``<h1>``, and a "Source" heading plus a link to the page's
    canonical URL are appended to the ``div.post-content`` element when one
    is present. The serialized body is written to
    ``<basename>_cleaned.html`` inside *directory*.

    Args:
        file_name: Path of the HTML file to clean.
        directory: Directory that receives the cleaned file.

    The process exits with status 1 when the extractor returns ``None``.
    """
    # Close the input handle deterministically instead of leaking it.
    with open(file_name, "r") as source_handle:
        content = source_handle.read()

    article = Extractor(content, loglevel=logging.INFO).extracted()
    if article is None:
        print("Error processing html file")
        sys.exit(1)

    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')
    source_url = head_doc.cssselect('link[rel="canonical"]')[0].get('href')

    promoted_article = article.replace("<h2", "<h1").replace("</h2>", "</h1>")
    reconstructed_body = u"<html><body>" + promoted_article + u"</body></html>"
    body_doc = html.fromstring(reconstructed_body).find('body')

    # Build the attribution nodes directly instead of parsing a
    # string-concatenated fragment: a URL containing '&', '<' or a quote
    # would otherwise yield malformed XML and the link would be lost.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    try:
        post_content_doc = body_doc.xpath("//div[@class='post-content']")[0]
    except IndexError:
        # Best effort: report the file that has no post-content container
        # and still write out the extracted body below.
        print(file_name)
    else:
        post_content_doc.append(source_header)
        post_content_doc.append(source_paragraph)

    basename = os.path.basename(file_name)
    cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html"
    result = html.tostring(body_doc)
    # os.path.join keeps the output inside `directory`; plain concatenation
    # turned the default "." into a ".name_cleaned.html" file name.
    output_path = os.path.join(directory, cleaned_file)
    with codecs.open(output_path, 'w', 'utf-8') as cleaned_file_handle:
        cleaned_file_handle.write(result)
Esempio n. 2
0
def clean(file_name, directory="."):
    """Extract the article from an HTML file and write a cleaned copy.

    The main content is obtained from ``Extractor``; every ``<h2>`` is
    promoted to ``<h1>``; comment blocks, Google ad slots and all ``<h3>``
    elements are removed; finally a "Source" heading and a link to the
    page's ``og:url`` are appended to the ``div.post-content`` element.
    The serialized body is written to ``<basename>_cleaned.html`` inside
    *directory*.

    Args:
        file_name: Path of the UTF-8 encoded HTML file to clean.
        directory: Directory that receives the cleaned file.

    Raises:
        IndexError: If the page has no ``og:url`` meta tag or the extracted
            article has no ``div.post-content`` element.

    The process exits with status 1 when the extractor returns ``None``.
    """
    # Close the input handle deterministically instead of leaking it.
    with codecs.open(file_name, "r", "utf-8") as source_handle:
        content = source_handle.read()

    article = Extractor(content, loglevel=logging.INFO).extracted()
    if article is None:
        print("Error processing html file")
        sys.exit(1)

    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')
    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get(
        'content')

    promoted_article = article.replace("<h2", "<h1").replace("</h2>", "</h1>")
    reconstructed_body = u"<html><body>" + promoted_article + u"</body></html>"

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    unwanted_xpaths = (
        "//div[@class='comments-main']",
        "//ins[@class='adsbygoogle']",
        "//h3",
    )
    for unwanted_xpath in unwanted_xpaths:
        for unwanted in body_doc.xpath(unwanted_xpath):
            unwanted.getparent().remove(unwanted)

    # Build the attribution nodes directly instead of parsing a
    # string-concatenated fragment: a URL containing '&', '<' or a quote
    # would otherwise make lxml.etree.XML() fail on malformed markup.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    # The "Source" heading is added after the <h3> purge so it survives.
    post_content_doc = body_doc.xpath("//div[@class='post-content']")[0]
    post_content_doc.append(source_header)
    post_content_doc.append(source_paragraph)

    basename = os.path.basename(file_name)
    cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html"
    result = html.tostring(body_doc)
    # os.path.join keeps the output inside `directory`; plain concatenation
    # turned the default "." into a ".name_cleaned.html" file name.
    output_path = os.path.join(directory, cleaned_file)
    with codecs.open(output_path, 'w', 'utf-8') as cleaned_file_handle:
        cleaned_file_handle.write(result)
Esempio n. 3
0
def clean(file_name, directory="."):
    """Extract the article from an HTML file and write a cleaned copy.

    Files whose cleaned counterpart already exists in *directory* are
    skipped. Otherwise a UTF-8 ``Content-Type`` meta tag is inserted after
    ``<head>``, the main content is obtained from ``Extractor``, the page
    title is re-added as ``<h1>`` when the article contains no ``h2``,
    clutter (comments, ads, ``<h3>``, ``class``/``title`` attributes on
    ``<pre>``) is removed, and a "Source" heading plus a link to the page's
    ``og:url`` are appended to ``div.entry-content``. The serialized body
    is written to ``<basename>_cleaned.html`` inside *directory*.

    Args:
        file_name: Path of the UTF-8 encoded HTML file to clean.
        directory: Directory that receives the cleaned file.

    Raises:
        IndexError: If the page has no ``og:url`` meta tag or the extracted
            article has no ``div.entry-content`` element.

    The process exits with status 1 when the extractor returns ``None``.
    """
    # `basename` must be computed before it is used; the original assigned
    # it near the end of the function and therefore raised
    # UnboundLocalError on the very first statement.
    basename = os.path.basename(file_name)
    cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html"
    output_path = os.path.join(directory, cleaned_file)

    # don't clean files that already have been cleaned
    # (checked at the location the file is actually written to)
    if os.path.isfile(output_path):
        return

    with codecs.open(file_name, "r", 'utf-8') as source_handle:
        content = source_handle.read()

    # insert the encoding of the file; skipped when there is no literal
    # "<head>" tag, because find() returns -1 and the tag would otherwise
    # be spliced in at offset 5 of the document.
    head_tag = '<head>'
    head_pos = content.find(head_tag)
    if head_pos != -1:
        insert_at = head_pos + len(head_tag)
        content = (
            content[:insert_at] +
            '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' +
            content[insert_at:])
    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file")
        sys.exit(1)
    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')
    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get(
        'content')
    title = html_doc.find('.//title').text_content()

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        # Drop the part after the last dash; keep the whole title when it
        # has no dash (rfind() == -1 used to chop off the last character).
        dash_pos = title.rfind('-')
        page_title = title[:dash_pos] if dash_pos != -1 else title
        article = "<h1>" + page_title + "</h1>" + article

    reconstructed_body = "<html><body>" + article.replace(
        "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    for bad in body_doc.xpath("//div[@class='comments-main']"):
        bad.getparent().remove(bad)
    for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"):
        ad_by_google.getparent().remove(ad_by_google)
    for bad_h3 in body_doc.xpath("//h3"):
        bad_h3.getparent().remove(bad_h3)
    for pre_tag in body_doc.xpath("//pre"):
        # Bare <pre> tags are required by the textual replacement below.
        pre_tag.attrib.pop('class', None)
        pre_tag.attrib.pop('title', None)

    # Build the attribution nodes directly instead of parsing a
    # string-concatenated fragment: a URL containing '&', '<' or a quote
    # would otherwise make lxml.etree.XML() fail on malformed markup.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(source_header)
    post_content_doc.append(source_paragraph)

    result = html.tostring(body_doc)
    # wrap the content of every <pre> in <code> for styling later.
    result = result.replace('<pre>',
                            '<pre> <code>').replace('</pre>', '</code> </pre>')
    with open(output_path, 'w') as cleaned_file_handle:
        cleaned_file_handle.write(result.encode('utf-8'))
def clean(file_name, directory="."):
    """Extract the article from an HTML file and write a dated cleaned copy.

    A UTF-8 ``Content-Type`` meta tag is inserted after ``<head>``, the main
    content is obtained from ``Extractor``, and the output name
    ``<basename>_<published_time>_cleaned.html`` is derived from the page's
    ``article:published_time`` meta tag (last six characters stripped).
    When that file already exists in *directory* nothing is written.
    Otherwise the title is re-added as ``<h1>`` when the article contains
    no ``h2``, clutter (comments, ads, ``<h3>``, ``class``/``title``
    attributes on ``<pre>``) is removed, and a "Source" heading plus a link
    to the page's ``og:url`` are appended to ``div.entry-content``.

    Args:
        file_name: Path of the UTF-8 encoded HTML file to clean.
        directory: Directory that receives the cleaned file.

    Raises:
        IndexError: If a required meta tag or the ``div.entry-content``
            element is missing.

    The process exits with status 1 when the extractor returns ``None``.
    """
    basename = os.path.basename(file_name)
    with codecs.open(file_name, "r", 'utf-8') as source_handle:
        content = source_handle.read()

    # insert the encoding of the file; skipped when there is no literal
    # "<head>" tag, because find() returns -1 and the tag would otherwise
    # be spliced in at offset 5 of the document.
    head_tag = '<head>'
    head_pos = content.find(head_tag)
    if head_pos != -1:
        insert_at = head_pos + len(head_tag)
        content = (
            content[:insert_at] +
            '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' +
            content[insert_at:])
    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file")
        sys.exit(1)
    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')
    published_meta = head_doc.cssselect(
        'meta[property="article:published_time"]')[0]
    # [:-6] drops the last six characters of the timestamp
    # (presumably a "+HH:MM" UTC offset -- confirm against real pages).
    published_time = published_meta.get('content')[:-6]
    print(published_time)
    cleaned_file = (os.path.splitext(basename)[0] + "_" + published_time +
                    "_cleaned.html")
    output_path = os.path.join(directory, cleaned_file)

    # don't clean files that already have been cleaned
    # (checked at the location the file is actually written to)
    if os.path.isfile(output_path):
        return
    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get('content')
    title = html_doc.find('.//title').text_content()

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        # Drop the part after the last dash; keep the whole title when it
        # has no dash (rfind() == -1 used to chop off the last character).
        dash_pos = title.rfind('-')
        page_title = title[:dash_pos] if dash_pos != -1 else title
        article = "<h1>" + page_title + "</h1>" + article

    promoted_article = article.replace("<h2", "<h1").replace("</h2>", "</h1>")
    reconstructed_body = "<html><body>" + promoted_article + "</body></html>"

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    for bad in body_doc.xpath("//div[@class='comments-main']"):
        bad.getparent().remove(bad)
    for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"):
        ad_by_google.getparent().remove(ad_by_google)
    for bad_h3 in body_doc.xpath("//h3"):
        bad_h3.getparent().remove(bad_h3)
    for pre_tag in body_doc.xpath("//pre"):
        # Bare <pre> tags are required by the textual replacement below.
        pre_tag.attrib.pop('class', None)
        pre_tag.attrib.pop('title', None)

    # Build the attribution nodes directly instead of parsing a
    # string-concatenated fragment: a URL containing '&', '<' or a quote
    # would otherwise make lxml.etree.XML() fail on malformed markup.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(source_header)
    post_content_doc.append(source_paragraph)
    result = html.tostring(body_doc)
    # wrap the content of every <pre> in <code> for styling later.
    result = result.replace('<pre>', '<pre> <code>').replace('</pre>', '</code> </pre>')
    with open(output_path, 'w') as cleaned_file_handle:
        cleaned_file_handle.write(result.encode('utf-8'))
Esempio n. 5
0
def clean(content):
    """Return a cleaned HTML body for the article contained in *content*.

    A UTF-8 ``Content-Type`` meta tag is inserted after ``<head>``, the main
    content is obtained from ``Extractor``, language tab titles are demoted
    from ``<h1>`` to bold paragraphs, ``<h2>`` is promoted to ``<h1>``, the
    page title is re-added when missing, clutter (comments, ads, ``<h3>``,
    ``class``/``title`` attributes on ``<pre>``) is removed, and a "Source"
    heading plus a link to the page's ``og:url`` are appended to
    ``div.entry-content``.

    Args:
        content: The HTML document as a string.

    Returns:
        The serialized ``<body>`` element, with the content of every
        ``<pre>`` wrapped in ``<code>``.

    Raises:
        IndexError: If the page has no ``og:url`` meta tag or the extracted
            article has no ``div.entry-content`` element.

    The process exits with status 1 when the extractor returns ``None``.
    """
    # insert the encoding of the file; skipped when there is no literal
    # "<head>" tag, because find() returns -1 and the tag would otherwise
    # be spliced in at offset 5 of the document.
    head_tag = '<head>'
    head_pos = content.find(head_tag)
    if head_pos != -1:
        insert_at = head_pos + len(head_tag)
        content = (
            content[:insert_at] +
            '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' +
            content[insert_at:])
    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file.")
        sys.exit(1)

    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')

    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get('content')
    title = html_doc.find('.//title').text_content()

    # Drop the part of the title after the last dash; keep the whole title
    # when it has no dash (rfind() == -1 used to chop off the last char).
    dash_pos = title.rfind('-')
    page_title = title[:dash_pos] if dash_pos != -1 else title

    # Demote the code-tab captions so they are not mistaken for the title.
    for tab_name in ('C++', 'C', 'C/C++', 'Java', 'Python'):
        article = article.replace(
            '<h1 class="tabtitle">' + tab_name + '</h1>',
            '<p><strong>' + tab_name + '</strong></p>')

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + page_title + "</h1>" + article

    promoted_article = article.replace("<h2", "<h1").replace("</h2>", "</h1>")
    reconstructed_body = "<html><body>" + promoted_article + "</body></html>"

    # Second chance: make sure the body really starts with a heading.
    if "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + page_title + "</h1>")

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    for bad in body_doc.xpath("//div[@class='comments-main']"):
        bad.getparent().remove(bad)
    for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"):
        ad_by_google.getparent().remove(ad_by_google)
    for bad_h3 in body_doc.xpath("//h3"):
        bad_h3.getparent().remove(bad_h3)
    for pre_tag in body_doc.xpath("//pre"):
        # Bare <pre> tags are required by the textual replacement below.
        pre_tag.attrib.pop('class', None)
        pre_tag.attrib.pop('title', None)

    # Build the attribution nodes directly instead of parsing a
    # string-concatenated fragment: a URL containing '&', '<' or a quote
    # would otherwise make lxml.etree.XML() fail on malformed markup.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(source_header)
    post_content_doc.append(source_paragraph)
    result = html.tostring(body_doc)

    # wrap the content of every <pre> in <code> for styling later.
    result = result.replace('<pre>', '<pre> <code>').replace('</pre>', '</code> </pre>')

    return result
Esempio n. 6
0
def clean(file_name, directory="."):
    """Extract the article from an HTML file and write a cleaned copy.

    A UTF-8 ``Content-Type`` meta tag is inserted after ``<head>``, the main
    content is obtained from ``Extractor``, the page title is re-added as
    ``<h1>`` when the article contains no ``h2``, comment blocks, Google ad
    slots and ``<h3>`` elements are removed, and a "Source" heading plus a
    link to the page's ``og:url`` are appended to ``div.post-content``.
    The serialized body is written to ``<basename>_cleaned.html`` inside
    *directory*.

    Args:
        file_name: Path of the UTF-8 encoded HTML file to clean.
        directory: Directory that receives the cleaned file.

    Raises:
        IndexError: If the page has no ``og:url`` meta tag or the extracted
            article has no ``div.post-content`` element.

    The process exits with status 1 when the extractor returns ``None``.
    """
    # Close the input handle deterministically instead of leaking it.
    with codecs.open(file_name, "r", 'utf-8') as source_handle:
        content = source_handle.read()

    # HERE is the key: insert the encoding of the file and everything works
    # out ;)  The insertion is skipped when there is no literal "<head>"
    # tag, because find() returns -1 and the tag would otherwise be spliced
    # in at offset 5 of the document.
    head_tag = '<head>'
    head_pos = content.find(head_tag)
    if head_pos != -1:
        insert_at = head_pos + len(head_tag)
        content = (
            content[:insert_at] +
            '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' +
            content[insert_at:])
    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file")
        sys.exit(1)
    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')
    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get('content')
    title = html_doc.find('.//title').text_content()

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        # Drop the part after the last dash; keep the whole title when it
        # has no dash (rfind() == -1 used to chop off the last character).
        dash_pos = title.rfind('-')
        page_title = title[:dash_pos] if dash_pos != -1 else title
        article = "<h1>" + page_title + "</h1>" + article

    promoted_article = article.replace("<h2", "<h1").replace("</h2>", "</h1>")
    reconstructed_body = "<html><body>" + promoted_article + "</body></html>"

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    unwanted_xpaths = (
        "//div[@class='comments-main']",
        "//ins[@class='adsbygoogle']",
        "//h3",
    )
    for unwanted_xpath in unwanted_xpaths:
        for unwanted in body_doc.xpath(unwanted_xpath):
            unwanted.getparent().remove(unwanted)

    # Build the attribution nodes directly instead of parsing a
    # string-concatenated fragment: a URL containing '&', '<' or a quote
    # would otherwise make lxml.etree.XML() fail on malformed markup.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    # The "Source" heading is added after the <h3> purge so it survives.
    post_content_doc = body_doc.xpath("//div[@class='post-content']")[0]
    post_content_doc.append(source_header)
    post_content_doc.append(source_paragraph)

    basename = os.path.basename(file_name)
    cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html"
    result = html.tostring(body_doc)
    # os.path.join keeps the output inside `directory`; plain concatenation
    # turned the default "." into a ".name_cleaned.html" file name.
    output_path = os.path.join(directory, cleaned_file)
    with open(output_path, 'w') as cleaned_file_handle:
        cleaned_file_handle.write(result.encode('utf-8'))
Esempio n. 7
0
def clean(file_name, directory="."):
    """Extract the article from an HTML file and write a cleaned copy.

    The main content is obtained from ``Extractor``, the page title is
    re-added as ``<h1>`` when the article contains no ``h2``, every ``<h2>``
    is promoted to ``<h1>``, and a "Source" heading plus a link to the
    page's canonical URL are appended to ``div.post-content`` when that
    element is present. The serialized body is written to
    ``<basename>_cleaned.html`` inside *directory*.

    Args:
        file_name: Path of the UTF-8 encoded HTML file to clean.
        directory: Directory that receives the cleaned file.

    Raises:
        IndexError: If the page has no canonical ``<link>`` element.

    The process exits with status 1 when the extractor returns ``None``.
    """
    # Close the input handle deterministically instead of leaking it.
    with codecs.open(file_name, "r", 'utf-8') as source_handle:
        content = source_handle.read()

    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file")
        sys.exit(1)
    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')
    source_url = head_doc.cssselect('link[rel="canonical"]')[0].get('href')
    title = html_doc.find('.//title').text_content()

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        # Drop the part after the last dash; keep the whole title when it
        # has no dash (rfind() == -1 used to chop off the last character).
        dash_pos = title.rfind('-')
        page_title = title[:dash_pos] if dash_pos != -1 else title
        article = "<h1>" + page_title + "</h1>" + article

    reconstructed_body = "<html><body>" + article.replace(
        "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"
    body_doc = html.fromstring(reconstructed_body).find('body')

    # Build the attribution nodes directly instead of parsing a
    # string-concatenated fragment: a URL containing '&', '<' or a quote
    # would otherwise yield malformed XML and the link would be lost.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    try:
        post_content_doc = body_doc.xpath("//div[@class='post-content']")[0]
    except IndexError:
        # Best effort: report the file that has no post-content container
        # and still write out the extracted body below.
        print(file_name)
    else:
        post_content_doc.append(source_header)
        post_content_doc.append(source_paragraph)

    basename = os.path.basename(file_name)
    cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html"
    result = html.tostring(body_doc)
    # os.path.join keeps the output inside `directory`; plain concatenation
    # turned the default "." into a ".name_cleaned.html" file name.
    output_path = os.path.join(directory, cleaned_file)
    with open(output_path, 'w') as cleaned_file_handle:
        cleaned_file_handle.write(result.encode('utf-8'))
Esempio n. 8
0
def clean(file_name, directory="."):
    """Write a cleaned, attribution-tagged copy of an HTML article.

    Steps: read *file_name* as UTF-8, obtain the main content from
    ``Extractor``, prepend the page title as ``<h1>`` when the article
    contains no ``h2``, promote every ``<h2>`` to ``<h1>``, and append a
    "Source" heading and a link to the canonical URL to
    ``div.post-content`` if that element exists. The serialized body goes
    to ``<basename>_cleaned.html`` inside *directory*.

    Args:
        file_name: Path of the UTF-8 encoded HTML file to clean.
        directory: Directory that receives the cleaned file.

    Raises:
        IndexError: If the page has no canonical ``<link>`` element.

    The process exits with status 1 when the extractor returns ``None``.
    """
    # A context manager closes the input file even if reading fails.
    with codecs.open(file_name, "r", "utf-8") as source_handle:
        content = source_handle.read()

    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file")
        sys.exit(1)
    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find("head")
    source_url = head_doc.cssselect('link[rel="canonical"]')[0].get("href")
    title = html_doc.find(".//title").text_content()

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        # Cut the title at its last dash, but only when there is one:
        # slicing with rfind()'s -1 would silently drop the last character.
        dash_pos = title.rfind("-")
        if dash_pos == -1:
            page_title = title
        else:
            page_title = title[:dash_pos]
        article = "<h1>" + page_title + "</h1>" + article

    reconstructed_body = "<html><body>" + article.replace("<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"
    body_doc = html.fromstring(reconstructed_body).find("body")

    # Create the attribution elements through the lxml API so the URL is
    # escaped on serialization; parsing a concatenated string broke on
    # URLs containing '&', '<' or a quote.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    post_content_matches = body_doc.xpath("//div[@class='post-content']")
    if post_content_matches:
        post_content_doc = post_content_matches[0]
        post_content_doc.append(source_header)
        post_content_doc.append(source_paragraph)
    else:
        # Best effort: name the file without a post-content container and
        # still write out the extracted body below.
        print(file_name)

    basename = os.path.basename(file_name)
    cleaned_file = os.path.splitext(basename)[0] + "_cleaned.html"
    result = html.tostring(body_doc)
    # Join with the directory properly; "." + name produced a hidden
    # ".name_cleaned.html" file in the working directory.
    output_path = os.path.join(directory, cleaned_file)
    with open(output_path, "w") as cleaned_file_handle:
        cleaned_file_handle.write(result.encode("utf-8"))
Esempio n. 9
0
def clean(content):
    """Return a cleaned HTML body for the article contained in *content*.

    A UTF-8 ``Content-Type`` meta tag is inserted after ``<head>``, the main
    content is obtained from ``Extractor``, language tab titles are demoted
    from ``<h1>`` to bold paragraphs, ``<h2>`` is promoted to ``<h1>``, the
    page title is re-added when missing, clutter (comments, ads, ``<h3>``,
    ``class``/``title`` attributes on ``<pre>``) is removed, and a "Source"
    heading plus a link to the page's ``og:url`` are appended to
    ``div.entry-content``.

    Args:
        content: The HTML document as a string.

    Returns:
        The serialized ``<body>`` element, with the content of every
        ``<pre>`` wrapped in ``<code>``.

    Raises:
        IndexError: If the page has no ``og:url`` meta tag or the extracted
            article has no ``div.entry-content`` element.

    The process exits with status 1 when the extractor returns ``None``.
    """
    charset_meta = (
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
    head_tag = '<head>'

    # insert the encoding of the file; skipped when there is no literal
    # "<head>" tag, because find() returns -1 and the tag would otherwise
    # be spliced in at offset 5 of the document.
    head_pos = content.find(head_tag)
    if head_pos != -1:
        insert_at = head_pos + len(head_tag)
        content = content[:insert_at] + charset_meta + content[insert_at:]
    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file.")
        sys.exit(1)

    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')

    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get(
        'content')
    title = html_doc.find('.//title').text_content()

    # Cut the title at its last dash, but only when there is one: slicing
    # with rfind()'s -1 would silently drop the last character.
    dash_pos = title.rfind('-')
    if dash_pos == -1:
        page_title = title
    else:
        page_title = title[:dash_pos]

    # Demote the code-tab captions so they are not mistaken for the title.
    for tab_name in ('C++', 'C', 'C/C++', 'Java', 'Python'):
        article = article.replace(
            '<h1 class="tabtitle">' + tab_name + '</h1>',
            '<p><strong>' + tab_name + '</strong></p>')

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + page_title + "</h1>" + article

    reconstructed_body = "<html><body>" + article.replace(
        "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"

    # Second chance: make sure the body really starts with a heading.
    if "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + page_title + "</h1>")

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    unwanted_xpaths = (
        "//div[@class='comments-main']",
        "//ins[@class='adsbygoogle']",
        "//h3",
    )
    for unwanted_xpath in unwanted_xpaths:
        for unwanted in body_doc.xpath(unwanted_xpath):
            unwanted.getparent().remove(unwanted)
    for pre_tag in body_doc.xpath("//pre"):
        # Bare <pre> tags are required by the textual replacement below.
        for attribute in ('class', 'title'):
            pre_tag.attrib.pop(attribute, None)

    # Create the attribution elements through the lxml API so the URL is
    # escaped on serialization; parsing a concatenated string broke on
    # URLs containing '&', '<' or a quote.
    source_header = lxml.etree.Element("h3")
    source_header.text = "Source"
    source_paragraph = lxml.etree.Element("p")
    source_anchor = lxml.etree.SubElement(source_paragraph, "a")
    source_anchor.set("href", source_url)
    source_anchor.set("rel", "tag")
    source_anchor.text = source_url

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(source_header)
    post_content_doc.append(source_paragraph)
    result = html.tostring(body_doc)

    # wrap the content of every <pre> in <code> for styling later.
    result = result.replace('<pre>',
                            '<pre> <code>').replace('</pre>', '</code> </pre>')

    return result