Example #1
0
def translate_page(request: HttpRequest):
    """
    Render the translation page from either a ``url`` or a ``file_id``
    query parameter (checked in that order).

    ``url``     -- fetch the page, extract the readable article with
                   readability, and create or reuse a per-user
                   UserPreference record keyed by the article's MD5 hash.
    ``file_id`` -- primary key of an existing UserPreference record.

    Redirects to '/' when neither parameter yields a page.

    :param request: incoming Django request; ``request.user`` is used to
                    scope stored articles to the current user.
    :return: an HttpResponse (rendered template or redirect).
    """
    url = request.GET.get('url')
    file_id = request.GET.get('file_id')
    if url:  # a URL was supplied
        # Default to http:// when no scheme is present.  The previous
        # `'http' not in url` test wrongly skipped URLs that merely
        # *contain* the substring "http" (e.g. "example.com/http-faq").
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        try:
            # Fetch the page; the context manager closes the connection.
            with urllib.request.urlopen(url) as f:
                text = f.read().decode()
            # Extract the readable article HTML.
            article = Document(text)
            html = article.get_clean_html()
            # Hash the cleaned HTML so identical articles map to one record.
            hashed = hashlib.md5(html.encode('utf-8')).hexdigest()
            try:
                # Reuse this user's existing record for the same article.
                p = UserPreference.objects.get(hashed=hashed,
                                               user=request.user)
            except UserPreference.DoesNotExist:
                # First time this user sees this article: store it with the
                # translation initialised to the original text.
                p = UserPreference(user=request.user,
                                   original=text,
                                   trans=text,
                                   hashed=hashed)
                p.save()
            # The template expects the text base64-encoded.
            text = base64.b64encode(p.trans.encode('utf-8')).decode()
            return render(request,
                          'translate.html',
                          context={
                              'text': text,
                              'hashed': hashed,
                              'inc': p.inc
                          })
        except Exception:
            # Best-effort: any failure (network, parsing, DB) sends the
            # user back to the index with a generic message.
            return redirect('/translate_index?message=' + _("Cannot open url"))
    elif file_id:
        try:
            p = UserPreference.objects.get(pk=file_id)
            text = base64.b64encode(p.trans.encode('utf-8')).decode()
            return render(request,
                          'translate.html',
                          context={
                              'text': text,
                              'hashed': p.hashed,
                              'inc': p.inc
                          })
        except UserPreference.DoesNotExist:
            pass  # unknown file_id: fall through to the '/' redirect
    return redirect('/')
Example #2
0
def clean(content):
    """
    Clean raw HTML bytes into a simplified article ``<body>`` string.

    Pipeline: readability extracts the main article HTML; lxml then
    strips site chrome (nav, footers, ads, comment/share widgets, ...),
    promotes the page title to an <h1>, converts syntax-highlighter
    tables into plain <pre><code> blocks, and appends a "Source" link
    taken from the page's ``og:url`` metadata when available.

    :param content: raw page bytes, assumed to be UTF-8 encoded.
    :return: cleaned ``<body>...</body>`` HTML as a string.
    """
    content = content.decode("utf-8")

    try:
        # Use Python Readability to clean up the HTML
        doc = Document(content)
        article = doc.get_clean_html()
    except Exception:  # noqa
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit.
        print("Error cleaning up the html.")
        sys.exit(1)

    # LXML parsing is used to get title and meta head info from HTML
    html_doc = html.fromstring(content,
                               parser=html.HTMLParser(encoding="utf-8"))
    head_doc = html_doc.find('head')

    reconstructed_body = "<html><body>" + article + "</body></html>"

    # Get title so it can be added as an H1 tag, but remove it from
    # the html itself - so that Pandoc doesn't use it
    title = html_doc.find('.//title')
    title.getparent().remove(title)
    title = title.text_content()

    # Add in the title
    if "<body><h1>" not in reconstructed_body:
        # Strip a trailing "- Site Name" suffix when present.  The old
        # unconditional `title[:title.rfind('-')]` chopped the last
        # character off any title that contained no dash (rfind -> -1).
        dash = title.rfind('-')
        heading = title[:dash] if dash != -1 else title
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + heading + "</h1>")

    # Remove stuff that readability didn't remove
    body_doc = html.fromstring(reconstructed_body).find('body')

    bad_tags = (body_doc.xpath("//button") + body_doc.xpath("//nav") +
                body_doc.xpath("//footer") +
                body_doc.xpath("//div[@id='page']") +
                body_doc.xpath("//form[@id='interview_experience_form']") +
                body_doc.xpath("//div[@id='author']") +
                body_doc.xpath("//div[@id='share-buttons']") +
                body_doc.xpath("//div[@id='ide_link']") +
                body_doc.xpath("//div[@id='disqus_thread']") +
                body_doc.xpath("//div[@id='secondary']") +
                body_doc.xpath("//div[@id='personalNoteDiv']") +
                body_doc.xpath("//div[@id='practiceLinkDiv']") +
                body_doc.xpath("//div[@class='leftSideBarParent']") +
                body_doc.xpath("//div[@class='author_info_box']") +
                body_doc.xpath("//div[@class='plugins']") +
                body_doc.xpath("//div[@class='no-p-tag']") +
                body_doc.xpath("//div[@class='comments-main']") +
                body_doc.xpath("//ins[@class='adsbygoogle']") +
                body_doc.xpath("//h1[@class='entry-title']") +
                body_doc.xpath("//hr") + body_doc.xpath("//h3") +
                body_doc.xpath("//h2"))

    for tag in bad_tags:
        tag.getparent().remove(tag)

    # Remove tags that start with some text - along with their parent
    bad_tags = (body_doc.xpath('//h1[starts-with(text(),"Recommended")]'))
    for tag in bad_tags:
        parent = tag.getparent()
        parent.getparent().remove(parent)

    # Convert all H1 language tags to p tags
    for lang_h1 in body_doc.xpath("//h1[@class='tabtitle']"):
        lang_p = '<p><strong>%s</strong></p>' % lang_h1.text_content()
        lang_h1.addnext(lxml.etree.XML(lang_p))
        lang_h1.getparent().remove(lang_h1)

    # Not too sure if this is needed - but at this point
    # I don't want to remove any code that works
    for pre_tag in body_doc.xpath("//pre"):
        if 'class' in pre_tag.attrib:
            pre_tag.attrib.pop('class')
        if 'title' in pre_tag.attrib:
            pre_tag.attrib.pop('title')

    try:
        # Add Source link to doc - this may fail for various reasons
        # (missing og:url meta, missing entry-content div, bad XML).
        src_url = head_doc.cssselect('meta[property="og:url"]')[0].get(
            'content')  # noqa
        src_link = "<p><a href='" + src_url + "' rel='tag'>" + src_url + "</a></p>"  # noqa
        post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
        post_content_doc.append(lxml.etree.XML("<h3>Source</h3>"))
        post_content_doc.append(lxml.etree.XML(src_link))
    except Exception:  # noqa
        # Deliberately best-effort: the Source link is optional.
        # Narrowed from a bare `except:` so Ctrl-C still works.
        pass

    # Code in the HTML is in the form of a table
    # We convert the table into a single pre / code tag
    for code_tag in body_doc.xpath('//div[starts-with(@id,"highlighter")]'):
        code = str(code_tag.text_content()).replace("\n\n", "")
        code = html_escape(code)
        code = "<pre> <code>" + code + "</code> </pre>"
        code_tag.addnext(lxml.etree.XML(code))
        code_tag.getparent().remove(code_tag)

    result = html.tostring(body_doc).decode("utf-8")

    return result
Example #3
0
import requests

from readability import Document

from summarizer import Summarizer
from summarizer.coreference_handler import CoreferenceHandler

# Coreference handler for the (currently disabled) coreference-aware
# summarizer below; `greedyness` is passed through to the underlying
# coreference resolver — presumably a match-threshold knob, TODO confirm.
handler = CoreferenceHandler(greedyness=.4)

SAMPLE_URL = 'https://www.nytimes.com/2020/07/14/opinion/coronavirus-shutdown.html'

# Fetch the sample article over the network.
r = requests.get(SAMPLE_URL)

# Extract the readable article: print its title, keep the cleaned HTML body.
doc = Document(r.text)
print(doc.title())
body = doc.get_clean_html()
#print(body)

# Coreference-aware variant kept for reference; plain Summarizer is used.
#model = Summarizer(sentence_handler=handler)
model = Summarizer()

# Print an extractive summary of the article body.
print(model(body))