def parseArticles(listoflinks):
    """Take a list of links and return a list of plain-text articles."""
    text = []
    for link in listoflinks:
        # Clean up the HTML, getting rid of unwanted text
        html1 = urllib.request.urlopen(link).read()
        readable_title = readability.Document(html1).short_title()
        readable_article = readability.Document(html1).summary()
        # Parse the extracted article HTML
        article_soupify = BeautifulSoup(readable_article, "lxml")
        text.append(readable_title + article_soupify.get_text())
    return text

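# Hedged usage sketch for parseArticles above; the URLs are illustrative and
# the imports are the ones the function relies on at module level.
import urllib.request
import readability
from bs4 import BeautifulSoup

texts = parseArticles([
    "https://example.com/post-1",
    "https://example.com/post-2",
])
for t in texts:
    print(t[:80])
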
def retrieve_pdf(self, pdf_url, filename):
    """Turn the HTML article into a clean PDF file"""
    # Steps
    # 1. Pull the HTML page using requests
    # 2. Extract the article part of the page using readability
    # 3. Convert the article HTML to markdown using html2text
    # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
    # 5. Convert the HTML to PDF, pulling in images where needed
    # 6. Save the PDF to the specified filename.
    request_text = get_page_with_retry(pdf_url, return_text=True)
    doc = readability.Document(request_text)
    title = doc.title()
    raw_html = doc.summary(html_partial=True)
    h2t = html2text.HTML2Text()
    h2t.wrap_links = False
    text = h2t.handle(raw_html)
    # Add the title back to the document
    article = "# {title}\n\n{text}".format(title=title, text=text)
    # Convert to HTML, fixing relative image URLs.
    md = markdown.Markdown()
    md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
    html_article = md.convert(article)
    if self.debug:
        with open("./paper.html", "w") as fp:
            fp.write(html_article)
    font_config = weasyprint.fonts.FontConfiguration()
    html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
    css = weasyprint.CSS(string=CSS, font_config=font_config)
    html.write_pdf(filename, stylesheets=[css], font_config=font_config)

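# A minimal, self-contained sketch of the readability -> html2text -> markdown
# sanitizing round-trip that retrieve_pdf builds on, leaving out the image
# fixing, CSS, and weasyprint steps. The function name and URL handling here
# are illustrative, not part of the original class.
import html2text
import markdown
import readability
import requests

def sanitize_article_html(url):
    response = requests.get(url)
    doc = readability.Document(response.text)
    raw_html = doc.summary(html_partial=True)
    # html2text drops markup that markdown cannot represent, so converting
    # to markdown and back yields sanitized article HTML.
    text = html2text.HTML2Text().handle(raw_html)
    article = "# {title}\n\n{text}".format(title=doc.title(), text=text)
    return markdown.markdown(article)
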
def main():
    # url = "http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/"
    url = "http://antirez.com/post/take-advantage-of-redis-adding-it-to-your-stack.html"
    html = retrive_page(url)
    readable_html = readability.Document(html).summary()
    text = html2text(readable_html)
    print(text)

def make_readable(request_html):
    """Use an extraction method to get the main article html

    This function checks if ReadabiliPy is installed with NodeJS support,
    as that generally yields better results. If that is not available, it
    falls back on readability.
    """
    have_readabilipy_js = False
    try:
        import readabilipy

        have_readabilipy_js = readabilipy.simple_json.have_node()
    except ImportError:
        pass
    if have_readabilipy_js:
        logger.info("Converting HTML using Readability.js")
        article = readabilipy.simple_json_from_html_string(
            request_html, use_readability=True
        )
        title = article["title"]
        raw_html = article["content"]
    else:
        logger.info("Converting HTML using readability")
        doc = readability.Document(request_html)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)
    return title, raw_html

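# Usage sketch for make_readable, assuming requests is available and that the
# module-level logger and readability import the function expects are set up.
import requests

response = requests.get("https://example.com/some-article")  # illustrative URL
title, raw_html = make_readable(response.text)
print(title)
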
def getRequirements(url: str, sourcetype: str) -> list:
    """Runs the single-link main function."""
    result = str()
    results = list()
    try:
        if sourcetype == "html":
            parser = newspaper.build(url)
            for article in parser.articles:
                a = newspaper.Article(article.url)
                a.download()
                a.parse()
                a.nlp()
                doc = readability.Document(a.html)
                print(doc)
                # print(doc.summary())
                # results = extractRequirements(doc.summary())
                results = extractRequirements(doc)
        elif sourcetype == "text":
            bytesText = simpleGet(url)
            results = extractRequirements(bytesText.decode("utf-8"))
    except Exception as e:
        logging.exception(e)
    finally:
        print(result)
        # result = "".join(results) + "\n"
        # return result
        return results

def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text

def get_content(html: str) -> Tuple[str, str]:
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text

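# Usage sketch shared by the get_content helpers in this section: fetch a page
# (requests is an assumption; the snippets do not show how the HTML arrives)
# and split it into a short title and a plain-text body.
import requests

html = requests.get("https://example.com/article").text  # illustrative URL
short_title, content_text = get_content(html)
print(short_title)
print(content_text[:200])
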
def parse_article(self, url, html):
    # Classify the language of the readability summary so that newspaper
    # parses the article with the matching language model.
    rdoc = readability.Document(html)
    summary = rdoc.summary()
    lang_id, _ = langid.classify(summary)
    article = newspaper.Article(url, config=self.config, language=lang_id)
    article.set_html(html)
    article.parse()
    return article

def extract_fulltext(url):
    resp = requests.get(url)
    doc = readability.Document(resp.text)
    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)
    return body.text_content()

def write_readable(base_path, fetcher, url_map, url):
    orig = fetcher.urlread(url)
    options = {'url': url, 'urlfetch': fetcher}
    rdbl_doc = readability.Document(orig, **options)
    summary = rdbl_doc.summary()
    path = make_readable_path(base_path, url_map, url)
    return write_file(path, summary.html)

def store_pretty(url):
    r = requests.get(url)
    html = r.text
    doc = rd.Document(html)
    article = doc.summary()
    soup = BeautifulSoup(article, 'html.parser')
    uni = soup.get_text(strip=True)
    cleaner = unidecode(uni)
    return cleaner

def get_content(html):
    # Get a (title, body) tuple from an HTML string
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and keep only the body text
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text

def summarize_html(html_text: str) -> str:
    """
    Uses readability to reduce the HTML response to its article summary
    """
    if html_text.strip() == "":
        raise URLCacheException("No html provided to summarize")
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    return summary

def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    content_clean = re.sub('[\t\r\n]', '', content_text)
    content_strip = " ".join(content_clean.split())
    # content_final = re.sub(r'\D{2}\s\d{4}.\d{2}.\d{2}\s.{5}', '', content_strip)
    return content_strip

def fetch_url(url):
    """Fetch a URL and extract its title and article text with readability."""
    html = urllib.request.urlopen(url).read()
    readable_html = readability.Document(html)
    readable_article = readable_html.summary()
    title = readable_html.short_title()
    text = BeautifulSoup(readable_article, "lxml").get_text()
    return title, text

def get_content(html: str) -> Tuple[str, str]:
    """
    Get a (title, body) tuple from an HTML string.
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and keep only the body text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text

def get_content(html):
    """
    Extract a (title, body) tuple from HTML.
    """
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    title = document.short_title()
    return title, content_text

def get_content(html):
    """
    Find and return a (title, body) tuple from an HTML string.
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Remove the HTML tags and extract only the text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text

def parse_html_string(html_string):
    # Parse out title and body text
    document = readability.Document(html_string)
    # TODO(ajayjain): use document.short_title()?
    title = document.title()
    body_html = document.summary(html_partial=True)
    body_text = BeautifulSoup(body_html, 'lxml').get_text().strip()
    parsed = ParsedDocument(title=title, content=body_text)
    return parsed

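# parse_html_string returns a ParsedDocument, whose definition is not shown
# above; a minimal stand-in (an assumption, not the original class) could be:
from collections import namedtuple

ParsedDocument = namedtuple("ParsedDocument", ["title", "content"])
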
def fprocess(entry):
    guid = entry.guid
    # Keep only the headline, dropping any " - <source>" suffix.
    title = entry.title.split(" - ")[0]
    published = entry.published
    source = entry.source.title
    link = entry.link
    web_content = readability.Document(requests.get(link).text)
    summary = translate_html(web_content.summary())
    newsStory = NewsStory(guid, title, summary, published, source, link)
    return newsStory

def get_content(self):
    """
    Get the title and body text from the HTML string of the response.
    """
    document = readability.Document(self.response.content)
    title = document.title()
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    self.title = title
    self.body = content_text

def extract_article_text(article):
    resp = requests.get(article['url'])
    doc = readability.Document(resp.text)
    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)
    return {
        'title': doc.title(),
        'clean_html': summary,
        'body_text': body.text_content(),
    }

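# Usage sketch for extract_article_text; the article mapping only needs a
# 'url' key, and the URL below is illustrative.
result = extract_article_text({"url": "https://example.com/story"})
print(result["title"])
print(result["body_text"][:200])
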
def execute_test(test_data):
    if test_data is None:
        return None
    else:
        base_path = os.path.join(TEST_DATA_PATH, test_data.test.name)
        fetcher = urlfetch.MockUrlFetch(base_path, test_data.test.url_map)
        doc = readability.Document(test_data.orig_html,
                                   url=test_data.test.url,
                                   urlfetch=fetcher)
        summary = doc.summary()
        diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
        return ReadabilityTestResult(test_data, summary.html, diff)

def extract(self, url: str, html_text: str):
    doc = readability.Document(html_text)
    self.content = {
        'url': url,
        'text': re.sub('<[^<]+?>', '', doc.summary()),  # need to remove any tags
        'title': doc.title(),
        'publish_date': None,
        'top_image_url': None,
        'authors': None,
        'extraction_method': METHOD_READABILITY,
    }

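# The regex above is a quick way to strip tags, but it can misfire on
# malformed markup. A more robust alternative (not what extract() uses) is to
# let lxml parse the summary and pull out the text; the helper name below is
# illustrative.
import lxml.html
import readability

def summary_text(html_text: str) -> str:
    doc = readability.Document(html_text)
    return lxml.html.fromstring(doc.summary()).text_content().strip()
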
def get_clean_text(html):
    """ generate clean text for given html """
    doc = readability.Document(html)
    try:
        doc._html()  # force readability to parse the document
        clean = doc.get_clean_html()
    except Exception as e:
        print(e)
        clean = html
    bsObj = bs(clean, "lxml")
    return bsObj.get_text()

def get_filename(self, abs_url):
    request_text = get_page_with_retry(abs_url, return_text=True)
    doc = readability.Document(request_text)
    title = doc.title()
    # Clean the title and make it titlecase
    title = clean_string(title)
    title = titlecase.titlecase(title)
    title = title.replace(" ", "_")
    title = clean_string(title)
    name = title.strip("_") + ".pdf"
    name = unidecode.unidecode(name)
    logger.info("Created filename: %s" % name)
    return name

def retrieve_url(url):
    # set a "real" user agent
    firefox = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0"
    # retrieve the main text section from the url using the readability
    # module and the Firefox user agent
    req = requests.get(url, headers={'User-Agent': firefox})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)
    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text().strip('\n')
    return rawhtml, cleantext

def retrieve_url(url):
    # get a "real" user agent
    ua = fake_useragent.UserAgent()
    chrome = ua.chrome
    # retrieve the main text section from the url using the readability
    # module and the Chrome user agent
    req = requests.get(url, headers={'User-Agent': chrome})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)
    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text()
    return rawhtml, cleantext

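# Usage sketch for the retrieve_url variants above; the URL is illustrative
# and the imports mirror what the functions assume at module level.
import fake_useragent
import requests
import readability
from bs4 import BeautifulSoup

rawhtml, cleantext = retrieve_url("https://example.com/story")
print(cleantext[:200])
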
def readability_test(idxs, dist_path="pages"): lite_pages = [] fat_pages = [] for idx in idxs: c = file("%s/%s" % (dist_path, idx['md5'])).read() l = len(readability.Document(idx['url']).summary()) if l < 200: lite_pages.append((l, idx['url'])) elif l > 400: fat_pages.append((l, idx['url'])) # print idx['url'] for l in lite_pages: print l print "________________________________________________" for f in fat_pages: print f
def summarize_html(html_text: str) -> str:
    """
    Uses readability to reduce the HTML response to its article summary,
    then lxml to remove unnecessary attributes on all elements
    """
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    # remove class/id attributes
    tree = lxml.html.fromstring(summary)
    ctree = cleaner.clean_html(tree)
    # clean_html may replace the top-level element with a div; set it
    # back to html if possible
    if ctree.tag == "div":
        ctree.tag = "html"
    html_bytes: bytes = lxml.html.tostring(ctree)
    # should html.unescape be called here? Or should that be handled
    # elsewhere/when parsing into text
    return html_bytes.decode("utf-8")

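# summarize_html refers to a module-level `cleaner` that the snippet does not
# show; a plausible definition (an assumption, not the original code) is an
# lxml Cleaner configured to drop class/id attributes, matching the comment
# above:
import lxml.html.defs
from lxml.html.clean import Cleaner

cleaner = Cleaner(
    safe_attrs_only=True,
    safe_attrs=lxml.html.defs.safe_attrs - {"class", "id"},
)
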