Code example #1
def parseArticles(listoflinks):
    """
	input list of links, returned as list of plain txt
	"""
    text = []
    for link in links:
        # clean up html, getting rid of unwanted text
        html1 = urllib.urlopen(link).read()
        readable_title = readability.Document(html1).short_title()
        readable_article = readability.Document(html1).summary()

        # parse html
        article_soupify = BeautifulSoup(readable_article, "lxml")
        text.append(readable_title + article_soupify.get_text())
    return text
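
A minimal usage sketch for parseArticles (hypothetical links; assumes the urllib, readability, and BeautifulSoup imports the function relies on):

links = [
    "http://example.com/post-1",  # hypothetical URLs
    "http://example.com/post-2",
]
plain_texts = parseArticles(links)  # one plain-text string per link, title prepended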
Code example #2
File: html.py Project: shpotes/paper2remarkable
    def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article in a clean pdf file"""
        # Steps
        # 1. Pull the HTML page using requests
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            with open("./paper.html", "w") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config)
Code example #3
def main():
    #url = "http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/"
    url = "http://antirez.com/post/take-advantage-of-redis-adding-it-to-your-stack.html"
    html = retrive_page(url)
    readable_html = readability.Document(html).summary()
    text = html2text(readable_html)
    print text
Code example #4
File: html.py Project: arossbach10/paper2remarkable
def make_readable(request_html):
    """Use an extraction method to get the main article html

    This function checks if ReadabiliPy is installed with NodeJS support, as
    that generally yields better results. If that is not available, it falls
    back on readability.
    """

    have_readabilipy_js = False
    try:
        import readabilipy

        have_readabilipy_js = readabilipy.simple_json.have_node()
    except ImportError:
        pass

    if have_readabilipy_js:
        logger.info("Converting HTML using Readability.js")
        article = readabilipy.simple_json_from_html_string(
            request_html, use_readability=True)
        title = article["title"]
        raw_html = article["content"]
    else:
        logger.info("Converting HTML using readability")
        doc = readability.Document(request_html)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)
    return title, raw_html
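
A minimal usage sketch for make_readable (hypothetical URL; assumes requests is installed and that the surrounding module provides the readability import and logger that make_readable relies on):

import requests

page_html = requests.get("https://example.com/article").text  # hypothetical URL
title, raw_html = make_readable(page_html)
print(title)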
Code example #5
def getRequirements(url: str, sourcetype: str) -> list:
    """Runs the single-link main function."""
    result = str()
    results = list()
    try:
        if sourcetype == "html":
            parser = newspaper.build(url)
            for article in parser.articles:
                a = newspaper.Article(article.url)
                a.download()
                a.parse()
                a.nlp()
                doc = readability.Document(a.html)
                print(doc)
                # print(doc.summary())
                # results = extractRequirements(doc.summary())
                results = extractRequirements(doc)
        elif sourcetype == "text":
            bytesText = simpleGet(url)
            results = extractRequirements(bytesText.decode("utf-8"))
    except Exception as e:
        logging.exception(e)
    finally:
        print(result)
        # result = "".join(results) + "\n"
        # return result
        return results
Code example #6
def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
Code example #7
def get_content(html: str) -> Tuple[str, str]:
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
Code example #8
    def parse_article(self, url, html):
        rdoc = readability.Document(html)
        summary = rdoc.summary()
        lang_id, _ = langid.classify(summary)
        article = newspaper.Article(url, config=self.config, language=lang_id)
        article.set_html(html)
        article.parse()
        return article
Code example #9
File: get_fulltext.py Project: newsbias/crawler
def extract_fulltext(url):
    resp = requests.get(url)
    doc = readability.Document(resp.text)

    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)

    return body.text_content()
Code example #10
def write_readable(base_path, fetcher, url_map, url):
    orig = fetcher.urlread(url)

    options = {'url': url, 'urlfetch': fetcher}
    rdbl_doc = readability.Document(orig, **options)
    summary = rdbl_doc.summary()

    path = make_readable_path(base_path, url_map, url)
    return write_file(path, summary.html)
Code example #11
def store_pretty(url):
    r = requests.get(url)
    html = r.text
    doc = rd.Document(html)
    article = doc.summary()
    soup = BeautifulSoup(article, 'html.parser')
    uni = soup.get_text(strip=True)
    cleaner = unidecode(uni)
    return cleaner
Code example #12
def get_content(html):
    # Get a (title, body) tuple from the HTML string
    document = readability.Document(html)
    content_html = document.summary()

    # Strip the HTML tags and keep only the body text
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text
Code example #13
def summarize_html(html_text: str) -> str:
    """
    Uses readability to summarize the HTML response into a summary
    """
    if html_text.strip() == "":
        raise URLCacheException("No html provided to summarize")
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    return summary
Code example #14
def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    content_clean = re.sub('[\t\r\n]', '', content_text)
    content_strip = " ".join(content_clean.split())
    #content_final=re.sub(r'\D{2}\s\d{4}.\d{2}.\d{2}\s.{5}','',content_strip)

    return content_strip
Code example #15
def fetch_url(url):
    """
    Fetch a URL and extract its title and plain text with readability.
    """
    html = urllib.request.urlopen(url).read()
    readable_html = readability.Document(html)
    readable_article = readable_html.summary()
    title = readable_html.short_title()
    text = BeautifulSoup(readable_article, "lxml").get_text()
    return title, text
Code example #16
def get_content(html: str) -> Tuple[str, str]:
    """
    Get a (title, body) tuple from the HTML string.
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and keep only the body text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
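
A minimal usage sketch for this get_content variant (hypothetical URL; assumes requests is installed alongside the readability and lxml imports the function relies on):

import requests

resp = requests.get("https://example.com/article")  # hypothetical URL
title, body = get_content(resp.text)
print(title)
print(body[:200])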
Code example #17
def get_content(html):
    """
    Extract a (title, body) tuple from HTML.
    """
    document = readability.Document(html)
    content_html = document.summary()

    content_text = lxml.html.fromstring(content_html).text_content().strip()
    title = document.short_title()

    return title, content_text
Code example #18
File: utils.py Project: eungay/iitp18-multicampus
def get_content(html):
    """
    Find and return a (<title>, <body>) tuple from the HTML string.
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and extract only the text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    
    return short_title, content_text
Code example #19
def parse_html_string(html_string):
    # Parse out title and body text
    document = readability.Document(html_string)

    # TODO(ajayjain): use document.short_title()?
    title = document.title()
    body_html = document.summary(html_partial=True)
    body_text = BeautifulSoup(body_html, 'lxml').get_text().strip()
    parsed = ParsedDocument(title=title, content=body_text)

    return parsed
Code example #20
def fprocess(entry):
    guid = entry.guid
    title = entry.title.split(" - ")[0]
    published = entry.published
    source = entry.source.title
    link = entry.link

    web_content = readability.Document(requests.get(link).text)
    summary = translate_html(web_content.summary())

    newsStory = NewsStory(guid, title, summary, published, source, link)
    return newsStory
Code example #21
    def get_content(self):
        """
        Get the title and body from the HTML string.
        """
        document = readability.Document(self.response.content)
        title = document.title()
        content_html = document.summary()
        content_text = lxml.html.fromstring(
            content_html).text_content().strip()

        self.title = title
        self.body = content_text
Code example #22
File: extract_articles.py Project: newsbias/crawler
def extract_article_text(article):
    resp = requests.get(article['url'])
    doc = readability.Document(resp.text)

    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)

    return {
        'title': doc.title(),
        'clean_html': summary,
        'body_text': body.text_content()
    }
Code example #23
def execute_test(test_data):
    if test_data is None:
        return None
    else:
        base_path = os.path.join(TEST_DATA_PATH, test_data.test.name)
        fetcher = urlfetch.MockUrlFetch(base_path, test_data.test.url_map)
        doc = readability.Document(test_data.orig_html,
                                   url=test_data.test.url,
                                   urlfetch=fetcher)
        summary = doc.summary()
        diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
        return ReadabilityTestResult(test_data, summary.html, diff)
Code example #24
    def extract(self, url: str, html_text: str):
        doc = readability.Document(html_text)
        self.content = {
            'url': url,
            'text': re.sub('<[^<]+?>', '',
                           doc.summary()),  # need to remove any tags
            'title': doc.title(),
            'publish_date': None,
            'top_image_url': None,
            'authors': None,
            'extraction_method': METHOD_READABILITY,
        }
Code example #25
def get_clean_text(html):
    """
    Generate clean text for the given HTML.
    """
    doc = readability.Document(html)
    try:
        doc._html()
        clean = doc.get_clean_html()
    except Exception as e:
        print(e)
        clean = html
    bsObj = bs(clean)
    return bsObj.get_text()
Code example #26
    def get_filename(self, abs_url):
        request_text = get_page_with_retry(abs_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()

        # Clean the title and make it titlecase
        title = clean_string(title)
        title = titlecase.titlecase(title)
        title = title.replace(" ", "_")
        title = clean_string(title)
        name = title.strip("_") + ".pdf"
        name = unidecode.unidecode(name)
        logger.info("Created filename: %s" % name)
        return name
Code example #27
def retrieve_url(url):

    # set a "real" user agent
    firefox = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0"

    # retrieve the main text section from the url using the readability module and a Firefox user agent
    req = requests.get(url, headers={'User-Agent': firefox})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)

    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text().strip('\n').encode('utf-8')

    return str(rawhtml), str(cleantext)
Code example #28
def retrieve_url(url):

    # get a "real" user agent
    ua = fake_useragent.UserAgent()
    chrome = ua.chrome

    # retrieve the main text section from the url using the readability module and using the Chrome user agent
    req = requests.get(url, headers={'User-Agent': chrome})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)

    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text().encode('utf-8')

    return str(rawhtml), str(cleantext)
Code example #29
File: PageClassfier.py Project: hp027/gist
def readability_test(idxs, dist_path="pages"):
    lite_pages = []
    fat_pages = []
    for idx in idxs:
        c = file("%s/%s" % (dist_path, idx['md5'])).read()
        l = len(readability.Document(c).summary())
        if l < 200:
            lite_pages.append((l, idx['url']))
        elif l > 400:
            fat_pages.append((l, idx['url']))
            # print idx['url']
    for l in lite_pages:
        print l
    print "________________________________________________"
    for f in fat_pages:
        print f
Code example #30
def summarize_html(html_text: str) -> str:
    """
    Uses readability to summarize the HTML response into a summary,
    then lxml to remove unnecessary attributes on all elements
    """
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    # remove class/id attributes
    tree = lxml.html.fromstring(summary)
    ctree = cleaner.clean_html(tree)
    # clean_html may replace the top-level <html>
    # element with a div; set it back to html if possible
    if ctree.tag == "div":
        ctree.tag = "html"
    html_bytes: bytes = lxml.html.tostring(ctree)
    # should html.unescape be called here? Or should that be handled
    # elsewhere/when parsing into text
    return html_bytes.decode("utf-8")