Code example #1
def parseArticles(listoflinks):
    """
	input list of links, returned as list of plain txt
	"""
    text = []
    for link in links:
        # clean up html, getting rid of unwanted text
        html1 = urllib.urlopen(link).read()
        readable_title = readability.Document(html1).short_title()
        readable_article = readability.Document(html1).summary()

        # parse html
        article_soupify = BeautifulSoup(readable_article, "lxml")
        text.append(readable_title + article_soupify.get_text())
    return text
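
A minimal usage sketch for parseArticles (hypothetical links; assumes the urllib, readability, and BeautifulSoup imports the function relies on):

links = [
    "http://example.com/post-1",  # hypothetical URLs
    "http://example.com/post-2",
]
plain_texts = parseArticles(links)  # one plain-text string per link, title prepended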
Code example #2
File: html.py Project: shpotes/paper2remarkable
    def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article in a clean pdf file"""
        # Steps
        # 1. Pull the HTML page using requests
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        if self.debug:
            with open("./paper.html", "w") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config)
Code example #3
def main():
    #url = "http://johnpaulett.com/2009/10/15/html-to-restructured-text-in-python-using-pandoc/"
    url = "http://antirez.com/post/take-advantage-of-redis-adding-it-to-your-stack.html"
    html = retrive_page(url)
    readable_html = readability.Document(html).summary()
    text = html2text(readable_html)
    print text
Code example #4
File: html.py Project: arossbach10/paper2remarkable
def make_readable(request_html):
    """Use an extraction method to get the main article html

    This function checks if ReadabiliPy is installed with NodeJS support, as
    that generally yields better results. If that is not available, it falls
    back on readability.
    """

    have_readabilipy_js = False
    try:
        import readabilipy

        have_readabilipy_js = readabilipy.simple_json.have_node()
    except ImportError:
        pass

    if have_readabilipy_js:
        logger.info("Converting HTML using Readability.js")
        article = readabilipy.simple_json_from_html_string(
            request_html, use_readability=True)
        title = article["title"]
        raw_html = article["content"]
    else:
        logger.info("Converting HTML using readability")
        doc = readability.Document(request_html)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)
    return title, raw_html
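
A minimal usage sketch for make_readable (hypothetical URL; assumes requests is installed and that the surrounding module provides the readability import and logger that make_readable relies on):

import requests

page_html = requests.get("https://example.com/article").text  # hypothetical URL
title, raw_html = make_readable(page_html)
print(title)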
Code example #5
def getRequirements(url: str, sourcetype: str) -> list:
    """Runs the single-link main function."""
    result = str()
    results = list()
    try:
        if sourcetype == "html":
            parser = newspaper.build(url)
            for article in parser.articles:
                a = newspaper.Article(article.url)
                a.download()
                a.parse()
                a.nlp()
                doc = readability.Document(a.html)
                print(doc)
                # print(doc.summary())
                # results = extractRequirements(doc.summary())
                results = extractRequirements(doc)
        elif sourcetype == "text":
            bytesText = simpleGet(url)
            results = extractRequirements(bytesText.decode("utf-8"))
    except Exception as e:
        logging.exception(e)
    finally:
        print(result)
        # result = "".join(results) + "\n"
        # return result
        return results
Code example #6
def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
Code example #7
def get_content(html: str) -> Tuple[str, str]:
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
Code example #8
    def parse_article(self, url, html):
        rdoc = readability.Document(html)
        summary = rdoc.summary()
        lang_id, _ = langid.classify(summary)
        article = newspaper.Article(url, config=self.config, language=lang_id)
        article.set_html(html)
        article.parse()
        return article
Code example #9
File: get_fulltext.py Project: newsbias/crawler
def extract_fulltext(url):
    resp = requests.get(url)
    doc = readability.Document(resp.text)

    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)

    return body.text_content()
Code example #10
def write_readable(base_path, fetcher, url_map, url):
    orig = fetcher.urlread(url)

    options = {'url': url, 'urlfetch': fetcher}
    rdbl_doc = readability.Document(orig, **options)
    summary = rdbl_doc.summary()

    path = make_readable_path(base_path, url_map, url)
    return write_file(path, summary.html)
Code example #11
def store_pretty(url):
    r = requests.get(url)
    html = r.text
    doc = rd.Document(html)
    article = doc.summary()
    soup = BeautifulSoup(article, 'html.parser')
    uni = soup.get_text(strip=True)
    cleaner = unidecode(uni)
    return cleaner
Code example #12
def get_content(html):
    # Get a (title, body) tuple from the HTML string
    document = readability.Document(html)
    content_html = document.summary()

    # Strip the HTML tags and keep only the body text
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    return short_title, content_text
Code example #13
def summarize_html(html_text: str) -> str:
    """
    Uses readability to summarize the HTML response into a summary
    """
    if html_text.strip() == "":
        raise URLCacheException("No html provided to summarize")
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    return summary
Code example #14
def get_content(html):
    document = readability.Document(html)
    content_html = document.summary()
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    content_clean = re.sub('[\t\r\n]', '', content_text)
    content_strip = " ".join(content_clean.split())
    #content_final=re.sub(r'\D{2}\s\d{4}.\d{2}.\d{2}\s.{5}','',content_strip)

    return content_strip
Code example #15
def fetch_url(url):
    """
    Fetch a URL and extract its title and plain text with readability.
    """
    html = urllib.request.urlopen(url).read()
    readable_html = readability.Document(html)
    readable_article = readable_html.summary()
    title = readable_html.short_title()
    text = BeautifulSoup(readable_article, "lxml").get_text()
    return title, text
Code example #16
def get_content(html: str) -> Tuple[str, str]:
    """
    Get a (title, body) tuple from the HTML string.
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and keep only the body text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()

    return short_title, content_text
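
A minimal usage sketch for this get_content variant (hypothetical URL; assumes requests is installed alongside the readability and lxml imports the function relies on):

import requests

resp = requests.get("https://example.com/article")  # hypothetical URL
title, body = get_content(resp.text)
print(title)
print(body[:200])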
Code example #17
def get_content(html):
    """
    Extract a (title, body) tuple from HTML.
    """
    document = readability.Document(html)
    content_html = document.summary()

    content_text = lxml.html.fromstring(content_html).text_content().strip()
    title = document.short_title()

    return title, content_text
Code example #18
File: utils.py Project: eungay/iitp18-multicampus
def get_content(html):
    """
    Find and return a (<title>, <body>) tuple from the HTML string.
    """
    document = readability.Document(html)
    content_html = document.summary()
    # Strip the HTML tags and extract only the text.
    content_text = lxml.html.fromstring(content_html).text_content().strip()
    short_title = document.short_title()
    
    return short_title, content_text
Code example #19
def parse_html_string(html_string):
    # Parse out title and body text
    document = readability.Document(html_string)

    # TODO(ajayjain): use document.short_title()?
    title = document.title()
    body_html = document.summary(html_partial=True)
    body_text = BeautifulSoup(body_html, 'lxml').get_text().strip()
    parsed = ParsedDocument(title=title, content=body_text)

    return parsed
Code example #20
def fprocess(entry):
    guid = entry.guid
    title = entry.title.split(" - ")[0]
    published = entry.published
    source = entry.source.title
    link = entry.link

    web_content = readability.Document(requests.get(link).text)
    summary = translate_html(web_content.summary())

    newsStory = NewsStory(guid, title, summary, published, source, link)
    return newsStory
Code example #21
    def get_content(self):
        """
        Get the title and body from the HTML string.
        """
        document = readability.Document(self.response.content)
        title = document.title()
        content_html = document.summary()
        content_text = lxml.html.fromstring(
            content_html).text_content().strip()

        self.title = title
        self.body = content_text
Code example #22
File: extract_articles.py Project: newsbias/crawler
def extract_article_text(article):
    resp = requests.get(article['url'])
    doc = readability.Document(resp.text)

    summary = doc.summary()
    body = lxml.html.document_fromstring(summary)

    return {
        'title': doc.title(),
        'clean_html': summary,
        'body_text': body.text_content()
    }
Code example #23
def execute_test(test_data):
    if test_data is None:
        return None
    else:
        base_path = os.path.join(TEST_DATA_PATH, test_data.test.name)
        fetcher = urlfetch.MockUrlFetch(base_path, test_data.test.url_map)
        doc = readability.Document(test_data.orig_html,
                                   url=test_data.test.url,
                                   urlfetch=fetcher)
        summary = doc.summary()
        diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
        return ReadabilityTestResult(test_data, summary.html, diff)
Code example #24
    def extract(self, url: str, html_text: str):
        doc = readability.Document(html_text)
        self.content = {
            'url': url,
            'text': re.sub('<[^<]+?>', '',
                           doc.summary()),  # need to remove any tags
            'title': doc.title(),
            'publish_date': None,
            'top_image_url': None,
            'authors': None,
            'extraction_method': METHOD_READABILITY,
        }
Code example #25
def get_clean_text(html):
    """
    Generate clean text for the given HTML.
    """
    doc = readability.Document(html)
    try:
        doc._html()
        clean = doc.get_clean_html()
    except Exception as e:
        print(e)
        clean = html
    bsObj = bs(clean)
    return bsObj.get_text()
Code example #26
    def get_filename(self, abs_url):
        request_text = get_page_with_retry(abs_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()

        # Clean the title and make it titlecase
        title = clean_string(title)
        title = titlecase.titlecase(title)
        title = title.replace(" ", "_")
        title = clean_string(title)
        name = title.strip("_") + ".pdf"
        name = unidecode.unidecode(name)
        logger.info("Created filename: %s" % name)
        return name
Code example #27
def retrieve_url(url):

    # set a "real" user agent
    firefox = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0"

    # retrieve the main text section from the url using the readability module and a Firefox user agent
    req = requests.get(url, headers={'User-Agent': firefox})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)

    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text().strip('\n').encode('utf-8')

    return str(rawhtml), str(cleantext)
Code example #28
def retrieve_url(url):

    # get a "real" user agent
    ua = fake_useragent.UserAgent()
    chrome = ua.chrome

    # retrieve the main text section from the url using the readability module and using the Chrome user agent
    req = requests.get(url, headers={'User-Agent': chrome})
    doc = readability.Document(req.text)
    rawhtml = doc.summary(html_partial=True)

    # remove any html tags from output
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cleantext = soup.get_text().encode('utf-8')

    return str(rawhtml), str(cleantext)
Code example #29
File: PageClassfier.py Project: hp027/gist
def readability_test(idxs, dist_path="pages"):
    lite_pages = []
    fat_pages = []
    for idx in idxs:
        c = file("%s/%s" % (dist_path, idx['md5'])).read()
        l = len(readability.Document(c).summary())
        if l < 200:
            lite_pages.append((l, idx['url']))
        elif l > 400:
            fat_pages.append((l, idx['url']))
            # print idx['url']
    for l in lite_pages:
        print l
    print "________________________________________________"
    for f in fat_pages:
        print f
Code example #30
def summarize_html(html_text: str) -> str:
    """
    Uses readability to summarize the HTML response into a summary,
    then lxml to remove unnecessary attributes on all elements
    """
    doc: readability.Document = readability.Document(html_text)
    summary: str = doc.summary()
    # remove class/id attributes
    tree = lxml.html.fromstring(summary)
    ctree = cleaner.clean_html(tree)
    # clean_html may replace the top-level <html>
    # element with a div; set it back to html if possible
    if ctree.tag == "div":
        ctree.tag = "html"
    html_bytes: bytes = lxml.html.tostring(ctree)
    # should html.unescape be called here? Or should that be handled
    # elsewhere/when parsing into text
    return html_bytes.decode("utf-8")