def test_si_sample_html_partial(self):
     """Using the si sample, make sure we can get the article alone."""
     sample = load_sample('si-game.sample.html')
     doc = Document('http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html',
                    sample)
     res = doc.get_clean_article()
     self.assertEqual('<div><div class="', res[0:17])
    def test_correct_cleanup(self):
        sample = """
        <html>
            <body>
                <section>test section</section>
                <article class="">
                    <p>Lot of text here.</p>
                    <div id="advertisement"><a href="link">Ad</a></div>
                    <p>More text is written here, and contains punctuation and dots.</p>
                </article>
                <aside id="comment1"/>
                <div id="comment2">
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                    <a href="asd">spam</a>
                </div>
                <div id="comment3"/>
                <aside id="comment4">A small comment.</aside>
                <div id="comment5"><p>The comment is also helpful, but it's
                    still not the correct item to be extracted.</p>
                    <p>It's even longer than the article itself!"</p></div>
            </body>
        </html>
        """
        doc = Document(sample)
        s = doc.summary()
        #print(s)
        assert('punctuation' in s)
        assert(not 'comment' in s)
        assert(not 'aside' in s)
 def test_si_sample_html_partial(self):
     """Using the si sample, make sure we can get the article alone."""
     sample = load_sample('si-game.sample.html')
     doc = Document(sample)
     doc.parse(["summary"], html_partial=True)
     res = doc.summary()
     self.assertEqual('<div><h1>Tigers-R', res[0:17])
Example #4
 def process_item(self, article, spider):
     
     doc = Document(article['text'])
     article['text'] = strip_tags(doc.summary())
     article['hash'] = hashlib.sha256(article['url']).hexdigest()
     
     return article
Example #5
def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
 def test_si_sample(self):
     """Using the si sample, load article with only opening body element"""
     sample = load_sample('si-game.sample.html')
     doc = Document(sample)
     doc.parse(["summary"])
     res = doc.summary()
     self.assertEqual('<html><body><h1>Tigers-Roya', res[0:27])
 def test_lxml_obj_result(self):
     """Feed Document with an lxml obj instead of an html string. Expect an lxml response"""
     utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
     sample = lxml.html.document_fromstring(load_sample('nyt-article-video.sample.html'), parser=utf8_parser)
     doc = Document(sample, url='http://nytimes.com/')
     res = doc.summary()
     self.assertFalse(isinstance(res, basestring))
Example #8
 def get(self):
     url = self.get_argument("url", None)
     # https://www.ifanr.com/1080409
     doc = Webcache.find_one({'url': url}, {'_id': 0})
     if doc:
         self.res = dict(doc)
         return self.write_json()
     try:
         sessions = requests.session()
         sessions.headers[
             'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
         response = sessions.get(url)
         # response.encoding = 'utf-8'  # TODO
         response.encoding = get_charset(response)
         doc = Document(response.text)
         title = doc.title()
         summary = doc.summary()
         markdown = html2text.html2text(summary)
         markdown = markdown.replace('-\n', '-')
         markdown = markdown.strip()
         res = {}
         res['url'] = url
         res['title'] = title
         res['markdown'] = markdown
         if title and markdown:
             webcache = Webcache
             webcache.new(res)
             self.res = res
         self.write_json()
     except Exception as e:
         print(e)
 def test_si_sample(self):
     """Using the si sample, load article with only opening body element"""
     sample = load_sample('si-game.sample.html')
     doc = Document(
         sample,
         url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
     res = doc.summary()
     self.assertEqual('<html><body><div><div class', res[0:27])
    def test_many_repeated_spaces(self):
        long_space = ' ' * 1000000
        sample = '<html><body><p>foo' + long_space + '</p></body></html>'

        doc = Document(sample)
        s = doc.summary()

        assert 'foo' in s
 def test_si_sample_html_partial(self):
     """Using the si sample, make sure we can get the article alone."""
     sample = load_sample("si-game.sample.html")
     doc = Document(
         sample, url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html"
     )
     res = doc.summary(enclose_with_html_tag=True)
     self.assertEqual('<div><div class="', res[0:17])
 def test_lazy_images(self):
     """
     Some sites use <img> elements with data-lazy-src attributes pointing to the actual image.
     """
     sample = load_sample('wired.sample.html')
     doc = Document('http://www.wired.com/design/2014/01/will-influential-ui-design-minority-report/', sample)
     article = doc.get_clean_article()
     self.assertIn('<img src="http://www.wired.com/images_blogs/design/2014/01/her-joaquin-phoenix-41-660x371.jpg"', article)
Example #13
 def get(self):
   urls = self.get_query_arguments('url')
   if urls and len(urls) == 1:
     url = urls[0]
     doc = Document(requests.get(url).text)
     self.write(smartypants(doc.summary()))
     self.write(STYLE)
   else:
     self.write("Please provide ?url=[your-url]")
 def test_best_elem_is_root_and_passing(self):
     sample = (
         '<html class="article" id="body">'
         '   <body>'
         '       <p>1234567890123456789012345</p>'
         '   </body>'
         '</html>'
     )
     doc = Document(sample)
     doc.summary()
Example #15
    def transform(self, row, chan):
        row['response'] = resolve_future(row['response'])

        doc = Document(row['response'].content)

        row['title'] = doc.title()
        summary = doc.summary()
        row['text'] = html2text(summary, bodywidth=160).replace('****', '').strip()

        yield row
Example #16
def extract_article(url, ip):
    """Extracts the article using readability"""
    title, summary = None, None
    response = get_url(url, ip)
    if response.status_code == 200:
        doc = Document(response.content)
        summary = unicode(doc.summary())
        title = unicode(doc.title())
        return title, summary
    else:
        return None
    def get_html_article(self, response):
        """
        First run readability to identify the main text, then strip tags and empty lines. Because the extracted
        text can still contain navigation content, further processing is needed: split the extracted content on
        newlines, check the length of each piece, and keep the items that belong to the article.
        """

        readable_article = Document(response).summary()
        readable_article = self.remove_html_tag(readable_article)
        readable_article = self.remove_empty_line(readable_article)

        article_split = readable_article.split('\n')

        # record the detected start and end positions of the article
        begin = 0
        end = 0

        begin_find = False
        end_find = False
        has_article = False

        for index in range(len(article_split)):

            # # if one segment is especially long, take only that segment
            # if len(article_split[index]) > 500:
            #     begin, end = index, index
            #     break

            if not begin_find:
                # an item longer than 40 characters is treated as the start of the article
                if len(article_split[index]) > IS_ARTICLE_SIZE:
                    begin = index
                    begin_find = True
                    has_article = True

            elif not end_find:
                if len(article_split[-index - 1]) == 0:
                    continue
                # \u3002 and \uff01 are the Chinese full stop and exclamation mark; Chinese sentences usually end with one of them
                elif article_split[-index - 1][-1] in u'\u3002\uff01':
                    if len(article_split[-index - 1]) > IS_ARTICLE_SIZE:
                        end = index
                        end_find = True
                        has_article = True

        empty_list=[]

        if not has_article:
            return empty_list
        elif begin == end:
            empty_list.append(article_split[begin])
            return empty_list
        else:
            return article_split[begin: len(article_split) - end]
Example #18
def view_html(url):
    """Converts an html document to a markdown'd string
    using my own fork of python-readability"""
    try:
        from readability import Document
    except ImportError:
        print("Can't convert document: python-readability is not installed")
        return
    
    html = urlopen(url).read()
    doc = Document(html)
    print(wrap(asciify(BOLD + doc.title() + RESET + "\n" + doc.markdown(), strip_newlines=False), 80, ''))
Example #19
 def parse_item(self, response):
     filename = hashlib.sha1(response.url.encode()).hexdigest()
     readability_document = Document(response.body, url=response.url)
     item = BeerReviewPage()
     item['url'] = response.url
     item['filename'] = filename
     item['depth'] = response.meta['depth']
     item['link_text'] = response.meta['link_text']
     item['title'] = readability_document.short_title()
     with open('data/' + filename + '.html','wb') as html_file:
         html_file.write(readability_document.content())
     print '(' + filename + ') ' + item['title'] + " : " + item['url']
     return item
Example #20
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive+'/*.html'):
        fname = os.path.basename(html)+'.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
            with open(savepath, 'w') as saving:
                json.dump(data, saving)
Example #21
    def preliminary_parse(self):
        if(not self.is_downloaded):
            raise Exception("not downloaded")
        try:
            d = Document(self.html)
            self._readability_title = d.short_title()
            self._readability_text = d.summary()
            logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
            logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
            if(self._readability_title and self._readability_text):
                return
        except Exception as e:
            logging.warning("error while doing readability parse: {0}".format(str(e)))

        logging.debug("falling back to newspaper parse")
        self.newspaper_article.parse()
        logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
        logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
Example #22
 def get(self):
     sharetype = self.get_argument("sharetype", "goodlink")
     link = self.get_argument("link", '')
     user_id = self.current_user["user_id"]
     assert link
     url = link
     doc = Webcache.find_one({'url': url}, {'_id': 0})
     if not doc:
         sessions = requests.session()
         sessions.headers[
             'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
         response = sessions.get(url)
         # response.encoding = 'utf-8'  # TODO
         response.encoding = get_charset(response)
         logger.info('response.encoding {}'.format(response.encoding))
         doc = Document(response.text)
         doc_title = doc.title()
         summary = doc.summary()
         _markdown = html2text.html2text(summary)
         _markdown = _markdown.replace('-\n', '-').strip()
         res_webcache = {}
         res_webcache['url'] = url
         res_webcache['title'] = doc_title
         res_webcache['markdown'] = _markdown
         if _markdown:
             webcache = Webcache
             webcache.new(res_webcache)
     else:
         logger.info('already')
         doc_title = doc.title
     res = {
         'title': doc_title,
         'sharetype': sharetype,
         'link': link,
     }
     share = Share
     res['user_id'] = user_id
     share = share.new(res)
     user = User.by_sid(user_id)
     user.user_leaf += 10
     user.save()
     self.redirect("/share/" + str(share.id))
Example #23
 def complement(self):
     for entry in self.entries:
         try:
             response = requests.get(entry.url, timeout=10)
         except requests.RequestException as excp:
             logger.warn('Exception requesting article %s: %s',
                         entry.url, excp.message)
             continue
         document = Document(response.content, url=response.url)
         # Image extraction first
         document._html()  # Trigger parsing
         images = document.html.xpath(
             '//meta[@property="og:image"]/@content')
         images += document.html.xpath(
             '//meta[@name="twitter:image:src"]/@content')
         # Content extraction second
         entry.url = response.url
         entry.image = (images or [''])[0]
         entry.title = document.short_title()
         entry.content = document.summary()
         yield entry
    def extract(self, item):
        """Creates an readability document and returns an ArticleCandidate containing article title and text.

        :param item: A NewscrawlerItem to parse.
        :return: ArticleCandidate containing the recovered article data.
        """

        doc = Document(deepcopy(item['spider_response'].body))
        description = doc.summary()

        article_candidate = ArticleCandidate()
        article_candidate.extractor = self._name
        article_candidate.title = doc.short_title()
        article_candidate.description = description
        article_candidate.text = self._text(item)
        article_candidate.topimage = self._topimage(item)
        article_candidate.author = self._author(item)
        article_candidate.publish_date = self._publish_date(item)
        article_candidate.language = self._language(item)

        return article_candidate
Example #25
def parse_web_page(text):
    """
    Generic web page parser with readability.
    Used as a fallback.

    :param text: unicode text
    :return: title, article
    :raise ParserException:
    """
    try:
        from readability import Document
        from readability.readability import Unparseable
    except ImportError:
        raise ParserException('readability is not installed')

    if not text:
        raise ParserException('No decoded text available, aborting!')
    try:
        doc = Document(text)
    except Unparseable as e:
        raise ParserException(e.message)
    else:
        return doc.short_title(), doc.summary(True)
Example #26
def extract_article_info(text):
    """
    Gets simplified page from the text
    Uses readability module
    """
    doc = Document(text)
    # safe fetch title
    title = doc.short_title()
    if not title:
        title = doc.title()
    # content
    content = doc.summary(html_partial=True)
    image = get_page_image(doc.content())
    # return
    return {'title': title, 'content': content, 'image': image}
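
A hypothetical call to extract_article_info(); the inline HTML is a placeholder and assumes get_page_image() from the same module is importable:

# illustrative only: both helpers come from the snippet above
html = "<html><head><title>Demo</title></head><body><article><p>Some body text.</p></article></body></html>"
info = extract_article_info(html)
print(info['title'], info['image'], len(info['content']))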
Example #27
def extract_article(html, title=None):
    """
    Wraps around readability.Document and returns the article's
    title and content.
    """
    doc = Document(html, negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS)
    doc_title = doc.short_title()
    # call the summary method to invoke readability's magic
    doc.summary(html_partial=True)
    # obtain the article as HtmlElement tree:
    html_tree = doc.html
    # clean up the article html:
    clean_html = cleanup(html_tree, doc_title)
    # check if the outer element is a tag from negative_keywords
    if elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS):
        bad_attr = True
    else:
        bad_attr = False
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # if so, redo extraction with min_text_length set to 0
        doc = Document(html,
                       negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                       min_text_length=0)
        doc_title = doc.short_title()
        # call the summary method to invoke readability's magic
        doc.summary(html_partial=True)
        # obtain the article as HtmlElement tree:
        html_tree = doc.html
        # clean up the article html:
        clean_html = cleanup(html_tree, doc_title)
    content = elem_content_to_string(clean_html)
    if title:
        # if the extracted title is not a subset of the given title, use
        # the given title (b/c we assume this is more accurate, but
        # maybe with some unnecessary boilerplate).
        if not doc_title in title or doc_title == '':
            doc_title = title
    return doc_title, content
def _getArticle(url,
                toSimplified=False,
                force_cache=False,
                noAutoConvert=False):
    content = getContent(url, force_cache=force_cache)
    soup = BeautifulSoup(_trimWebpage(content), 'html.parser')
    article_url = _findUrl(url, soup)
    doc = Document(content)
    title = _findTitle(soup, doc)
    to_simplify_calculated = calculateToSimplified(toSimplified, noAutoConvert,
                                                   title)
    article = _Article(
        title, _findAuthor(soup),
        readee.export(url,
                      content=content,
                      list_replace=True,
                      toSimplified=to_simplify_calculated), article_url)
    if to_simplify_calculated:
        article.title = cc.convert(article.title)
        article.author = cc.convert(article.author)
    return article
def apply_tool(tool, str_text, mode="", file_name=''):
    if tool == "BP3":
        list_paragraphs = get_paragraphs_BP3(str_text, mode)
    elif tool == "GOO":
        list_paragraphs = get_paragraphs_GOO(str_text, mode)
    elif tool == "HTML2TEXT":
        text_det = html2text.html2text(str_text)
        list_paragraphs = re.split("\n\n", text_det)
    elif tool == "INSCRIPTIS":
        text_det = inscriptis.get_text(str_text)
        list_paragraphs = re.split("\n", text_det)
    elif tool == "JT":
        list_paragraphs = get_paragraphs_JT(str_text, mode, file_name)
    elif tool == "NEWSPAPER":
        try:
            text_det = fulltext(str_text)
        except:
            text_det = ""
        list_paragraphs = re.split("\n\n", text_det)
    elif tool == "NEWSPLEASE":
        list_paragraphs = get_paragraphs_newsplease(str_text, mode)
    elif tool == "READABILITY":
        try:
            text_det = Document(str_text).summary(html_partial=True)
        except:
            text_det = ""
        list_paragraphs = re.split("\n", text_det)
    elif tool == "TRAF":
        list_paragraphs = get_paragraphs_traf(str_text, mode)
    elif tool == "TRAF_BL":
        list_paragraphs = get_paragraphs_traf_baseline(str_text, mode)
    elif tool == "READ_py":
        try:
            list_paragraphs = get_paragraphs_readabilipy(str_text, mode)
        except:
            print("Error readabilipy")
            list_paragraphs = [""]
    elif tool == "HTML-text":
        list_paragraphs = get_paragraphs_html_text(str_text, mode)
    return list_paragraphs
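
A hypothetical call that exercises only the READABILITY branch of apply_tool() above; it assumes readability's Document and the re module are already imported in that module:

html = "<html><body><p>First paragraph of the article.</p><p>Second paragraph with more text.</p></body></html>"
paragraphs = apply_tool("READABILITY", html)  # only Document and re are needed on this path
print(paragraphs)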
Example #30
def readability():
    import requests
    from readability import Document
    from bs4 import BeautifulSoup

    data = dict(default_data)
    data['message'] = "Article Extraction by Readability"
    data['params'] = {}
    data['error'] = ''
    data['readability'] = {}

    if request.method == 'GET':
        data['params']['url'] = request.args.get('url')
        if not data['params']['url']:
            data['error'] = '[url] parameter not found'
            return jsonify(data)

        response = requests.get(data['params']['url'])
        doc = Document(response.text)

    elif request.method == 'POST':
        params = request.form  # postdata

        if not params:
            data['error'] = 'Missing parameters'
            return jsonify(data)

        if not params['html']:
            data['error'] = 'html parameter not found'
            return jsonify(data)

        doc = Document(params['html'])

    data['readability']['title'] = doc.title()
    data['readability']['short_title'] = doc.short_title()
    #data['readability']['content'] = doc.content()
    data['readability']['article_html'] = doc.summary(html_partial=True)

    soup = BeautifulSoup(data['readability']['article_html'])
    data['readability']['text'] = soup.get_text()

    return jsonify(data)
def get_content(response):
    import chardet
    from readability import Document
    import html2text
    char_encoding = chardet.detect(response.content)  # bytes
    # print(char_encoding)
    if char_encoding["encoding"] == "utf-8" or char_encoding[
            "encoding"] == "utf8":
        doc = Document(response.content.decode("utf-8"))
    else:
        doc = Document(response.content.decode("gbk", "ignore"))
    title = doc.title()
    content = doc.summary()
    h = html2text.HTML2Text()
    h.ignore_links = True
    # h.ignore_images = True
    d_data = h.handle(content).replace("-\n", "-")
    return d_data.rstrip()
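
A minimal usage sketch for get_content(); the URL is a placeholder and requests, chardet, readability and html2text are assumed to be installed:

import requests

resp = requests.get("https://example.com/article")  # placeholder URL
print(get_content(resp))  # the article converted to plain text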
Example #32
    def run(self):
        global filename, time_out
        while  not self._queue.empty():
            url = self._queue.get().strip()
            url_list = [url]

            headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
            try:
                r = requests.get(url, verify=False, timeout=time_out, headers = headers)
            except:
                if 'https://' not in url:
                    try:
                        r = requests.get(url.replace('http://','https://'), verify=False, timeout=time_out, headers = headers)
                    except:
                        self.lock_file(filename.replace('.txt','.pass.txt'),url+'\n')
                        continue
                else:
                    continue
##            else:
##                if r.status_code//100 > 4:
##                    continue
            
            url_list = self.url_redirects(r,url_list)
##            if url_list:
            try:
                html = r.content
##                if not html:
##                    continue
                url_list.append(Document(html).title())                
                #url_list.append(Document(requests.get(requests.get(url, verify=False, headers = headers).url, verify=False, timeout=10, headers = headers).text).title())
                #url_list.append(BeautifulSoup(requests.get(url, verify=False, timeout=10, headers = headers).text.encode('iso-8859-1').decode('utf-8'), 'lxml').title.string)
            except:
                pass
                
            print(url_list)
            self.lock_file(filename.replace('.txt','.port_link.txt'),str(url_list)+' ['+str(r.status_code)+']\n')
            self.lock_file(filename.replace('.txt','.pass.txt'),url+'\n')
Example #33
    if not request.headers["content-type"][:9] in ["text/html", "text/plain"]:
        return False

    return True


def get_site_content(link):
    """Try and extract site content from url"""
    rv = ""

    try:
        r = requests.get(link, timeout=15.0)
    except requests.exceptions.RequestException, e:
        logger.warning("Failed loading URL '{}': {}".format(link, e))
    else:
        if valid_request(r):
            # extract the  (most likely) main content
            doc = Document(r.text, url=link)
            content = doc.summary(html_partial=True)
            rv = remove_html(content)
        else:
            logger.info("Invalid request {} for url '{}'".format(r, link))

    return rv


def repeated_func_schedule(time, func):
    spawn_later(0, func)
    spawn_later(time, repeated_func_schedule, time, func)
Example #34
    def cleanDocument(self, text, theUrl):
        replaceChars = [
            ("“", '"'),
            ("”", '"'),
            ("‘", "'"),
            ("’", "'"),
            ("`", "'"),
            ("`", "'"),
            ("′", "'"),
            ("—", "-"),
            ("–", "-"),
            ("…", "..."),
            ("•", "."),
            ("«", '"'),
            ("»", '"'),
            ("„", '"'),
            ("μ", "micro"),
            ("™", "(TM)"),
            ("≤", "<="),
            ("≥", ">="),
            ("∀", "ForAll"),
            ("⇒", "=>"),
            ("б", "(6)"),
            ("š", "s"),
            ("├", "|-"),
            ("─", "--"),
            ("|", "| "),
            ("│", "| "),
            ("└", "-"),
            ("→", "->"),
            ("⁄", "/"),
            ("⅓", "1/3"),
            ("📸", "(camera)"),
            ("✅", "(x)"),
            ("👽", "(alien)"),
            ("👍", "(ok)"),
            ("🙀", "(oh)"),
            ("🚀", "(despegar)"),
            ("\\n",""),
            ("\\t",""),
        ]

        from readability import Document

        doc = Document(text)
        doc_title = doc.title()

        if not doc_title or (doc_title == "[no-title]"):
            if theUrl.lower().endswith("pdf"):
                title = getPdfTitle(response)
                print(title)
                doc_title = "[PDF] " + title

        theTitle = doc_title

        # myText = doc.summary()
        myText = doc.content()

        for a, b in replaceChars:
            myText = myText.replace(a, b)
            theTitle = theTitle.replace(a, b)

        return (myText, theTitle)
Example #35
# smilar = 'https://itunes.apple.com/cn/app/app/id1335458066#see-all/customers-also-bought-apps'
# download = 'https://itunes.apple.com/lookup?id={1191692521,1335458066}&country=cn&entity=software'
# rasting = 'https://itunes.apple.com/cn/customer-reviews/id1335458066?displayable-kind=11'
from bs4 import BeautifulSoup
import requests
from readability import Document
from aip import AipNlp

# """ 你的 APPID AK SK """
# APP_ID = '15827943'
# API_KEY = 'eOkQjloKyEGX77h5EtIpKyNg'
# SECRET_KEY = 'v73VmZGG7tc7UnnS9I32IdlUh518Nh8Y'
#
# client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
# text = "How damaging is the Huawei row for the US and China?"
#
# """ 调用词法分析 """
# print(client.lexer(text))

# with open('C:/Users/mayn/Desktop/test.html','rb')as f:
#     html = f.read()
response = requests.get('https://www.bbc.com/news/technology')
doc = Document(response.text)
print(doc.title())
print(doc.summary())
Example #36
def get_main_html(html):
    doc = Document(html)
    return doc.summary()
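
A minimal sketch of calling get_main_html(), assuming readability-lxml is installed; the inline HTML is illustrative only:

html = ("<html><body><nav>site menu</nav>"
        "<article><p>This is the main story text that readability should keep.</p></article>"
        "</body></html>")
print(get_main_html(html))  # prints the extracted article markup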
Example #37
 def retrieve_important(self):
     article_content = Document(self.request_website.text)
     html_text = article_content.summary()
     self.text_content = (ScrapeWebsite.stripTags(html_text))
     self.text_content = ScrapeWebsite.normalizeData(self.text_content)
def get_summary(content):
    doc = Document(content)
    summary = doc.summary(html_partial=True)
    return summary
 def test_wrong_link_issue_49(self):
     """We shouldn't break on bad HTML."""
     sample = load_sample('the-hurricane-rubin-carter-denzel-washington.html')
     doc = Document(sample)
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="content__article-body ', res[0:39])
 def test_si_sample_html_partial(self):
     """Using the si sample, make sure we can get the article alone."""
     sample = load_sample('si-game.sample.html')
     doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="', res[0:17])
import requests
from readability import Document
from html2text import html2text
import argparse
from sys import stdout

parser = argparse.ArgumentParser(
    description="""
Turn a URL into markdown. That's it!
    """,
)
parser.add_argument(
    "url",
    help="The URL of the page",
    type=str
)

if __name__ == '__main__':
    args = parser.parse_args()
    response = requests.get(args.url)
    doc = Document(response.text)
    simplified_markdown = html2text(doc.summary())
    print(simplified_markdown, file=stdout)
Example #42
    def post(self):
        # TODO
        # print(self.request.arguments)
        share_id = self.get_argument("id", None)
        title = self.get_argument("title", '')
        markdown = self.get_argument("markdown", '')
        content = self.get_argument("content", '')
        sharetype = self.get_argument("sharetype", '')
        slug = self.get_argument("slug", '')
        tags = self.get_argument("tags", '')
        # upload_img = self.get_argument("uploadImg", '')
        post_img = self.get_argument("post_Img", '')
        link = self.get_argument("link", '')
        user_id = self.current_user["user_id"]
        vote_open = self.get_argument("vote_open", '')
        vote_title = self.get_argument("vote_title", '')
        img_url = self.get_argument("img_url", '')

        tags = tags.split()

        if link:
            url = link
            doc = Webcache.find_one({'url': url}, {'_id': 0})
            if doc:
                logger.info('already downloaded')
                doc_title = doc.title
                # markdown = doc.markdown
            else:
                sessions = requests.session()
                sessions.headers[
                    'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
                try:
                    # response = sessions.get(url)
                    response = sessions.get(url, timeout=4)
                # TODO: try to use a proxy
                except (requests.ConnectionError, requests.Timeout) as e:
                    print(e)
                    self.write("GFW...")
                    return
                # except requests.exceptions.HTTPError as e:
                #     if e.response.status_code == 400:
                #         error = e.response.json()
                #         code = error['code']
                #         message = error['message']

                except Exception as e:
                    logger.info('e: {}'.format(e))
                    # self.redirect("/")
                    self.write("GFW")
                    return
                # response.encoding = 'utf-8'  # TODO
                response.encoding = get_charset(response)
                logger.info('response.encoding {}'.format(response.encoding))
                doc = Document(response.text)
                doc_title = doc.title()
                summary = doc.summary()
                _markdown = html2text.html2text(summary)
                _markdown = _markdown.replace('-\n', '-').strip()
                res_webcache = {}
                res_webcache['url'] = url
                res_webcache['title'] = doc_title
                res_webcache['markdown'] = _markdown
                if _markdown:
                    webcache = Webcache
                    webcache.new(res_webcache)

        if vote_open.isdigit():
            vote_open = int(vote_open)
        else:
            vote_open = 0
        if not title:
            title = doc_title

        # handle the cover image link

        if img_url and not post_img:
            ext = img_url.split('?')[0].split('.')[-1]
            ext = '.' + ext.lower()
            print(ext)
            assert ext in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']
            img_dir = 'static/upload/img'
            now = datetime.datetime.now()
            t = now.strftime('%Y%m%d_%H%M%S_%f')
            img_name = '%s%s' % (t, ext)
            img_path = '%s/%s' % (img_dir, img_name)
            print(img_path)
            r = requests.get(img_url, verify=False,
                             stream=True)  # stream=True)
            chunk_size = 100
            with open(img_path, 'wb') as image:
                for chunk in r.iter_content(chunk_size):
                    image.write(chunk)

            im = Image.open(img_path)
            width, height = im.size
            if width / height > 5 or height / width > 5:
                os.remove(img_path)  # aspect ratio too extreme, delete the image
                print('Please do not upload images with an overly large aspect ratio')
            else:
                # create 1200x550, 750x230 and 365x230 thumbnails
                make_post_thumb(img_path,
                                sizes=[(1200, 550), (750, 230), (365, 230),
                                       (260, 160)])
                print('done')
                post_img = img_path.split('/')[-1]
                post_img = post_img.split('.')[0] + '_1200.jpg'

        res = {
            'title': title,
            'markdown': markdown,
            'content': content,
            'sharetype': sharetype,
            'slug': slug,
            'tags': tags,
            'post_img': post_img,
            'link': link,
            'vote_open': vote_open,
            'vote_title': vote_title,
            'updated': time.time(),
        }
        # if not markdown:
        #     self.redirect("/")
        #     return
        if share_id:
            share = Share.by_sid(share_id)
            if not share:
                self.redirect("/404")
            share.update(res)
            share.save()
        else:
            share = Share
            res['user_id'] = user_id
            share = share.new(res)
            user = User.by_sid(user_id)
            user.user_leaf += 10
            user.save()
        for i in tags:
            doc = {'name': i, 'share_ids': share.id}
            Tag.new(doc)
        self.redirect("/share/" + str(share.id))
Example #43
url = 'http://ipingshan.sznews.com/content/2018-12/08/content_21265915.htm'
url = 'http://www.sohu.com/a/280148326_675286'
url = 'http://news.sznews.com/content/2019-04/26/content_21699029.htm'
url = 'http://forthxu.com/blog/article/73.html'
url = 'http://bm.szhk.com/2019/04/30/283029943930124.html'

a = Article(url, language='zh')  # Chinese

a.download()
a.parse()

print(a.title)
print(a.text)

response = requests.get(url)
doc = Document(response.content)
title = doc.title()
html = doc.summary(True)

article = Article(url, language='zh')
article.download(input_html=html)
article.parse()

q.d()
print(article.title)
print(article.text)
exit(1)

response = requests.get(url)

doc = Document(response.content)
 def __init__(self, html):
     self._html = html
     self._title = ''
     self._doc = Document(html)
Example #45
import requests
import re
from readability import Document

response = requests.get(
    ' http://www.omannews.gov.om/ona_n/description.jsp?newsId=277437')
raw_html = Document(response.text).summary()

cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, '', raw_html)
stopterms = ['&#13;', '13#&', '&#13', '\n', '\xa0']
querywords = cleantext.split()
resultwords = [word for word in querywords if word.lower() not in stopterms]
clean = ' '.join(resultwords)

print(clean)
 def retrieve_important(self):
     article_content = Document(self.request_website.text)
     html_text = article_content.summary()
     self.text_content = (self.strip_tags(html_text))
     self.text_content = unicodedata.normalize("NFKD", self.text_content)
Example #47
    def save(self, *args, **kwargs):
        if self.description:
            document = Document(self.description)
            self.readable_description = document.summary(html_partial=True)

        return super(FeedItem, self).save(*args, **kwargs)
class TitleExtractor(object):
    def __init__(self, html):
        self._html = html
        self._title = ''
        self._doc = Document(html)

    def clean_title(self, title):
        spliters = [' - ', '–', '—', '-', '|', '::']
        for s in spliters:
            if s not in title:
                continue
            tts = title.split(s)
            if len(tts) < 2:
                continue
            title = tts[0]
            break
        return title

    def get_title_method1(self):
        self._title = self._doc.short_title()

    def get_title_method2(self):
        # handle irregular titles on particular sites
        if not self._title:
            regex = TITLE_RE
            self._title = get_info(self._html, regex, fetch_one=True)

    def get_title_method3(self):
        g = Goose()
        article = g.extract(raw_html=self._html)
        self._title = article.title

    def get_title_method4(self):
        doc = lxml.html.fromstring(self._html)
        title = ''
        title_el = doc.xpath('//title')
        if title_el:
            title = title_el[0].text_content().strip()
        if len(title) < 7:
            tt = doc.xpath('//meta[@name="title"]')
            if tt:
                title = tt[0].get('content', '')
        if len(title) < 7:
            tt = doc.xpath(
                '//*[contains(@id, "title") or contains(@class, "title")]')
            if not tt:
                tt = doc.xpath(
                    '//*[contains(@id, "font01") or contains(@class, "font01")]'
                )
            for t in tt:
                ti = t.text_content().strip()
                if ti in title and len(ti) * 2 > len(title):
                    title = ti
                    break
                if len(ti) > 20: continue
                if len(ti) > len(title) or len(ti) > 7:
                    title = ti
        self._title = title

    def get_title(self):
        self.get_title_method1()
        if not self._title:
            self.get_title_method2()
        if not self._title:
            self.get_title_method3()
        self._title = self.clean_title(self._title)
        return self._title
Example #49
    def parseDetail(self, response):
        '''
        Parse the detail page.
        '''
        meta = response.meta
        url = response.url
        seed = meta["seedInfo"]
        enableDownloadFile = False
        enableDownloadImage = False
        enableSnapshot = False
        if seed.enableDownloadFile == 1:
            enableDownloadFile = True
        if seed.enableDownloadImage == 1:
            enableDownloadImage = True
        if seed.enableSnapshot == 1:
            enableSnapshot = True
        detailData = {}
        html = "".join(response.xpath("//html").extract())
        doc = Document(html)  # use readability to process the document
        if "detailData" in meta:
            detailData = meta["detailData"]
        if len(detailData) <= 0:
            detailData["title"] = doc.title()  # 详情第一页时读入标题和url
            detailData["publishAt"] = TimeUtils.get_conent_time(html)
            detailData["url"] = url
        content_snap = doc.summary()
        # extract the main content
        content = ArticleUtils.removeTag4Content(content_snap)
        ArticleUtils.mergeDict(detailData, "content", content)
        if enableDownloadImage:
            images = ArticleUtils.get_content_image_urls(content_snap, url)
            if images is not None and len(images) > 0:
                ArticleUtils.mergeDict(detailData, "contentImages", images)
        if enableDownloadFile:
            files = ArticleUtils.getContentFiles(response)
            if files is not None and len(files) > 0:
                ArticleUtils.mergeDict(detailData, "contentFiles", files)
        if enableSnapshot:
            ArticleUtils.mergeDict(detailData, "contentSnapshot", content_snap)
        # crawl the next page
        nextpage_urls = ArticleUtils.getNextPageUrl('', response)
        if StringUtils.isNotEmpty(nextpage_urls):
            meta["detailData"] = detailData
            yield scrapy.Request(url=nextpage_urls,
                                 meta=meta,
                                 callback=self.parseDetail)
        else:
            item = ArticleUtils.meta2item(meta, detailData["url"])
            for (k, v) in detailData.items():
                itemValue = None
                if "category" == k and k in item:
                    itemValue = item[k] + "/" + v
                elif "contentImages" == k or "contentFiles" == k:
                    itemValue = json.dumps(list(v.values()),
                                           ensure_ascii=False)
                else:
                    itemValue = v
                item[k] = itemValue
            item['html'] = html

            yield item
Example #50
from boilerpipe.extract import Extractor
import q
import requests
from readability import Document

url = 'https://news.cnblogs.com/n/624615/'
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'
url = 'http://forthxu.com/blog/article/73.html'
url = 'http://forthxu.com/blog/article/91.html'
url = 'http://forthxu.com/blog/article/gmail-sub-account.html'

response = requests.get(url)

doc = Document(response.content)

print(doc.title())

s_html = doc.summary(True)

print("s_html:", s_html)

extractor = Extractor(extractor='ArticleExtractor', html=s_html)
# extractor = Extractor(extractor='ArticleExtractor', url=url)

extracted_text = extractor.getText()

print("extracted_text:", extracted_text)

# extracted_html = extractor.getHTML()

q.d()
 def test_too_many_images_sample_html_partial(self):
     """Using the too-many-images sample, make sure we still get the article."""
     sample = load_sample('too-many-images.sample.html')
     doc = Document(sample)
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="post-body', res[0:26])
Example #53
for i in list(data.keys()):

    if n < 1000:

        if data[i]['category'] == 'e': # only use entertainment articles

            # logging
            if n % 10 == 0 and n != 0:
                print("10 more datapoints processed. Total %i. Time: %.2f" %(n, time()-s))
                s = time()

            # get text from url
            try:
                r = requests.get(url=data[i]['url'])
                doc = Document(r.text)
                summary = doc.summary()
            except:
                print("Skipped datapoint %i" %(i))
                continue




            # process text
            soup = BeautifulSoup(summary, 'html.parser')
            text = soup.get_text()
            text = text.lower()
            text = re.sub('"', '', text)
            text = re.sub("'", '', text)
            text = re.sub(",", '', text)
 def test_wrong_link_issue_49(self):
     """We shouldn't break on bad HTML."""
     sample = load_sample('the-hurricane-rubin-carter-denzel-washington.html')
     doc = Document(sample)
     res = doc.summary(html_partial=True)
     self.assertEqual('<div><div class="content__article-body ', res[0:39])
Example #55
            # Justext
            paragraphs = justext.justext(response.content,
                                         justext.get_stoplist("Tagalog"))
            justext_content = ""
            for paragraph in paragraphs:
                if paragraph.class_type == 'good':
                    justext_content += paragraph.text + "\n"

            # Goose
            g = Goose()
            article = g.extract(raw_html=response.content)
            goose_content = article.cleaned_text

            # Readability
            doc = Document(response.text)
            readiblity_content = strip_tags(normalize_spaces(
                doc.summary())).strip()

            # Newspaper
            try:
                newspaper_content = fulltext(response.text, language='tl')
            except AttributeError:
                newspaper_content = ""

            # Similarity Checking
            j = similar(justext_content, orig_content)
            g = similar(goose_content, orig_content)
            r = similar(readiblity_content, orig_content)
            n = similar(newspaper_content, orig_content)
Example #56
            max-width: 650px;
            line-height: 1.4;
            padding: 0 10px;
        }
        h1, h2, h3 {
            line-height: 1.2;
        }
    </style>
</head>
"""

with codecs.open(os.environ['QUTE_HTML'], 'r', 'utf-8') as source:
    data = source.read()

    try:
        from breadability.readable import Article as reader
        doc = reader(data)
        title = doc._original_document.title
        content = HEADER % title + doc.readable + "</html>"
    except ImportError:
        from readability import Document
        doc = Document(data)
        title = doc.title()
        content = doc.summary().replace('<html>', HEADER % title)

    with codecs.open(tmpfile, 'w', 'utf-8') as target:
        target.write(content.lstrip())

    with open(os.environ['QUTE_FIFO'], 'w') as fifo:
        fifo.write('open -t %s' % tmpfile)
Example #57
    def parse_xiangxi(self,response):
        sel = Selector(response)
        # print(response.url)
        item = NewsItem()
        if not os.path.exists('图片'):
            os.mkdir('图片')
        try:
            title = sel.xpath("//div[@class='article-title']/h2/text()").extract()
            if len(title) == 0:
                raise Exception('title is none')
            else:
                item['title'] = title[0]
                # print(title)
        except:
            print('title cannot be empty')
        try:
            atime = sel.xpath("string(//div[@class='article-desc clearfix']/div/div[@class='article-source'])").extract()
            if len(atime) == 0:
                raise Exception('time is none')
            else:
                item['atime'] = atime[0][3:]
                # print(item['atime'])
        except:
            print('time cannot be empty')


        source = sel.xpath("string(//div[@class='article-desc clearfix']/div/div[@class='article-source'])").extract()
        if source == []:
            source = '无'
        else:
            source = sel.xpath("string(//div[@class='article-desc clearfix']/div/div[@class='article-source'])").extract()[0][:4]
        item['source'] = source
        html = urllib.request.urlopen(response.url).read()
        article = Document(html).summary()
        sec = Selector(text=article)
        art = ','.join(sec.css("div.article-content p::text").extract())
        # first keyword-extraction algorithm
        tfidf = analyse.extract_tags
        keywords = tfidf(art)
        # print(keywords)
        # second keyword-extraction algorithm
        # tfidf = analyse.default_textrank
        # keywords = tfidf(art)
        # for keyword in keywords:
        #     print(keyword)
        # content = sel.xpath("//div[@class='article-content']/p/text()").extract()
        # if content == []:
        #     content = ','.join(sel.xpath("string(//div[@class='article-content']/p)").extract())
        # else:
        #     content = ','.join(content)
        # item['content'] = content
        tupian_urls = sel.xpath("//div[@class='img-container']/img[@class='large']/@src").extract()
        if tupian_urls == []:
            tupian_url = '无'
        else:
            # for tupian_url in tupian_urls:
                # tupian_url = tupian_urls[0]
            tupian_url = ','.join(tupian_urls)
        item['tupian_url'] = tupian_url

        item['tupian_bendi'] = '图片/'+tupian_url[-6:]
Example #58
	def get_readable(self, response):
		doc = RDoc(response.text)
		return doc.summary()
Example #59
 def format_html(cls, row, media_path, content=None, custom_html=False):
     media_dir, file_path = os.path.split(media_path)
     resource_dir = os.path.join(settings.ARCHIVE_LOCATION, 'resources', str(row.id))
     resource_link = '/{}/{}/{}/{}'.format(row.usr.username, row.directory, str(row.id), 'resources')
     if not os.path.exists(resource_dir):
         os.makedirs(resource_dir)
     if not content:
         content = ""
         with open(media_path, encoding='utf-8', mode='r') as fd:
             content = fd.read()
     soup = BeautifulSoup(content, 'lxml')
     for script in soup.find_all('script'):
         script.decompose()
     url_path = row.url
     ourl = urlparse(url_path)
     ourld = ourl.scheme + '://' + ourl.netloc
     link_list = soup.find_all(['a', 'link', 'img'])
     for link in link_list:
         if link.name == 'img':
             lnk = link.get('src', '')
         else:
             lnk = link.get('href', '')
         if lnk and lnk != '#':
             if link.name == 'img' or (link.name == 'link' and '.css' in lnk):
                 lnk = dbxs.format_link(lnk, url_path)
                 lnk_bytes = bytes(lnk, 'utf-8')
                 h = hashlib.sha256(lnk_bytes)
                 lnk_hash = h.hexdigest()
                 if link.name == 'img':
                     link['src'] = resource_link + '/' + lnk_hash
                     if custom_html:
                         link['class'] = 'card-img-top'
                 else:
                     lnk_hash = lnk_hash + '.css'
                     link['href'] = resource_link + '/' + lnk_hash
                 file_image = os.path.join(resource_dir, lnk_hash)
                 if not os.path.exists(file_image):
                     cls.vnt_noblock.get(lnk, out=file_image)
                     logger.info('getting file: {}, out: {}'.format(lnk, file_image))
             elif lnk.startswith('http'):
                 pass
             else:
                 nlnk = dbxs.format_link(lnk, url_path)
                 if link.name == 'img':
                     link['src'] = nlnk
                     if custom_html:
                         link['class'] = 'card-img-top'
                 else:
                     link['href'] = nlnk
     if custom_html:
         ndata = soup.prettify()
         if soup.title:
             title = soup.title.text
         else:
             title = row.url.rsplit('/')[-1]
         data = Document(ndata)
         data_sum = data.summary()
         if data_sum:
             nsoup = BeautifulSoup(data_sum, 'lxml')
             if nsoup.text.strip():
                 data = cls.custom_template(title, nsoup.prettify(), row)
             else:
                 data = cls.custom_soup(ndata, title, row)
         else:
             data = cls.custom_soup(ndata, title, row)
     else:
         data = soup.prettify()
     return bytes(data, 'utf-8')
Example #60
def get_article_doc(link):
    response = requests.get(link)
    doc = Document(response.text)
    return doc
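
A hedged usage sketch for get_article_doc(); the URL is a placeholder:

doc = get_article_doc("https://example.com/article")  # placeholder URL
print(doc.short_title())
print(doc.summary(html_partial=True))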