def test_si_sample_html_partial(self):
    """Using the si sample, make sure we can get the article alone."""
    sample = load_sample('si-game.sample.html')
    doc = Document('http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html', sample)
    res = doc.get_clean_article()
    self.assertEqual('<div><div class="', res[0:17])
def test_correct_cleanup(self):
    sample = """
    <html>
        <body>
            <section>test section</section>
            <article class="">
                <p>Lot of text here.</p>
                <div id="advertisement"><a href="link">Ad</a></div>
                <p>More text is written here, and contains punctuation and dots.</p>
            </article>
            <aside id="comment1"/>
            <div id="comment2">
                <a href="asd">spam</a>
                <a href="asd">spam</a>
                <a href="asd">spam</a>
            </div>
            <div id="comment3"/>
            <aside id="comment4">A small comment.</aside>
            <div id="comment5"><p>The comment is also helpful, but it's still not the correct item to be extracted.</p>
            <p>It's even longer than the article itself!"</p></div>
        </body>
    </html>
    """
    doc = Document(sample)
    s = doc.summary()
    # print(s)
    assert('punctuation' in s)
    assert(not 'comment' in s)
    assert(not 'aside' in s)
def test_si_sample_html_partial(self):
    """Using the si sample, make sure we can get the article alone."""
    sample = load_sample('si-game.sample.html')
    doc = Document(sample)
    doc.parse(["summary"], html_partial=True)
    res = doc.summary()
    self.assertEqual('<div><h1>Tigers-R', res[0:17])
def process_item(self, article, spider):
    doc = Document(article['text'])
    article['text'] = strip_tags(doc.summary())
    article['hash'] = hashlib.sha256(article['url']).hexdigest()
    return article
def convert(link):
    """
    use burify's readability implementation to transcode a web page
    and return the transcoded page and images found in it
    """
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None

    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Burify cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
def test_si_sample(self):
    """Using the si sample, load article with only opening body element"""
    sample = load_sample('si-game.sample.html')
    doc = Document(sample)
    doc.parse(["summary"])
    res = doc.summary()
    self.assertEqual('<html><body><h1>Tigers-Roya', res[0:27])
def test_lxml_obj_result(self):
    """Feed Document with an lxml obj instead of an html string. Expect an lxml response"""
    utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
    sample = lxml.html.document_fromstring(
        load_sample('nyt-article-video.sample.html'), parser=utf8_parser)
    doc = Document(sample, url='http://nytimes.com/')
    res = doc.summary()
    self.assertFalse(isinstance(res, basestring))
def get(self):
    url = self.get_argument("url", None)
    # https://www.ifanr.com/1080409
    doc = Webcache.find_one({'url': url}, {'_id': 0})
    if doc:
        self.res = dict(doc)
        return self.write_json()
    try:
        sessions = requests.session()
        sessions.headers[
            'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
        response = sessions.get(url)
        # response.encoding = 'utf-8'  # TODO
        response.encoding = get_charset(response)
        doc = Document(response.text)
        title = doc.title()
        summary = doc.summary()
        markdown = html2text.html2text(summary)
        markdown = markdown.replace('-\n', '-')
        markdown = markdown.strip()
        res = {}
        res['url'] = url
        res['title'] = title
        res['markdown'] = markdown
        if title and markdown:
            webcache = Webcache
            webcache.new(res)
        self.res = res
        self.write_json()
    except Exception as e:
        print(e)
def test_si_sample(self):
    """Using the si sample, load article with only opening body element"""
    sample = load_sample('si-game.sample.html')
    doc = Document(
        sample,
        url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.summary()
    self.assertEqual('<html><body><div><div class', res[0:27])
def test_many_repeated_spaces(self):
    long_space = ' ' * 1000000
    sample = '<html><body><p>foo' + long_space + '</p></body></html>'
    doc = Document(sample)
    s = doc.summary()
    assert 'foo' in s
def test_si_sample_html_partial(self):
    """Using the si sample, make sure we can get the article alone."""
    sample = load_sample("si-game.sample.html")
    doc = Document(
        sample,
        url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html"
    )
    res = doc.summary(enclose_with_html_tag=True)
    self.assertEqual('<div><div class="', res[0:17])
def test_lazy_images(self):
    """
    Some sites use <img> elements with data-lazy-src elements pointing
    to the actual image.
    """
    sample = load_sample('wired.sample.html')
    doc = Document('http://www.wired.com/design/2014/01/will-influential-ui-design-minority-report/', sample)
    article = doc.get_clean_article()
    self.assertIn('<img src="http://www.wired.com/images_blogs/design/2014/01/her-joaquin-phoenix-41-660x371.jpg"', article)
def get(self):
    urls = self.get_query_arguments('url')
    if urls and len(urls) == 1:
        url = urls[0]
        doc = Document(requests.get(url).text)
        self.write(smartypants(doc.summary()))
        self.write(STYLE)
    else:
        self.write("Please provide ?url=[your-url]")
def test_best_elem_is_root_and_passing(self):
    sample = (
        '<html class="article" id="body">'
        ' <body>'
        ' <p>1234567890123456789012345</p>'
        ' </body>'
        '</html>'
    )
    doc = Document(sample)
    doc.summary()
def transform(self, row, chan):
    row['response'] = resolve_future(row['response'])
    doc = Document(row['response'].content)
    row['title'] = doc.title()
    summary = doc.summary()
    row['text'] = html2text(summary, bodywidth=160).replace('****', '').strip()
    yield row
def extract_article(url, ip):
    """Extracts the article using readability"""
    title, summary = None, None
    response = get_url(url, ip)
    if response.status_code == 200:
        doc = Document(response.content)
        summary = unicode(doc.summary())
        title = unicode(doc.title())
        return title, summary
    else:
        return None
def get_html_article(self, response):
    """
    First use readability to extract the main text, then strip tags and blank lines.
    Because the extracted text can still contain navigation content, it needs further
    processing: split the result on newlines, check the length of each piece, and keep
    only the pieces that belong to the article.
    """
    readable_article = Document(response).summary()
    readable_article = self.remove_html_tag(readable_article)
    readable_article = self.remove_empty_line(readable_article)
    article_split = readable_article.split('\n')
    # Record where the recognized article starts and ends
    begin = 0
    end = 0
    begin_find = False
    end_find = False
    has_article = False
    for index in range(len(article_split)):
        # # When one segment is especially long, take only that segment
        # if len(article_split[index]) > 500:
        #     begin, end = index, index
        #     break
        if not begin_find:
            # An item longer than the threshold (IS_ARTICLE_SIZE) is treated as the start of the article
            if len(article_split[index]) > IS_ARTICLE_SIZE:
                begin = index
                begin_find = True
                has_article = True
        elif not end_find:
            if len(article_split[-index - 1]) == 0:
                continue
            # \u3002 and \uff01 are the Chinese full stop and exclamation mark;
            # Chinese sentences usually end with one of them
            elif article_split[-index - 1][-1] in u'\u3002\uff01':
                if len(article_split[-index - 1]) > IS_ARTICLE_SIZE:
                    end = index
                    end_find = True
                    has_article = True
    empty_list = []
    if not has_article:
        return empty_list
    elif begin == end:
        empty_list.append(article_split[begin])
        return empty_list
    else:
        return article_split[begin: len(article_split) - end]
def view_html(url):
    """Converts an html document to a markdown'd string
    using my own fork of python-readability"""
    try:
        from readability import Document
    except ImportError:
        print("Can't convert document: python-readability is not installed")
        return
    html = urlopen(url).read()
    doc = Document(html)
    print(wrap(asciify(BOLD + doc.title() + RESET + "\n" + doc.markdown(),
                       strip_newlines=False), 80, ''))
def parse_item(self, response):
    filename = hashlib.sha1(response.url.encode()).hexdigest()
    readability_document = Document(response.body, url=response.url)
    item = BeerReviewPage()
    item['url'] = response.url
    item['filename'] = filename
    item['depth'] = response.meta['depth']
    item['link_text'] = response.meta['link_text']
    item['title'] = readability_document.short_title()
    with open('data/' + filename + '.html', 'wb') as html_file:
        html_file.write(readability_document.content())
    print '(' + filename + ') ' + item['title'] + " : " + item['url']
    return item
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive + '/*.html'):
        fname = os.path.basename(html) + '.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
        with open(savepath, 'w') as saving:
            json.dump(data, saving)
def preliminary_parse(self):
    if not self.is_downloaded:
        raise Exception("not downloaded")
    try:
        d = Document(self.html)
        self._readability_title = d.short_title()
        self._readability_text = d.summary()
        logging.debug(u"readability title: {0}".format(repr(self._readability_title)))
        logging.debug(u"readability text: {0}".format(repr(self._readability_text)))
        if self._readability_title and self._readability_text:
            return
    except Exception as e:
        logging.warning("error while doing readability parse: {0}".format(str(e)))
    logging.debug("falling back to newspaper parse")
    self.newspaper_article.parse()
    logging.debug(u"newspaper title: {0}".format(repr(self._newspaper_title)))
    logging.debug(u"newspaper text: {0}".format(repr(self._newspaper_text)))
def get(self):
    sharetype = self.get_argument("sharetype", "goodlink")
    link = self.get_argument("link", '')
    user_id = self.current_user["user_id"]
    assert link
    url = link
    doc = Webcache.find_one({'url': url}, {'_id': 0})
    if not doc:
        sessions = requests.session()
        sessions.headers[
            'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
        response = sessions.get(url)
        # response.encoding = 'utf-8'  # TODO
        response.encoding = get_charset(response)
        logger.info('response.encoding {}'.format(response.encoding))
        doc = Document(response.text)
        doc_title = doc.title()
        summary = doc.summary()
        _markdown = html2text.html2text(summary)
        _markdown = _markdown.replace('-\n', '-').strip()
        res_webcache = {}
        res_webcache['url'] = url
        res_webcache['title'] = doc_title
        res_webcache['markdown'] = _markdown
        if _markdown:
            webcache = Webcache
            webcache.new(res_webcache)
    else:
        logger.info('already')
        doc_title = doc.title
    res = {
        'title': doc_title,
        'sharetype': sharetype,
        'link': link,
    }
    share = Share
    res['user_id'] = user_id
    share = share.new(res)
    user = User.by_sid(user_id)
    user.user_leaf += 10
    user.save()
    self.redirect("/share/" + str(share.id))
def complement(self):
    for entry in self.entries:
        try:
            response = requests.get(entry.url, timeout=10)
        except requests.RequestException as excp:
            logger.warn('Exception requesting article %s: %s',
                        entry.url, excp.message)
            continue
        document = Document(response.content, url=response.url)
        # Image extraction first
        document._html()  # Trigger parsing
        images = document.html.xpath(
            '//meta[@property="og:image"]/@content')
        images += document.html.xpath(
            '//meta[@name="twitter:image:src"]/@content')
        # Content extraction second
        entry.url = response.url
        entry.image = (images or [''])[0]
        entry.title = document.short_title()
        entry.content = document.summary()
        yield entry
def extract(self, item):
    """Creates a readability document and returns an ArticleCandidate containing article title and text.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    doc = Document(deepcopy(item['spider_response'].body))
    description = doc.summary()

    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name
    article_candidate.title = doc.short_title()
    article_candidate.description = description
    article_candidate.text = self._text(item)
    article_candidate.topimage = self._topimage(item)
    article_candidate.author = self._author(item)
    article_candidate.publish_date = self._publish_date(item)
    article_candidate.language = self._language(item)

    return article_candidate
def parse_web_page(text):
    """
    Generic web page parser with readability.
    Used as a fallback.

    :param text: unicode text
    :return: title, article
    :raise ParserException:
    """
    try:
        from readability import Document
        from readability.readability import Unparseable
    except ImportError:
        raise ParserException('readability is not installed')

    if not text:
        raise ParserException('No decoded text available, aborting!')

    try:
        doc = Document(text)
    except Unparseable as e:
        raise ParserException(e.message)
    else:
        return doc.short_title(), doc.summary(True)
def extract_article_info(text):
    """
    Gets simplified page from the text
    Uses readability module
    """
    doc = Document(text)
    # safe fetch title
    title = doc.short_title()
    if not title:
        title = doc.title()
    # content
    content = doc.summary(html_partial=True)
    image = get_page_image(doc.content())
    # return
    return {'title': title, 'content': content, 'image': image}
def extract_article(html, title=None):
    """
    Wraps around readability.Document and returns the article's title and content.
    """
    doc = Document(html, negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS)
    doc_title = doc.short_title()
    # invoke the summary method to invoke readability's magic
    doc.summary(html_partial=True)
    # obtain the article as HtmlElement tree:
    html_tree = doc.html
    # clean up the article html:
    clean_html = cleanup(html_tree, doc_title)
    # check if the outer element is a tag from negative_keywords
    if elem_attr_contain(clean_html, settings.ARTEX_NEGATIVE_KEYWORDS):
        bad_attr = True
    else:
        bad_attr = False
    if clean_html.tag in settings.ARTEX_NEGATIVE_KEYWORDS or bad_attr:
        # if so, redo extraction with min_text_length set to 0
        doc = Document(html,
                       negative_keywords=settings.ARTEX_NEGATIVE_KEYWORDS,
                       min_text_length=0)
        doc_title = doc.short_title()
        # invoke the summary method to invoke readability's magic
        doc.summary(html_partial=True)
        # obtain the article as HtmlElement tree:
        html_tree = doc.html
        # clean up the article html:
        clean_html = cleanup(html_tree, doc_title)
    content = elem_content_to_string(clean_html)
    if title:
        # if the extracted title is not a subset of given title, use
        # the given title (b/c we assume this is more accurate, but
        # maybe with some unnecessary boilerplate).
        if doc_title not in title or doc_title == '':
            doc_title = title
    return doc_title, content
def _getArticle(url, toSimplified=False, force_cache=False, noAutoConvert=False):
    content = getContent(url, force_cache=force_cache)
    soup = BeautifulSoup(_trimWebpage(content), 'html.parser')
    article_url = _findUrl(url, soup)
    doc = Document(content)
    title = _findTitle(soup, doc)
    to_simplify_calculated = calculateToSimplified(toSimplified, noAutoConvert, title)
    article = _Article(
        title,
        _findAuthor(soup),
        readee.export(url, content=content, list_replace=True,
                      toSimplified=to_simplify_calculated),
        article_url)
    if to_simplify_calculated:
        article.title = cc.convert(article.title)
        article.author = cc.convert(article.author)
    return article
def apply_tool(tool, str_text, mode="", file_name=''):
    if tool == "BP3":
        list_paragraphs = get_paragraphs_BP3(str_text, mode)
    elif tool == "GOO":
        list_paragraphs = get_paragraphs_GOO(str_text, mode)
    elif tool == "HTML2TEXT":
        text_det = html2text.html2text(str_text)
        list_paragraphs = re.split("\n\n", text_det)
    elif tool == "INSCRIPTIS":
        text_det = inscriptis.get_text(str_text)
        list_paragraphs = re.split("\n", text_det)
    elif tool == "JT":
        list_paragraphs = get_paragraphs_JT(str_text, mode, file_name)
    elif tool == "NEWSPAPER":
        try:
            text_det = fulltext(str_text)
        except:
            text_det = ""
        list_paragraphs = re.split("\n\n", text_det)
    elif tool == "NEWSPLEASE":
        list_paragraphs = get_paragraphs_newsplease(str_text, mode)
    elif tool == "READABILITY":
        try:
            text_det = Document(str_text).summary(html_partial=True)
        except:
            text_det = ""
        list_paragraphs = re.split("\n", text_det)
    elif tool == "TRAF":
        list_paragraphs = get_paragraphs_traf(str_text, mode)
    elif tool == "TRAF_BL":
        list_paragraphs = get_paragraphs_traf_baseline(str_text, mode)
    elif tool == "READ_py":
        try:
            list_paragraphs = get_paragraphs_readabilipy(str_text, mode)
        except:
            print("Error readabilipy")
            list_paragraphs = [""]
    elif tool == "HTML-text":
        list_paragraphs = get_paragraphs_html_text(str_text, mode)
    return list_paragraphs
def readability():
    import requests
    from readability import Document
    from bs4 import BeautifulSoup

    data = dict(default_data)
    data['message'] = "Article Extraction by Readability"
    data['params'] = {}
    data['error'] = ''
    data['readability'] = {}

    if request.method == 'GET':
        data['params']['url'] = request.args.get('url')
        if not data['params']['url']:
            data['error'] = '[url] parameter not found'
            return jsonify(data)
        response = requests.get(data['params']['url'])
        doc = Document(response.text)
    elif request.method == 'POST':
        params = request.form  # postdata
        if not params:
            data['error'] = 'Missing parameters'
            return jsonify(data)
        if not params['html']:
            data['error'] = 'html parameter not found'
            return jsonify(data)
        doc = Document(params['html'])

    data['readability']['title'] = doc.title()
    data['readability']['short_title'] = doc.short_title()
    # data['readability']['content'] = doc.content()
    data['readability']['article_html'] = doc.summary(html_partial=True)
    soup = BeautifulSoup(data['readability']['article_html'])
    data['readability']['text'] = soup.get_text()
    return jsonify(data)
def get_content(response):
    import chardet
    from readability import Document
    import html2text

    char_encoding = chardet.detect(response.content)  # bytes
    # print(char_encoding)
    if char_encoding["encoding"] == "utf-8" or char_encoding[
            "encoding"] == "utf8":
        doc = Document(response.content.decode("utf-8"))
    else:
        doc = Document(response.content.decode("gbk", "ignore"))
    title = doc.title()
    content = doc.summary()
    h = html2text.HTML2Text()
    h.ignore_links = True
    # h.ignore_images = True
    d_data = h.handle(content).replace("-\n", "-")
    return d_data.rstrip()
def run(self):
    global filename, time_out
    while not self._queue.empty():
        url = self._queue.get().strip()
        url_list = [url]
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
        try:
            r = requests.get(url, verify=False, timeout=time_out, headers=headers)
        except:
            if 'https://' not in url:
                try:
                    r = requests.get(url.replace('http://', 'https://'), verify=False, timeout=time_out, headers=headers)
                except:
                    self.lock_file(filename.replace('.txt', '.pass.txt'), url + '\n')
                    continue
            else:
                continue
##        else:
##            if r.status_code//100 > 4:
##                continue
        url_list = self.url_redirects(r, url_list)
##        if url_list:
        try:
            html = r.content
##            if not html:
##                continue
            url_list.append(Document(html).title())
            # url_list.append(Document(requests.get(requests.get(url, verify=False, headers=headers).url, verify=False, timeout=10, headers=headers).text).title())
            # url_list.append(BeautifulSoup(requests.get(url, verify=False, timeout=10, headers=headers).text.encode('iso-8859-1').decode('utf-8'), 'lxml').title.string)
        except:
            pass
        print(url_list)
        self.lock_file(filename.replace('.txt', '.port_link.txt'), str(url_list) + ' [' + str(r.status_code) + ']\n')
        self.lock_file(filename.replace('.txt', '.pass.txt'), url + '\n')
    if not request.headers["content-type"][:9] in ["text/html", "text/plain"]:
        return False
    return True


def get_site_content(link):
    """Try and extract site content from url"""
    rv = ""
    try:
        r = requests.get(link, timeout=15.0)
    except requests.exceptions.RequestException, e:
        logger.warning("Failed loading URL '{}': {}".format(link, e))
    else:
        if valid_request(r):
            # extract the (most likely) main content
            doc = Document(r.text, url=link)
            content = doc.summary(html_partial=True)
            rv = remove_html(content)
        else:
            logger.info("Invalid request {} for url '{}'".format(r, link))
    return rv


def repeated_func_schedule(time, func):
    spawn_later(0, func)
    spawn_later(time, repeated_func_schedule, time, func)
def cleanDocument(self, text, theUrl):
    replaceChars = [
        ("“", '"'), ("”", '"'),
        ("‘", "'"), ("’", "'"), ("`", "'"), ("`", "'"), ("′", "'"),
        ("—", "-"), ("–", "-"),
        ("…", "..."), ("•", "."),
        ("«", '"'), ("»", '"'), ("„", '"'),
        ("μ", "micro"), ("™", "(TM)"),
        ("≤", "<="), ("≥", ">="),
        ("∀", "ForAll"), ("⇒", "=>"),
        ("б", "(6)"), ("š", "s"),
        ("├", "|-"), ("─", "--"),
        ("|", "| "), ("│", "| "), ("└", "-"),
        ("→", "->"), ("⁄", "/"), ("⅓", "1/3"),
        ("📸", "(camera)"), ("✅", "(x)"), ("👽", "(alien)"),
        ("👍", "(ok)"), ("🙀", "(oh)"), ("🚀", "(despegar)"),
        ("\\n", ""), ("\\t", ""),
    ]
    from readability import Document
    doc = Document(text)
    doc_title = doc.title()
    if not doc_title or (doc_title == "[no-title]"):
        if theUrl.lower().endswith("pdf"):
            title = getPdfTitle(response)
            print(title)
            doc_title = "[PDF] " + title
    theTitle = doc_title
    # myText = doc.summary()
    myText = doc.content()
    for a, b in replaceChars:
        myText = myText.replace(a, b)
        theTitle = theTitle.replace(a, b)
    return (myText, theTitle)
# similar = 'https://itunes.apple.com/cn/app/app/id1335458066#see-all/customers-also-bought-apps'
# download = 'https://itunes.apple.com/lookup?id={1191692521,1335458066}&country=cn&entity=software'
# rating = 'https://itunes.apple.com/cn/customer-reviews/id1335458066?displayable-kind=11'

from bs4 import BeautifulSoup
import requests
from readability import Document
from aip import AipNlp

# """ Your APP_ID / API_KEY / SECRET_KEY """
# APP_ID = '15827943'
# API_KEY = 'eOkQjloKyEGX77h5EtIpKyNg'
# SECRET_KEY = 'v73VmZGG7tc7UnnS9I32IdlUh518Nh8Y'
#
# client = AipNlp(APP_ID, API_KEY, SECRET_KEY)
# text = "How damaging is the Huawei row for the US and China?"
#
# """ Call lexical analysis """
# print(client.lexer(text))

# with open('C:/Users/mayn/Desktop/test.html', 'rb') as f:
#     html = f.read()

response = requests.get('https://www.bbc.com/news/technology')
doc = Document(response.text)
print(doc.title())
print(doc.summary())
def get_main_html(html):
    doc = Document(html)
    return doc.summary()
def retrieve_important(self):
    article_content = Document(self.request_website.text)
    html_text = article_content.summary()
    self.text_content = ScrapeWebsite.stripTags(html_text)
    self.text_content = ScrapeWebsite.normalizeData(self.text_content)
def get_summary(content):
    doc = Document(content)
    summary = doc.summary(html_partial=True)
    return summary
def test_wrong_link_issue_49(self):
    """We shouldn't break on bad HTML."""
    sample = load_sample('the-hurricane-rubin-carter-denzel-washington.html')
    doc = Document(sample)
    res = doc.summary(html_partial=True)
    self.assertEqual('<div><div class="content__article-body ', res[0:39])
def test_si_sample_html_partial(self):
    """Using the si sample, make sure we can get the article alone."""
    sample = load_sample('si-game.sample.html')
    doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.summary(html_partial=True)
    self.assertEqual('<div><div class="', res[0:17])
import requests
from readability import Document
from html2text import html2text
import argparse
from sys import stdout

parser = argparse.ArgumentParser(
    description="""
    Turn a URL into markdown. That's it!
    """,
)
parser.add_argument(
    "url",
    help="The URL of the page",
    type=str
)

if __name__ == '__main__':
    args = parser.parse_args()
    response = requests.get(args.url)
    doc = Document(response.text)
    simplified_markdown = html2text(doc.summary())
    print(simplified_markdown, file=stdout)
def post(self):
    # TODO
    # print(self.request.arguments)
    share_id = self.get_argument("id", None)
    title = self.get_argument("title", '')
    markdown = self.get_argument("markdown", '')
    content = self.get_argument("content", '')
    sharetype = self.get_argument("sharetype", '')
    slug = self.get_argument("slug", '')
    tags = self.get_argument("tags", '')
    # upload_img = self.get_argument("uploadImg", '')
    post_img = self.get_argument("post_Img", '')
    link = self.get_argument("link", '')
    user_id = self.current_user["user_id"]
    vote_open = self.get_argument("vote_open", '')
    vote_title = self.get_argument("vote_title", '')
    img_url = self.get_argument("img_url", '')
    tags = tags.split()

    if link:
        url = link
        doc = Webcache.find_one({'url': url}, {'_id': 0})
        if doc:
            logger.info('already downloaded')
            doc_title = doc.title
            # markdown = doc.markdown
        else:
            sessions = requests.session()
            sessions.headers[
                'User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
            try:
                # response = sessions.get(url)
                response = sessions.get(url, timeout=4)
                # TODO: try to use a proxy
            except (requests.ConnectionError, requests.Timeout) as e:
                print(e)
                self.write("GFW...")
                return
            # except requests.exceptions.HTTPError as e:
            #     if e.response.status_code == 400:
            #         error = e.response.json()
            #         code = error['code']
            #         message = error['message']
            except Exception as e:
                logger.info('e: {}'.format(e))
                # self.redirect("/")
                self.write("GFW")
                return
            # response.encoding = 'utf-8'  # TODO
            response.encoding = get_charset(response)
            logger.info('response.encoding {}'.format(response.encoding))
            doc = Document(response.text)
            doc_title = doc.title()
            summary = doc.summary()
            _markdown = html2text.html2text(summary)
            _markdown = _markdown.replace('-\n', '-').strip()
            res_webcache = {}
            res_webcache['url'] = url
            res_webcache['title'] = doc_title
            res_webcache['markdown'] = _markdown
            if _markdown:
                webcache = Webcache
                webcache.new(res_webcache)

    if vote_open.isdigit():
        vote_open = int(vote_open)
    else:
        vote_open = 0

    if not title:
        title = doc_title

    # Handle the cover image link
    if img_url and not post_img:
        ext = img_url.split('?')[0].split('.')[-1]
        ext = '.' + ext.lower()
        print(ext)
        assert ext in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']
        img_dir = 'static/upload/img'
        now = datetime.datetime.now()
        t = now.strftime('%Y%m%d_%H%M%S_%f')
        img_name = '%s%s' % (t, ext)
        img_path = '%s/%s' % (img_dir, img_name)
        print(img_path)
        r = requests.get(img_url, verify=False, stream=True)  # stream=True)
        chunk_size = 100
        with open(img_path, 'wb') as image:
            for chunk in r.iter_content(chunk_size):
                image.write(chunk)
        im = Image.open(img_path)
        width, height = im.size
        if width / height > 5 or height / width > 5:
            # Check the aspect ratio; delete the image if it is too extreme
            os.remove(img_path)
            print('请不要上传长宽比例过大的图片')  # "please do not upload images with an extreme aspect ratio"
        else:
            # Create 1200x550, 750x230, 365x230 thumbnails
            make_post_thumb(img_path,
                            sizes=[(1200, 550), (750, 230), (365, 230), (260, 160)])
            print('done')
            post_img = img_path.split('/')[-1]
            post_img = post_img.split('.')[0] + '_1200.jpg'

    res = {
        'title': title,
        'markdown': markdown,
        'content': content,
        'sharetype': sharetype,
        'slug': slug,
        'tags': tags,
        'post_img': post_img,
        'link': link,
        'vote_open': vote_open,
        'vote_title': vote_title,
        'updated': time.time(),
    }
    # if not markdown:
    #     self.redirect("/")
    #     return

    if share_id:
        share = Share.by_sid(share_id)
        if not share:
            self.redirect("/404")
        share.update(res)
        share.save()
    else:
        share = Share
        res['user_id'] = user_id
        share = share.new(res)
        user = User.by_sid(user_id)
        user.user_leaf += 10
        user.save()
    for i in tags:
        doc = {'name': i, 'share_ids': share.id}
        Tag.new(doc)
    self.redirect("/share/" + str(share.id))
url = 'http://ipingshan.sznews.com/content/2018-12/08/content_21265915.htm'
url = 'http://www.sohu.com/a/280148326_675286'
url = 'http://news.sznews.com/content/2019-04/26/content_21699029.htm'
url = 'http://forthxu.com/blog/article/73.html'
url = 'http://bm.szhk.com/2019/04/30/283029943930124.html'

a = Article(url, language='zh')  # Chinese
a.download()
a.parse()
print(a.title)
print(a.text)

response = requests.get(url)
doc = Document(response.content)
title = doc.title()
html = doc.summary(True)

article = Article(url, language='zh')
article.download(input_html=html)
article.parse()

q.d()
print(article.title)
print(article.text)
exit(1)

response = requests.get(url)
doc = Document(response.content)
def __init__(self, html):
    self._html = html
    self._title = ''
    self._doc = Document(html)
import requests
import re
from readability import Document

response = requests.get(
    ' http://www.omannews.gov.om/ona_n/description.jsp?newsId=277437')
raw_html = Document(response.text).summary()
cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, '', raw_html)
stopterms = [' ', '13#&', '\r', '\n', '\xa0']
querywords = cleantext.split()
resultwords = [word for word in querywords if word.lower() not in stopterms]
clean = ' '.join(resultwords)
print(clean)
def retrieve_important(self):
    article_content = Document(self.request_website.text)
    html_text = article_content.summary()
    self.text_content = self.strip_tags(html_text)
    self.text_content = unicodedata.normalize("NFKD", self.text_content)
def save(self, *args, **kwargs):
    if self.description:
        document = Document(self.description)
        self.readable_description = document.summary(html_partial=True)
    return super(FeedItem, self).save(*args, **kwargs)
class TitleExtractor(object):
    def __init__(self, html):
        self._html = html
        self._title = ''
        self._doc = Document(html)

    def clean_title(self, title):
        spliters = [' - ', '–', '—', '-', '|', '::']
        for s in spliters:
            if s not in title:
                continue
            tts = title.split(s)
            if len(tts) < 2:
                continue
            title = tts[0]
            break
        return title

    def get_title_method1(self):
        self._title = self._doc.short_title()

    def get_title_method2(self):
        # Handle special sites with irregular titles
        if not self._title:
            regex = TITLE_RE
            self._title = get_info(self._html, regex, fetch_one=True)

    def get_title_method3(self):
        g = Goose()
        article = g.extract(raw_html=self._html)
        self._title = article.title

    def get_title_method4(self):
        doc = lxml.html.fromstring(self._html)
        title = ''
        title_el = doc.xpath('//title')
        if title_el:
            title = title_el[0].text_content().strip()
        if len(title) < 7:
            tt = doc.xpath('//meta[@name="title"]')
            if tt:
                title = tt[0].get('content', '')
        if len(title) < 7:
            tt = doc.xpath(
                '//*[contains(@id, "title") or contains(@class, "title")]')
            if not tt:
                tt = doc.xpath(
                    '//*[contains(@id, "font01") or contains(@class, "font01")]'
                )
            for t in tt:
                ti = t.text_content().strip()
                if ti in title and len(ti) * 2 > len(title):
                    title = ti
                    break
                if len(ti) > 20:
                    continue
                if len(ti) > len(title) or len(ti) > 7:
                    title = ti
        self._title = title

    def get_title(self):
        self.get_title_method1()
        if not self._title:
            self.get_title_method2()
        if not self._title:
            self.get_title_method3()
        self._title = self.clean_title(self._title)
        return self._title
def parseDetail(self, response):
    '''
    Parse the detail page
    '''
    meta = response.meta
    url = response.url
    seed = meta["seedInfo"]
    enableDownloadFile = False
    enableDownloadImage = False
    enableSnapshot = False
    if seed.enableDownloadFile == 1:
        enableDownloadFile = True
    if seed.enableDownloadImage == 1:
        enableDownloadImage = True
    if seed.enableSnapshot == 1:
        enableSnapshot = True
    detailData = {}
    html = "".join(response.xpath("//html").extract())
    doc = Document(html)  # Use readability to process the document
    if "detailData" in meta:
        detailData = meta["detailData"]
    if len(detailData) <= 0:
        # On the first detail page, read in the title and url
        detailData["title"] = doc.title()
        detailData["publishAt"] = TimeUtils.get_conent_time(html)
        detailData["url"] = url
    content_snap = doc.summary()  # Extract the main content
    content = ArticleUtils.removeTag4Content(content_snap)
    ArticleUtils.mergeDict(detailData, "content", content)
    if enableDownloadImage:
        images = ArticleUtils.get_content_image_urls(content_snap, url)
        if images is not None and len(images) > 0:
            ArticleUtils.mergeDict(detailData, "contentImages", images)
    if enableDownloadFile:
        files = ArticleUtils.getContentFiles(response)
        if files is not None and len(files) > 0:
            ArticleUtils.mergeDict(detailData, "contentFiles", files)
    if enableSnapshot:
        ArticleUtils.mergeDict(detailData, "contentSnapshot", content_snap)
    # Crawl the next page
    nextpage_urls = ArticleUtils.getNextPageUrl('', response)
    if StringUtils.isNotEmpty(nextpage_urls):
        meta["detailData"] = detailData
        yield scrapy.Request(url=nextpage_urls, meta=meta, callback=self.parseDetail)
    else:
        item = ArticleUtils.meta2item(meta, detailData["url"])
        for (k, v) in detailData.items():
            itemValue = None
            if "category" == k and k in item:
                itemValue = item[k] + "/" + v
            elif "contentImages" == k or "contentFiles" == k:
                itemValue = json.dumps(list(v.values()), ensure_ascii=False)
            else:
                itemValue = v
            item[k] = itemValue
        item['html'] = html
        yield item
from boilerpipe.extract import Extractor
import q
import requests
from readability import Document

url = 'https://news.cnblogs.com/n/624615/'
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'
url = 'http://forthxu.com/blog/article/73.html'
url = 'http://forthxu.com/blog/article/91.html'
url = 'http://forthxu.com/blog/article/gmail-sub-account.html'

response = requests.get(url)
doc = Document(response.content)
print(doc.title())
s_html = doc.summary(True)
print("s_html:", s_html)

extractor = Extractor(extractor='ArticleExtractor', html=s_html)
# extractor = Extractor(extractor='ArticleExtractor', url=url)
extracted_text = extractor.getText()
print("extracted_text:", extracted_text)
# extracted_html = extractor.getHTML()

q.d()
def test_too_many_images_sample_html_partial(self):
    """Using the too-many-images sample, make sure we still get the article."""
    sample = load_sample('too-many-images.sample.html')
    doc = Document(sample)
    res = doc.summary(html_partial=True)
    self.assertEqual('<div><div class="post-body', res[0:26])
for i in list(data.keys()):
    if n < 1000:
        if data[i]['category'] == 'e':  # only use entertainment articles
            # logging
            if n % 10 == 0 and n != 0:
                print("10 more datapoints processed. Total %i. Time: %.2f" % (n, time() - s))
                s = time()
            # get text from url
            try:
                r = requests.get(url=data[i]['url'])
                doc = Document(r.text)
                summary = doc.summary()
            except:
                print("Skipped datapoint %i" % (i))
                continue
            # process text
            soup = BeautifulSoup(summary, 'html.parser')
            text = soup.get_text()
            text = text.lower()
            text = re.sub('"', '', text)
            text = re.sub("'", '', text)
            text = re.sub(",", '', text)
# Justext
paragraphs = justext.justext(response.content, justext.get_stoplist("Tagalog"))
justext_content = ""
for paragraph in paragraphs:
    if paragraph.class_type == 'good':
        justext_content += paragraph.text + "\n"

# Goose
g = Goose()
article = g.extract(raw_html=response.content)
goose_content = article.cleaned_text

# Readability
doc = Document(response.text)
readiblity_content = strip_tags(normalize_spaces(doc.summary())).strip()

# Newspaper
try:
    newspaper_content = fulltext(response.text, language='tl')
except AttributeError:
    newspaper_content = ""

# Similarity checking
j = similar(justext_content, orig_content)
g = similar(goose_content, orig_content)
r = similar(readiblity_content, orig_content)
n = similar(newspaper_content, orig_content)
        max-width: 650px;
        line-height: 1.4;
        padding: 0 10px;
    }
    h1, h2, h3 {
        line-height: 1.2;
    }
    </style>
    </head>
"""

with codecs.open(os.environ['QUTE_HTML'], 'r', 'utf-8') as source:
    data = source.read()

try:
    from breadability.readable import Article as reader
    doc = reader(data)
    title = doc._original_document.title
    content = HEADER % title + doc.readable + "</html>"
except ImportError:
    from readability import Document
    doc = Document(data)
    title = doc.title()
    content = doc.summary().replace('<html>', HEADER % title)

with codecs.open(tmpfile, 'w', 'utf-8') as target:
    target.write(content.lstrip())

with open(os.environ['QUTE_FIFO'], 'w') as fifo:
    fifo.write('open -t %s' % tmpfile)
def parse_xiangxi(self, response):
    sel = Selector(response)
    # print(response.url)
    item = NewsItem()
    if not os.path.exists('图片'):  # "images" directory
        os.mkdir('图片')
    try:
        title = sel.xpath("//div[@class='article-title']/h2/text()").extract()
        if len(title) == 0:
            raise Exception('title is none')
        else:
            item['title'] = title[0]
            # print(title)
    except:
        print('title 不能为空')  # "title cannot be empty"
    try:
        atime = sel.xpath("string(//div[@class='article-desc clearfix']/div/div[@class='article-source'])").extract()
        if len(atime) == 0:
            raise Exception('time is none')
        else:
            item['atime'] = atime[0][3:]
            # print(item['atime'])
    except:
        print('时间不能为空')  # "time cannot be empty"
    source = sel.xpath("string(//div[@class='article-desc clearfix']/div/div[@class='article-source'])").extract()
    if source == []:
        source = '无'  # "none"
    else:
        source = sel.xpath("string(//div[@class='article-desc clearfix']/div/div[@class='article-source'])").extract()[0][:4]
    item['source'] = source
    html = urllib.request.urlopen(response.url).read()
    article = Document(html).summary()
    sec = Selector(text=article)
    art = ','.join(sec.css("div.article-content p::text").extract())
    # First keyword-extraction algorithm
    tfidf = analyse.extract_tags
    keywords = tfidf(art)
    # print(keywords)
    # Second keyword-extraction algorithm
    # tfidf = analyse.default_textrank
    # keywords = tfidf(art)
    # for keyword in keywords:
    #     print(keyword)
    # content = sel.xpath("//div[@class='article-content']/p/text()").extract()
    # if content == []:
    #     content = ','.join(sel.xpath("string(//div[@class='article-content']/p)").extract())
    # else:
    #     content = ','.join(content)
    # item['content'] = content
    tupian_urls = sel.xpath("//div[@class='img-container']/img[@class='large']/@src").extract()
    if tupian_urls == []:
        tupian_url = '无'  # "none"
    else:
        # for tupian_url in tupian_urls:
        #     tupian_url = tupian_urls[0]
        tupian_url = ','.join(tupian_urls)
    item['tupian_url'] = tupian_url
    item['tupian_bendi'] = '图片/' + tupian_url[-6:]
def get_readable(self, response):
    doc = RDoc(response.text)
    return doc.summary()
def format_html(cls, row, media_path, content=None, custom_html=False):
    media_dir, file_path = os.path.split(media_path)
    resource_dir = os.path.join(settings.ARCHIVE_LOCATION, 'resources', str(row.id))
    resource_link = '/{}/{}/{}/{}'.format(row.usr.username, row.directory,
                                          str(row.id), 'resources')
    if not os.path.exists(resource_dir):
        os.makedirs(resource_dir)
    if not content:
        content = ""
        with open(media_path, encoding='utf-8', mode='r') as fd:
            content = fd.read()
    soup = BeautifulSoup(content, 'lxml')
    for script in soup.find_all('script'):
        script.decompose()
    url_path = row.url
    ourl = urlparse(url_path)
    ourld = ourl.scheme + '://' + ourl.netloc
    link_list = soup.find_all(['a', 'link', 'img'])
    for link in link_list:
        if link.name == 'img':
            lnk = link.get('src', '')
        else:
            lnk = link.get('href', '')
        if lnk and lnk != '#':
            if link.name == 'img' or (link.name == 'link' and '.css' in lnk):
                lnk = dbxs.format_link(lnk, url_path)
                lnk_bytes = bytes(lnk, 'utf-8')
                h = hashlib.sha256(lnk_bytes)
                lnk_hash = h.hexdigest()
                if link.name == 'img':
                    link['src'] = resource_link + '/' + lnk_hash
                    if custom_html:
                        link['class'] = 'card-img-top'
                else:
                    lnk_hash = lnk_hash + '.css'
                    link['href'] = resource_link + '/' + lnk_hash
                file_image = os.path.join(resource_dir, lnk_hash)
                if not os.path.exists(file_image):
                    cls.vnt_noblock.get(lnk, out=file_image)
                    logger.info('getting file: {}, out: {}'.format(lnk, file_image))
            elif lnk.startswith('http'):
                pass
            else:
                nlnk = dbxs.format_link(lnk, url_path)
                if link.name == 'img':
                    link['src'] = nlnk
                    if custom_html:
                        link['class'] = 'card-img-top'
                else:
                    link['href'] = nlnk
    if custom_html:
        ndata = soup.prettify()
        if soup.title:
            title = soup.title.text
        else:
            title = row.url.rsplit('/')[-1]
        data = Document(ndata)
        data_sum = data.summary()
        if data_sum:
            nsoup = BeautifulSoup(data_sum, 'lxml')
            if nsoup.text.strip():
                data = cls.custom_template(title, nsoup.prettify(), row)
            else:
                data = cls.custom_soup(ndata, title, row)
        else:
            data = cls.custom_soup(ndata, title, row)
    else:
        data = soup.prettify()
    return bytes(data, 'utf-8')
def get_article_doc(link):
    response = requests.get(link)
    doc = Document(response.text)
    return doc
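# A minimal usage sketch for get_article_doc above, assuming requests and
# readability-lxml are installed; the URL is only a placeholder, not from
# the original snippet.
if __name__ == '__main__':
    doc = get_article_doc('https://example.com/some-article')
    print(doc.short_title())               # cleaned-up article title
    print(doc.summary(html_partial=True))  # main article body as an HTML fragment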