import bs4
from requests import get
from readability import Document


def get_text(url):
    response = get(url)
    doc = Document(response.text)
    title = doc.title()
    summary = doc.summary()
    # Strip the markup readability leaves in its summary.
    body = bs4.BeautifulSoup(summary, features="lxml").get_text()
    return f"{title} : {body}"
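A minimal sketch of how this helper might be exercised; the URL is illustrative, and the call assumes `requests` and `readability-lxml` are installed and the page is reachable:

# Hypothetical usage; any article-like page will do.
print(get_text("https://example.com/some-article"))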
def convert(link):
    """Use buriy's readability implementation to transcode a web page and
    return the transcoded page and the images found in it."""
    if not link:
        logger.error('Cannot transcode nothing!')
        return None, None, None
    try:
        data = transcoder.prepare_link(link)
        if data:
            article = Document(data)
            if article:
                images, content = _collect_images(
                    article.summary(html_partial=False), link)
                return article.short_title(), content, images
            else:
                logger.info('Readability cannot recognize the data')
                return None, None, None
        else:
            logger.info('Cannot parse %s correctly' % link)
            return None, None, None
    except Exception as k:
        logger.error('%s for %s' % (str(k), str(link)))
        return None, None, None
def process_item(self, item: NoticeItem, spider: scrapy.Spider):
    if item and isinstance(item, NoticeItem):
        # Extract the main content from the HTML.
        def clean_p(s: str) -> str:
            return s.replace('\xa0', ' ').strip()

        def clean_all(s: str) -> str:
            s = self.SPACES_PATTERN.sub('\n', s)
            s = s.strip()
            return s

        # Replace any non-UTF-8 bytes rather than raising.
        content = item['content'].decode('utf-8', 'replace')
        item['author'] = _find_author(content)
        item['department'] = _find_department(content)
        item['publish_time'] = _find_publish_time(content)
        article = Document(content, handle_failures=None)
        page = etree.HTML(article.summary())
        paragraphs = [
            clean_p(p.xpath('string(.)')) for p in page.xpath('//p')
        ]
        item['content'] = clean_all('\n'.join(paragraphs))
        self.pg_pool.runInteraction(self.submit_item, item)
    # Scrapy pipelines must hand the item on to the next stage.
    return item
def POST(self):
    url = web.data().decode('utf8')
    response = requests.get(url)
    doc = Document(response.text)
    summary = doc.summary()
    web.header('Content-Type', 'text/plain')
    return clean(capture(summary))
def parse(html):
    doc = Document(html)
    title = doc.title()
    if title == u'[no-title]':
        title = u''
    content_html = doc.summary()
    content_html = content_html.replace(u'<html>', u'').replace(u'</html>', u'') \
        .replace(u'<body>', u'').replace(u'</body>', u'')
    clear_paths = [u'//script', u'//img', u'//a']
    body = clearDOM(html, clear_paths)
    match_list = findTimeStr(body)
    post_date = u''
    for match_item in match_list:
        if len(match_item) > len(post_date):
            post_date = match_item
    style_in_list = []
    style_need_replace = []
    content_item = {
        u'title': title,
        u'content_html': content_html,
        u'post_date': post_date,
        u'style_in_list': style_in_list,
        u'style_need_replace': style_need_replace,
    }
    return content_item
def parse_article(url: str) -> Dict:
    """
    Parses the HTML output for URL and grabs title and description

    :param url: The URL sent by the user
    :return: Dictionary containing HTML data
    """
    if is_url_blacklisted(url):
        return {}
    data = {"title": "", "description": "", "image": "", "body": ""}
    try:
        headers = {'user-agent': 'Bookie/app'}
        response = requests.get(https_upgrade(url), timeout=3, headers=headers)
    except Timeout:
        return data
    tree = html.fromstring(response.content)
    doc = Document(response.text)
    title = tree.xpath('//title/text()')
    description = tree.xpath('//meta[@name="description"]/@content')
    image = tree.xpath('//meta[@property="og:image"]/@content')
    body = doc.summary()
    if title:
        data["title"] = title[0]
    if description:
        data["description"] = description[0]
    if image:
        data["image"] = https_upgrade(image[0])
    if body:
        data["body"] = body
    return data
def test_correct_cleanup(self):
    sample = """
    <html>
        <body>
            <section>test section</section>
            <article class="">
                <p>Lot of text here.</p>
                <div id="advertisement"><a href="link">Ad</a></div>
                <p>More text is written here, and contains punctuation and dots.</p>
            </article>
            <aside id="comment1"/>
            <div id="comment2">
                <a href="asd">spam</a>
                <a href="asd">spam</a>
                <a href="asd">spam</a>
            </div>
            <div id="comment3"/>
            <aside id="comment4">A small comment.</aside>
            <div id="comment5"><p>The comment is also helpful, but it's
            still not the correct item to be extracted.</p>
            <p>It's even longer than the article itself!"</p></div>
        </body>
    </html>
    """
    doc = Document(sample)
    s = doc.summary()
    # print(s)
    assert "punctuation" in s
    assert "comment" not in s
    assert "aside" not in s
def parse(self, url: str) -> dict:
    """Download the article and parse it"""
    r = get(url, headers=HTML_HEADERS)
    doc = Document(r.text, url=url)
    html = doc.summary(html_partial=True)
    clean_html = self.fix_blockquotes(html)
    return {"content": clean_html}
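For context, `summary(html_partial=True)` returns only the extracted content fragment, while the default wraps it in a full `<html><body>` document; a small sketch contrasting the two forms on illustrative markup:

from readability import Document

sample = ("<html><body><article><p>"
          + "Enough readable sentence text to score. " * 20
          + "</p></article></body></html>")
doc = Document(sample)
print(doc.summary())                   # wrapped: starts with <html><body>...
print(doc.summary(html_partial=True))  # bare fragment, as used above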
def analyze(request):
    'API text analyze view'
    if request.method == 'POST':
        text = request.body.decode('utf-8')
        try:
            text = json.loads(text)['text']
        except ValueError:
            # catch POST form as well
            for key in request.POST.dict().keys():
                text = key
        if settings.ALLOW_URL_IMPORTS and text.startswith(('http://', 'https://', 'www')):
            page = requests.get(text)
            doc = Document(page.text)
            soup = BeautifulSoup(doc.summary(), 'html.parser')
            text = soup.get_text()
            title = doc.title().strip()
            text = '{0}.\n{1}'.format(title, text)
        if not text:
            response = JsonResponse(
                {'status': 'false', 'message': 'need some text here!'})
            response.status_code = 400
            return response
        # add some limit here
        text = text[:200000]
        ret = analyze_text(text)
        return JsonResponse(ret)
    else:
        ret = {'methods_allowed': 'POST'}
        return JsonResponse(ret)
def parse_story(self, response):
    doc = Document(response.text)
    story = NewsItem()
    story['url'] = response.url
    story['headline'] = doc.short_title()
    story['body'] = doc.summary()
    yield story
def getAutoDetail(cls, contentPageNumber, html, enableDownloadImage=False,
                  enableSnapshot=False):
    autoDetail = {}
    try:
        doc = Document(html)
        if contentPageNumber <= 1:
            autoDetail["title"] = ArticleUtils.cleanHeadTitle(doc.title())
            # autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
            # autoDetail["html"] = html
        contentSnapshot = doc.summary()
        if StringUtils.isNotEmpty(ArticleUtils.removeAllTag(contentSnapshot)):
            if enableSnapshot:
                autoDetail["contentSnapshot"] = contentSnapshot.replace(
                    "<html>", "").replace("</html>", "").replace(
                    "<body>", "").replace("</body>", "")
            autoDetail["content"] = ArticleUtils.removeTag4Content(contentSnapshot)
            if enableDownloadImage:
                # NOTE: `response` is not defined in this scope; the blanket
                # except below silently swallows the resulting NameError.
                autoDetail["contentImages"] = ArticleUtils.get_content_image_urls(
                    contentSnapshot, response.url)
    except Exception as e:
        return autoDetail
    return autoDetail
def main(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    document = Document(article.html)
    summary = document.summary()
    content = document.content()

    title = get_title(article)
    text = get_text(article)
    entities = get_entities(text)
    phrases = get_phrases(text, entities)
    keywords = get_keywords(article, phrases)
    urls_primary = get_urls(summary, url, [])
    urls_secondary = get_urls(content, url, urls_primary)

    return {
        'title': title,
        'text': text,
        'entities': entities,
        'keywords': keywords,
        'phrases': phrases,
        'urls': {
            'primary': urls_primary,
            'secondary': urls_secondary,
        },
    }
def getAutoDetail(cls, response, enableDownloadImage=False,
                  enableSnapshot=False, isFirstPage=True):
    autoDetail = {}
    try:
        html = "".join(response.xpath("//html").extract())
        doc = Document(html)
        if isFirstPage:
            autoDetail["title"] = doc.title()
            autoDetail["publishAt"] = TimeUtils.get_conent_time(html)
        contentSnapshot = doc.summary()
        if enableSnapshot:
            autoDetail["contentSnapshot"] = contentSnapshot.replace(
                "<html>", "").replace("</html>", "").replace(
                "<body>", "").replace("</body>", "")
        autoDetail["content"] = ArticleUtils.removeTag4Content(contentSnapshot)
        if enableDownloadImage:
            autoDetail["contentImages"] = ArticleUtils.get_content_image_urls(
                contentSnapshot, response.url)
    except Exception as e:
        return autoDetail
    return autoDetail
def parse(self, response):
    # Note the difference between response.body (str) and response.text
    # (unicode), and watch out for the page's encoding.
    key = response.url
    code = get_code(response.body)
    text = response.body.decode(code).encode('utf-8')
    name, college = pro_lists[key][0], pro_lists[key][1]

    # Save the raw HTML file.
    homepage_name = unicode(name + '_' + college, 'utf-8')
    filename = '%s.html' % homepage_name
    with open(os.path.join(source_html, filename), 'w') as f:
        f.write(text)
    self.log('Saved origin file %s' % filename)

    # First pass with the readability package to get a relatively clean page.
    doc = Document(text)
    clean_text, title = self.get_cleanpage(doc)
    with open(os.path.join(clean_html, filename), 'w') as f:
        f.write(clean_text)
    self.log('Saved clean file %s' % filename)

    # Use htmlpaser to extract all the text (together with readability
    # for the basic extraction).
    data = htmlpaser.FilterTag.strip_tags(text)
    data1 = beatifulsoup.get_html_content(text)
    html_content.write(homepage_name + ': "' + data + '"\n')
def process_item(self, article, spider):
    doc = Document(article['text'])
    article['text'] = strip_tags(doc.summary())
    # sha256 requires bytes, so encode the URL before hashing.
    article['hash'] = hashlib.sha256(article['url'].encode('utf-8')).hexdigest()
    return article
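Since sha256 hashes bytes rather than text, the URL must be encoded first; a quick standalone check (the URL is illustrative):

import hashlib

digest = hashlib.sha256("https://example.com/post/1".encode("utf-8")).hexdigest()
print(digest)  # 64 hex characters, stable for the same URL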
def get_article(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'
        }
        # Can throw BaseException if the server does not respond,
        # or TooManyRedirects.
        response = requests.get(url, headers=headers, timeout=4)
    except BaseException:
        return '', ''
    readability_doc = Document(response.text)
    # doc.summary() is not really a summary, just the main part of the
    # website's content.
    html_content = readability_doc.summary()
    title = readability_doc.title()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Collapse all excessive whitespace and blank lines.
    content = re.sub(r"(\s{2,})|(\n{2,})", "\n", soup.get_text(),
                     flags=re.UNICODE)
    if " - " in title:
        title = re.sub(r" - [\s\w]+$", "", title)
    return content, title
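Callers would unpack the `(content, title)` pair this returns; a hedged usage sketch with an illustrative URL:

content, title = get_article("https://example.com/news/story")
if content:
    print(title)
    print(content[:200])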
def handleMatch(self, match, full_text):
    """Find an import statement in the text; if it's there, extract the
    import type and its associated URL."""
    import_type, url = match.group(1, 2)
    # TODO: Refactor this to use methods instead of a long if conditional.
    if import_type == "html":
        source = request.urlopen(url)
        parser = HTMLBodyParser()
        # TODO: Handle non-UTF-8.
        parser.feed(source.read().decode("UTF-8"))
        source.close()
        return parser.document, match.start(), match.end()
    if import_type == "html_s":
        source = request.urlopen(url)
        # TODO: Handle non-UTF-8.
        source_document = Document(source.read().decode("UTF-8"))
        parser = HTMLBodyParser()
        parser.feed(source_document.summary())
        source.close()
        return parser.document, match.start(), match.end()
    if import_type == "csv":
        # TODO: Handle non-UTF-8.
        source = request.urlopen(url)
        csv_text = source.read().decode("UTF-8")
        table = etree.Element("table")
        for row in csv.reader(csv_text.splitlines()):
            table_row = etree.SubElement(table, "tr")
            for data in row:
                td = etree.SubElement(table_row, "td")
                td.text = data
        source.close()
        return table, match.start(), match.end()
def text(url):
    if 'http' not in url:
        url = 'http://' + url
    page = get(url).text
    doc = Document(page).summary()
    text = BeautifulSoup(doc, 'html.parser').get_text()
    return text.strip()
def test_wrong_link_issue_49(self):
    """We shouldn't break on bad HTML."""
    sample = load_sample(
        "the-hurricane-rubin-carter-denzel-washington.html")
    doc = Document(sample)
    res = doc.summary(html_partial=True)
    self.assertEqual('<div><div class="content__article-body ', res[0:39])
def getHtmlToClipboard(dest_url):
    html_headers = {
        "Host": "note.youdao.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) "
                      "Gecko/20100101 Firefox/70.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                  "image/webp,image/apng,*/*;q=0.8,"
                  "application/signed-exchange;v=b3",
        # "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive"
    }
    try:
        html_host = urllib.parse.urlparse(dest_url)
        html_headers["Host"] = html_host.netloc
        # urllib does not decompress responses automatically, so use requests.
        # html_request = urllib.request.Request(url, headers=html_headers)
        # html_response = urllib.request.urlopen(html_request)
        # html_response = urllib.request.urlopen(dest_url)
        # print(html_response.read().decode("utf-8"))
        # html_text = html_response.read().decode("utf-8")
        html_response = requests.get(dest_url, headers=html_headers)
        # print(html_response.text)
        html_text = html_response.text
        html_doc = Document(html_text)
        HtmlClipboard.PutHtml(html_doc.summary())
        if HtmlClipboard.HasHtml():
            print('there is HTML!!')
            dirty_HTML = HtmlClipboard.GetHtml()
    except Exception:
        return False
    else:
        return True
def extract():
    url = request.form['site']
    if url == '':
        error_msg = "Enter a valid URL"
        return flask.render_template('index.html', error=error_msg)
    response = requests.get(url)
    doc = Document(response.text)
    parser = html2text.HTML2Text()
    parser.ignore_links = True
    parser.ignore_images = True
    parser.ignore_emphasis = True
    parser.ignore_anchors = True
    parser.ignore_tables = True
    title = doc.title()
    # Crop everything after " - " (e.g. a trailing site name).
    title = re.sub(r' *- [-a-zA-Z0-9 @:%._\+~#=]{1,256}', '', title)
    # Crop trailing bare domains (for websites only).
    title = re.sub(r' *- *[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', title)
    article = parser.handle(str(doc.summary()))
    article = fix_article(article)
    article = article.split("<br/>")
    insert_in_db(analytics_client, container, url, title, article)
    return flask.render_template('index.html', title=title, data=article)
def run_readability(htmlstring):
    '''try with the Python3 port of readability.js'''
    try:
        doc = Document(htmlstring)
        return doc.summary()  # sanitize(doc.summary())
    except Exception as err:
        print('Exception:', err)
        return ''
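`run_readability` can be exercised without a network fetch by feeding it an inline HTML string (the markup is illustrative):

html = ("<html><body><article><p>"
        + "A long, readable sentence with punctuation. " * 15
        + "</p></article></body></html>")
print(run_readability(html))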
def test_best_elem_is_root_and_passing(self):
    sample = (
        '<html class="article" id="body">'
        '  <body>'
        '    <p>1234567890123456789012345</p>'
        '  </body>'
        '</html>'
    )
    doc = Document(sample)
    doc.summary()
def process_one(self, content):
    try:
        doc = Document(content)
        return ContentResult(doc.title(), doc.summary())
    except Exception as e:
        logger.error(
            f"Readability failed on {content.title} with error {e}")
        return ContentResult('', '')
def test_si_sample(self):
    """Using the si sample, load article with only opening body element"""
    sample = load_sample('si-game.sample.html')
    doc = Document(
        sample,
        url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.summary()
    self.assertEqual('<html><body><div><div class', res[0:27])
def get_entry_content(self, entry, rss_entry):
    try:
        r = requests.get(rss_entry.link)
        assert r.status_code == 200
    except Exception:
        return ('<div class="alert alert-warning" role="alert">'
                'Unable to get the full article with readability, '
                'because the page didn\'t load :(</div>\n'
                + super().get_entry_content(entry, rss_entry))
    return Document(r.text).summary()
def get_main_text(url):
    response = requests.get(url)
    doc = Document(response.text)
    text = doc.summary()
    # Pull the text between each <p> and </p> pair.
    re_form = r'(?<=<p>)(.*?)(?=</p>)'
    re_text = " ".join(re.findall(re_form, text))
    print(re_text)
    return re_text
def test_many_repeated_spaces(self):
    long_space = " " * 1000000
    sample = "<html><body><p>foo" + long_space + "</p></body></html>"
    doc = Document(sample)
    s = doc.summary()
    assert "foo" in s
def get_text_from_url(url):
    '''Get the most significant text returned from a URL.'''
    response = requests.get(url)
    doc = Document(response.text)
    text = dataset.preprocess_html(doc.summary())
    return text