def translate_page(request: HttpRequest):
    """
    Translate a page, given either a 'url' or a 'file_id' parameter.
    :param request:
    :return:
    """
    url = request.GET.get('url')
    file_id = request.GET.get('file_id')
    if url:  # a url was provided
        if 'http' not in url:
            url = 'http://' + url
        try:
            # fetch the page
            f = urllib.request.urlopen(url)
            text = f.read().decode()
            article = Document(text)
            article.content()  # extract the text
            html = article.get_clean_html()
            # compute the hash
            m = hashlib.md5()
            m.update(html.encode('utf-8'))
            hashed = m.hexdigest()
            try:
                # if the same text already exists for this user, load and return it
                p = UserPreference.objects.get(hashed=hashed, user=request.user)
            except UserPreference.DoesNotExist:
                p = UserPreference(user=request.user, original=text, trans=text, hashed=hashed)
                p.save()
            text = base64.b64encode(p.trans.encode('utf-8')).decode()
            return render(request, 'translate.html', context={
                'text': text,
                'hashed': hashed,
                'inc': p.inc
            })
        except Exception:
            return redirect('/translate_index?message=' + _("Cannot open url"))
    elif file_id:
        try:
            p = UserPreference.objects.get(pk=file_id)
            text = base64.b64encode(p.trans.encode('utf-8')).decode()
            return render(request, 'translate.html', context={
                'text': text,
                'hashed': p.hashed,
                'inc': p.inc
            })
        except UserPreference.DoesNotExist:
            pass
    return redirect('/')
def sitegetter(bots, update, args):
    url = args[0]
    raw = args[1] == "true"
    # 'sendFile' was undefined in the original snippet; assume it comes from an optional third argument
    sendFile = len(args) > 2 and args[2] == "true"
    response = requests.get(url)
    if not sendFile:
        doc = Document(response.text)
        if raw is False:
            print('summary')
            text = doc.summary()
        else:
            text = doc.content()
        # split the text into 4000-character chunks so each fits in a single message
        line = text
        n = 4000
        output = [line[i:i + n] for i in range(0, len(line), n)]
        for a in output:
            bots.send_message(chat_id=update.message.chat_id, text=a)
    else:
        with open('output.html', 'w+') as out:
            out.write(response.text)
        bots.send_document(chat_id=update.message.chat_id,
                           document=open('output.html', 'rb'))
def process_item(self, item, spider):
    '''
    DESCRIPTION:
    ------------
    For each news item, corresponding news text is extracted using the
    python library 'readability'.

    RETURNS:
    --------
    News item with 'newsText' field updated is returned.
    '''
    try:
        response = requests.get(item['newsUrl'])
        doc = Document(response.text)
        content = Document(doc.content()).summary()
        h = html2text.HTML2Text()
        h.ignore_links = True
        articleText = h.handle(content)
        articleText = articleText.replace('\r', ' ').replace('\n', ' ').strip()
        item['newsText'] = articleText
    except Exception:
        raise DropItem("Failed to extract article text from: " + item['newsUrl'])
    return item
def main(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    document = Document(article.html)
    summary = document.summary()
    content = document.content()

    title = get_title(article)
    text = get_text(article)
    entities = get_entities(text)
    phrases = get_phrases(text, entities)
    keywords = get_keywords(article, phrases)
    urls_primary = get_urls(summary, url, [])
    urls_secondary = get_urls(content, url, urls_primary)

    return {
        'title': title,
        'text': text,
        'entities': entities,
        'keywords': keywords,
        'phrases': phrases,
        'urls': {
            'primary': urls_primary,
            'secondary': urls_secondary,
        },
    }
def parse_item(self, response):
    filename = hashlib.sha1(response.url.encode()).hexdigest()
    readability_document = Document(response.body, url=response.url)
    item = BeerReviewPage()
    item['url'] = response.url
    item['filename'] = filename
    item['depth'] = response.meta['depth']
    item['link_text'] = response.meta['link_text']
    item['title'] = readability_document.short_title()
    # content() returns a string, so encode it before writing in binary mode
    with open('data/' + filename + '.html', 'wb') as html_file:
        html_file.write(readability_document.content().encode('utf-8'))
    print('(' + filename + ') ' + item['title'] + " : " + item['url'])
    return item
def process(self, item, spider):
    try:
        response = requests.get(item['newsurl'])
        doc = Document(response.text)
        content = Document(doc.content()).summary()
        h = html2text.HTML2Text()
        h.ignore_links = True
        articltext = h.handle(content)
        articltext = articltext.replace('\r', ' ').replace('\n', ' ').strip()
        item['newstext'] = articltext
    except Exception:
        raise DropItem("Failed to extract article text from: " + item['newsurl'])
    return item
def extract_article_info(text):
    """
    Gets a simplified page from the text.
    Uses the readability module.
    """
    doc = Document(text)

    # safely fetch the title
    title = doc.short_title()
    if not title:
        title = doc.title()

    # content
    content = doc.summary(html_partial=True)
    image = get_page_image(doc.content())

    # return
    return {'title': title, 'content': content, 'image': image}
def get_article_text(self, response):
    '''
    DESCRIPTION:
    -----------
    * This function cleanses the page of superfluous content such as
      advertising and HTML markup.

    PARAMETERS:
    ----------
    1. response
    '''
    doc = Document(response.text)
    article_html = Document(doc.content()).summary()
    h = html2text.HTML2Text()
    h.ignore_links = True
    article_text = h.handle(article_html)
    article_text = article_text.replace('\r', ' ').replace('\n', ' ').strip()
    return article_text
def extract_content_texts(name):
    article_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'raw_articles')
    json_archive = os.path.join(DEFAULT_SAVE_PATH, name, 'json_articles')
    mkdir_p(json_archive)
    for html in glob.glob(article_archive + '/*.html'):
        fname = os.path.basename(html) + '.json'
        savepath = os.path.join(json_archive, fname)
        if os.path.exists(savepath):
            logging.info('Skipping existing json data: {0}'.format(savepath))
            continue
        data = {}
        with open(html, 'r') as myfile:
            doc = Document(myfile.read())
            data['title'] = doc.title()
            data['content'] = doc.content()
            data['summary'] = doc.summary()
        with open(savepath, 'w') as saving:
            json.dump(data, saving)
def cleanDocument(self, text, theUrl):
    # map typographic and decorative characters to plain ASCII equivalents
    replaceChars = [
        ("“", '"'), ("”", '"'), ("‘", "'"), ("’", "'"),
        ("`", "'"), ("`", "'"), ("′", "'"),
        ("—", "-"), ("–", "-"), ("…", "..."), ("•", "."),
        ("«", '"'), ("»", '"'), ("„", '"'),
        ("μ", "micro"), ("™", "(TM)"), ("≤", "<="), ("≥", ">="),
        ("∀", "ForAll"), ("⇒", "=>"), ("б", "(6)"), ("š", "s"),
        ("├", "|-"), ("─", "--"), ("|", "| "), ("│", "| "),
        ("└", "-"), ("→", "->"), ("⁄", "/"), ("⅓", "1/3"),
        ("📸", "(camera)"), ("✅", "(x)"), ("👽", "(alien)"),
        ("👍", "(ok)"), ("🙀", "(oh)"), ("🚀", "(despegar)"),
        ("\\n", ""), ("\\t", ""),
    ]
    from readability import Document
    doc = Document(text)

    doc_title = doc.title()
    if not doc_title or (doc_title == "[no-title]"):
        if theUrl.lower().endswith("pdf"):
            # NOTE: 'response' is not defined in this scope in the original snippet
            title = getPdfTitle(response)
            print(title)
            doc_title = "[PDF] " + title
    theTitle = doc_title

    # myText = doc.summary()
    myText = doc.content()
    for a, b in replaceChars:
        myText = myText.replace(a, b)
        theTitle = theTitle.replace(a, b)
    return (myText, theTitle)
def make_site_with_rssfeed_readable_again(url, filename, is_clean):
    """Convert feed to an HTML."""
    with open(filename, 'w') as file_object:
        print "\nOPENING URL: " + url + "\n\n"
        headers = {
            'User-Agent': APP_BRANDNAME + '/' + APP_RELEASE + ' (Unix; Intel OS Nine 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        mystr = response.text

        # remove height and width attributes on images because CSS will handle that
        mystr = mystr.replace(u"height=", "whatever=")
        mystr = mystr.replace(u"width=", "whatever=")

        # remove unwanted strings in the output
        mystr = mystr.replace(u'<hr id=', '<hr class="spenden" id=')
        mystr = mystr.replace(u"<p><strong>Hilf mit!</strong>", "")
        mystr = mystr.replace(
            u"Mit Deiner finanziellen Hilfe unterstützt Du unabhängigen Journalismus.", "")

        if APP_DEBUG:
            print "FEED:\n" + str(mystr) + "\n****************************"

        feedtitle = None
        entries = []  # ensure 'entries' is defined even if feed parsing fails
        try:
            root = parse_feed(mystr)
            entries = root.entries
            # access the feed title
            feedtitle = root.feed.title
        except Exception, e:
            print "PARSING-ERROR: " + str(e)
            print(traceback.format_exc())

        if not feedtitle:
            feedtitle = DEFAULT_TITLE

        if is_clean:
            template = APP_PATH + '/' + 'template_clean.html'
        else:
            template = APP_PATH + '/' + 'template_readable.html'

        if APP_DEBUG:
            print "\n ENTRIES TO RENDER: " + str(len(entries)) + "\n"

        last_entry_link = entries[len(entries) - 1].link
        html_footer = site_footer_html()
        html_content = Template(filename=template, output_encoding='utf-8').render(
            last_entry_link=last_entry_link,
            num_of_entries=len(entries),
            feedurl=url,
            entries=entries,
            feedtitle=feedtitle,
            footer=html_footer)

        if APP_DEBUG:
            print "HTML:\n" + html_content + "\n****************************"

        if is_clean:
            clean = Document(html_content)
            file_object.write(clean.content())
        else:
            file_object.write(html_content)
- ArticleExtractor
- ArticleSentencesExtractor
- KeepEverythingExtractor
- KeepEverythingWithMinKWordsExtractor
- LargestContentExtractor
- NumWordsRulesExtractor
- CanolaExtractor
"""
url = 'https://techcrunch.com/2017/02/13/mit-speech-chip/'  # BadStatusLine from boilerpipe
url = "http://www.forbes.com/sites/trevorclawson/2017/02/23/finding-a-voice-can-a-uk-startup-compete-with-its-heavy-hitters-in-the-speech-recognition-market/"
url = "https://nakedsecurity.sophos.com/2017/03/03/researcher-uses-googles-speech-tools-to-skewer-google-recaptcha/"
url = "http://www.natureworldnews.com/articles/32595/20161123/microsoft-officially-makes-first-humanly-accurate-speech-recognition-tech.htm"
url = "http://www.businessinsider.com/ibm-edges-closer-to-human-speech-recognition-2017-3"

# ArticleExtractor = Extractor(extractor='ArticleExtractor', url=url)
# print "ArticleExtractor:\n" + ArticleExtractor.getText() + "\n"

ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor', url=url)
print ArticleSentencesExtractor.getText()

article = Goose().extract(url=url)
print article.cleaned_text

# Document expects the HTML string, not the Response object
document = Document(requests.get(url).text)
document.content()
import argparse

import requests
from readability import Document

parser = argparse.ArgumentParser()
parser.add_argument('-u', '--url', dest='url', help='url of policy',
                    metavar='URL', required=True)
parser.add_argument('-d', '--dest', dest='filepath', help='file to save policy',
                    metavar='FILE', required=True)
args = parser.parse_args()

url = args.url
filepath = args.filepath
print(url, filepath)

response = requests.get(url)
doc = Document(response.text)
doc.title()

with open(filepath, 'w') as fd:
    fd.write(doc.content())
import requests
from readability import Document
from pprint import pprint

response = requests.get('https://laravel-news.com/announcing-building-a-chatbot-with-laravel-and-botman')
doc = Document(response.text)

# API methods:
# .title()       -- full title
# .short_title() -- cleaned up title
# .content()     -- full content
# .summary()     -- cleaned up content

data = dict()
data['title'] = doc.title()
data['short_title'] = doc.short_title()
data['content'] = doc.content()
data['summary'] = doc.summary()

pprint(data)