def test_doc_no_scripts_styles(self):
    """Step #1: remove all scripts from the document"""
    doc = Article(load_snippet('document_scripts.html'))
    readable = doc.readable_dom
    self.assertEqual(readable.findall(".//script"), [])
    self.assertEqual(readable.findall(".//style"), [])
    self.assertEqual(readable.findall(".//link"), [])
def getData(source_url):
    # Default to http:// when the URL has no scheme.
    if not source_url.startswith('//') and '://' not in source_url:
        source_url = 'http://' + source_url
    try:
        html = get(source_url, headers={
            'User-Agent': 'Computer Club Plaintext Reading Plugin'
        }).text
    except Exception:
        return None, None, None
    soup = BeautifulSoup(html, 'lxml')
    try:
        header = soup.find('h1').text.strip()
    except AttributeError:
        header = None
    try:
        title = soup.find('title').text.strip()
    except AttributeError:
        title = None
    readable = Article(html, url=source_url).readable
    soup = BeautifulSoup(readable, 'lxml')
    if not soup.find('div', {'id': 'readabilityBody'}).text.strip():
        readable = None
    return title, header, readable
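# A minimal usage sketch for getData; the address below is hypothetical.
# The scheme-less URL is normalized to http://, and a failed request
# yields (None, None, None).
title, header, readable = getData('example.com/some-article')
if readable is None:
    print('No readable content found')
else:
    print(title)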
def test_no_content():
    """Without content we supply an empty unparsed doc."""
    doc = Article('')
    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'
    assert doc.readable_dom.get('class') == 'parsing-error'
def __init__(self, html_content, tokenizer, url=None):
    super(HtmlParser, self).__init__(tokenizer)
    self._article = Article(html_content, url)

    # count the number of paragraphs on the page
    self.desired_ct = 2
    self.paragraph_ct = 0
    self.correct_paragraph_ct = 0
    for paragraph in self._article.main_text:
        self.paragraph_ct += 1

    # intro material is usually in the first couple of paragraphs, so
    # if there are a lot of paragraphs keep only a fraction of them
    if self.paragraph_ct > self.desired_ct:
        # calculate the number of paragraphs to use
        self.correct_paragraph_ct = int(self.paragraph_ct * 0.3 // 10)
        # if the correction results in too few paragraphs, opt to use
        # the desired number
        if self.correct_paragraph_ct <= 1:
            self.correct_paragraph_ct = self.desired_ct
    # if less than desired and more than 0, we are already at the
    # correct paragraph count
    elif self.paragraph_ct > 0:
        self.correct_paragraph_ct = self.paragraph_ct

    # delete the excess paragraphs from the end
    diff = self.paragraph_ct - self.correct_paragraph_ct
    while diff > 0:
        del self._article.main_text[diff + self.correct_paragraph_ct - 1]
        diff -= 1
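# A worked sketch of the trimming arithmetic above; the paragraph
# counts are illustrative, not from the source.
assert int(100 * 0.3 // 10) == 3  # long page: keep 3 paragraphs
assert int(20 * 0.3 // 10) == 0   # correction too small: fall back to
                                  # the desired count of 2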
def test_candidates(self):
    """Verify we have candidates."""
    doc = Article(self.article)
    found = False
    wanted_hash = '04e46055'
    for node in doc.candidates.values():
        if node.hash_id == wanted_hash:
            found = node
    self.assertTrue(found)

    # We have the right node; it must be getting dropped somewhere if
    # it is not still there when we need it to be. Make sure it is not
    # in our to-drop list.
    for node in doc._should_drop:
        self.assertFalse(node == found.node)

    by_score = sorted(
        [c for c in doc.candidates.values()],
        key=attrgetter('content_score'),
        reverse=True)
    self.assertTrue(by_score[0].node == found.node)

    updated_winner = check_siblings(by_score[0], doc.candidates)
    updated_winner.node = prep_article(updated_winner.node)
def test_content_exists(self):
    """Verify that some content exists."""
    doc = Article(self.article)
    self.assertTrue('Amazon and Google' in doc.readable)
    self.assertFalse('Linkblog updated' in doc.readable)
    self.assertFalse(
        '#anExampleGoogleDoesntIntendToShareBlogAndItWill' in doc.readable)
def test_unlikely_hits():
    """Verify we wipe out things from our unlikely list."""
    doc = Article(load_snippet('test_readable_unlikely.html'))
    readable = doc.readable_dom
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot', 'header',
        'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor',
        'ad-break', 'agegate', 'pagination', 'pager', 'popup', 'tweet',
        'twitter', 'imgBlogpostPermalink'
    ]
    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for i in must_not_appear:
        # we should not find any class or id with this value
        by_class = readable.find_class(i)
        for test in by_class:
            # if it is still here, it may only carry the "must not
            # appear" class together with a "want to appear" class
            found = False
            for cls in test.get('class').split():
                if cls in want_to_appear:
                    found = True
            assert found

        by_ids = readable.get_element_by_id(i, False)
        if by_ids is not False:
            found = False
            for ids in by_ids.get('id').split():
                if ids in want_to_appear:
                    found = True
            assert found
def test_doc_no_scripts_styles():
    """Step #1: remove all scripts from the document"""
    doc = Article(load_snippet('document_scripts.html'))
    readable = doc.readable_dom
    assert readable.findall(".//script") == []
    assert readable.findall(".//style") == []
    assert readable.findall(".//link") == []
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path, "rb") as file:
        return Article(
            file.read(),
            "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8"
        )
def test_beta_removed(self):
    """The id=beta element should be removed

    It's link-heavy and causes a lot of garbage content, so it should
    be removed.
    """
    doc = Article(self.article)
    self.assertTrue('id="beta"' not in doc.readable)
def test_body_doesnt_exist(self):
    """If we can't find a body, then we create one.

    We build our doc around the rest of the html we parsed.
    """
    doc = Article(load_snippet('document_no_body.html'))
    self.assertEqual(doc.readable_dom.tag, 'div')
    self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_images_preserved(self):
    """Images in the article content should be preserved."""
    doc = Article(self.article)
    self.assertTrue(
        'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg'
        in doc.readable)
    self.assertTrue(
        'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg'
        in doc.readable)
def test_bare_content(self):
    """If the document is pure content with no html tags, we should be ok

    We build our doc around the rest of the html we parsed.
    """
    doc = Article(load_snippet('document_only_content.html'))
    self.assertEqual(doc.readable_dom.tag, 'div')
    self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_find_body_exists(self):
    """If the document has a body, we store that as the readable html

    No sense processing anything other than the body content.
    """
    doc = Article(load_snippet('document_min.html'))
    self.assertEqual(doc.readable_dom.tag, 'div')
    self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_one_annotation():
    article = Article(
        "<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
    annotated_text = article.main_text
    assert annotated_text == [(
        ("This is text\nwith", None),
        ("no", ("del", )),
        ("annotations", None),
    )]
def extract_html(content):
    # Extract the main article text and flatten the annotated
    # paragraphs back into a single plain-text string.
    article = Article(content)
    annotated_text = article.main_text
    paragraphs = ""
    for paragraph in annotated_text:
        sentences = ""
        for text, annotations in paragraph:
            sentences += text
        paragraphs += sentences
    return paragraphs
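# A short usage sketch for extract_html; the markup is illustrative.
html = "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
print(extract_html(html))  # the concatenated plain text of both paragraphs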
def test_one_annotation(self):
    article = Article(
        "<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
    annotated_text = article.main_text
    expected = [(
        ("This is text\nwith", None),
        ("no", ("del", )),
        ("annotations", None),
    )]
    self.assertEqual(annotated_text, expected)
def parse(content, content_type=None, url=None):
    """Handle the parsing out of the html content given"""
    read = Readable()
    document = Article(content.read(), url=url)
    if not document.readable:
        read.error(STATUS_CODES['900'], "Could not parse content.")
    else:
        read.set_content(document.readable, content_type=content_type)
        read.status = STATUS_CODES['1']
    return read
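# A hedged usage sketch for parse(); it expects a file-like object whose
# read() returns the HTML, so an io.StringIO wrapper works here. The URL
# is hypothetical.
from io import StringIO

read = parse(StringIO("<html><body><p>Hello</p></body></html>"),
             content_type='text/html',
             url='http://example.com/post')
print(read.status)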
def createAudioFile(filename: str, url: str, speed: int = 200):
    # check that the url is valid and fetch the page
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return "Unable to get page " + url
    except requests.RequestException:
        return "Unable to get page " + url
    # fix the filename if needed for pyttsx3
    if not filename.endswith(".mp3"):
        filename = filename + ".mp3"
    # check that the filename/location is writable
    # This is really not the pythonic solution, HOWEVER
    # pyttsx3 seems to provide NO error checking or anything
    directory = os.path.abspath(os.path.dirname(filename))
    if not os.access(directory, os.W_OK):
        return "Unable to write to directory/file"
    # refuse to clobber an existing file
    if os.path.exists(filename):
        return "File already exists"
    # find the important content with readability
    doc = Article(response.content, url=url)
    # isolate the text with soup
    soup = BeautifulSoup(doc.readable, 'lxml')
    text = soup.text
    # init the speech engine and set the speaking rate
    engine = pyttsx3.init()
    engine.setProperty('rate', speed)
    # write to the file location
    engine.save_to_file(text, filename)
    engine.runAndWait()
    engine.stop()
    # return to make sure it was completed
    return "Wrote audio to " + filename
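# A minimal sketch of calling createAudioFile; the URL and filename are
# hypothetical. All failure modes return a message instead of raising,
# so the result can be logged directly.
result = createAudioFile('article.mp3',
                         'http://example.com/some-article',
                         speed=180)
print(result)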
def processHtml(html):
    """
    Process an HTML document.

    :param html: the HTML source to extract text from
    :return: the extracted plain text
    """
    _article = Article(html)
    annotated_text = _article.main_text
    sentences = []
    for paragraph in annotated_text:
        current_text = ""
        for text, annotations in paragraph:
            current_text += " " + text
        sentences.append(current_text)
    return "".join(sentences)
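# A short usage sketch for processHtml; the markup is illustrative and
# the exact spacing depends on how Article splits the paragraphs.
sample = "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
print(processHtml(sample))  # " First paragraph. Second paragraph."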
def extract_html(content, is_content=True):
    if is_content:
        article = Article(content)
        annotated_text = article.main_text
    else:
        annotated_text = [((content, None), )]
    paragraphs = ""
    split_sent = ['。', '？', '！', '!', '?']
    # Do not split sentences that appear inside double quotation marks.
    Dquotes = ['"', '“', '”']
    for paragraph in annotated_text:
        sentences = ""
        for text, annotations in paragraph:
            sentences += text
        # strip editor/translator credits and copyright boilerplate,
        # then mask digits and segment with jieba
        sentences = re.sub(r"((.*?))?(\(.+\))?(编译.+)?(责编:.+)?", "", sentences)
        sentences = re.sub(r"(本文系版权作品,未经授权严禁转载。.*)\s?(责编)?", "", sentences)
        sentences = re.sub(r"\d", "#", sentences)
        sentences = " ".join(jieba.cut(sentences))
        if len(sentences) == 0:
            continue
        quote = False
        newsentences = ""
        newsentences += " " + PARAGRAPH_START + " " + SENTENCE_START + " "
        for word in sentences:
            if word in Dquotes and not quote:
                quote = True
                newsentences += word
            elif word in Dquotes and quote:
                quote = False
                newsentences += word
            elif quote:
                newsentences += word
            elif word in split_sent and not quote:
                newsentences += word
                newsentences += " " + SENTENCE_END + " "
                newsentences += SENTENCE_START + " "
            else:
                newsentences += word
        # drop a trailing empty sentence marker, otherwise close the
        # final sentence
        if len(newsentences) - newsentences.rfind(SENTENCE_START + " ") == 4:
            newsentences = newsentences[:-len(SENTENCE_START + " ")]
        else:
            newsentences += " " + SENTENCE_END
        newsentences += " " + PARAGRAPH_END
        paragraphs += newsentences
    return paragraphs
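# A hedged usage sketch for the segmenting extract_html above; the
# PARAGRAPH/SENTENCE markers and jieba come from the surrounding module,
# and the input text is illustrative. With is_content=False the raw
# string is wrapped as a single annotated paragraph instead of parsed.
tagged = extract_html("今天天气很好。我们出去走走!", is_content=False)
print(tagged)  # tokenized text wrapped in paragraph/sentence markers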
def test_simple_snippet():
    snippet = Article(load_snippet("annotated_1.html"))
    annotated_text = snippet.main_text
    assert annotated_text == [(
        ("Paragraph is more", None),
        ("better", ("em", )),
        (".\nThis text is very", None),
        ("pretty", ("strong", )),
        ("'cause she's girl.", None),
    ), (
        ("This is not", None),
        ("crap", ("big", )),
        ("so", None),
        ("readability", ("dfn", )),
        ("me :)", None),
    )]
def test_real_article():
    article = Article(load_article("zdrojak_automaticke_zabezpeceni.html"))
    annotated_text = article.main_text
    assert annotated_text == [
        (
            ("Automatické zabezpečení", ("h1", )),
            ("Úroveň zabezpečení aplikace bych rozdělil do tří úrovní:", None),
        ),
        (
            ("Aplikace zabezpečená není, neošetřuje uživatelské vstupy ani své výstupy.", ("li", "ol")),
            ("Aplikace se o zabezpečení snaží, ale takovým způsobem, že na ně lze zapomenout.", ("li", "ol")),
            ("Aplikace se o zabezpečení stará sama, prakticky se nedá udělat chyba.", ("li", "ol")),
        ),
        (("Jak se tyto úrovně projevují v jednotlivých oblastech?", None), ),
        (
            ("XSS", ("a", "h2")),
            ("Druhou úroveň představuje ruční ošetřování pomocí", None),
            ("htmlspecialchars", ("a", "kbd")),
            (". Třetí úroveň zdánlivě reprezentuje automatické ošetřování v šablonách, např. v", None),
            ("Nette Latte", ("a", "strong")),
            (". Proč píšu zdánlivě? Problém je v tom, že ošetření se dá obvykle snadno zakázat, např. v Latte pomocí", None),
            ("{!$var}", ("code", )),
            (". Viděl jsem šablony plné vykřičníků i na místech, kde být neměly. Autor to vysvětlil tak, že psaní", None),
            ("{$var}", ("code", )),
            ("někde způsobovalo problémy, které po přidání vykřičníku zmizely, tak je začal psát všude.", None),
        ),
        (("<?php\n$safeHtml = $texy->process($content_texy);\n$content = Html::el()->setHtml($safeHtml);\n// v šabloně pak můžeme použít {$content}\n?>", ("pre", )), ),
        (
            ("Ideální by bylo, když by už samotná metoda", None),
            ("process()", ("code", )),
            ("vracela instanci", None),
            ("Html", ("code", )),
            (".", None),
        ),
    ]
def extract(html, **kwargs):
    """ Extract an article from given URL

    Example::

        >>> from artexin.fetch import fetch_content
        >>> c = fetch_content('http://hetland.org/writing/instant-hacking.html')
        >>> t, s = extract(c)
        >>> 'What is Programming?' in s
        True
        >>> '<a href="./../research">Research</a>' in s
        False
        >>> '<div id="navigation">' in s
        False

    :param html:      String containing the HTML document
    :param **kwargs:  Extra arguments for readability's ``Document()`` class
    :returns:         Two-tuple containing document title and article body
    """
    # Extract article
    soup = BeautifulSoup(html, 'lxml')
    title_text = get_title(soup)
    doc = Article(html, return_fragment=False, **kwargs)

    # Create basic <head> tag with <title> and charset tags
    clean_html = doc.readable
    soup = BeautifulSoup(clean_html, 'lxml')
    head = soup.new_tag('head')
    title = soup.new_tag('title')
    title.string = title_text
    meta_charset = soup.new_tag('meta', charset='utf-8')
    meta_equiv = soup.new_tag('meta', content="text/html; charset='utf-8'")
    # hyphenated attributes can't be passed as new_tag() kwargs
    meta_equiv['http-equiv'] = 'Content-Type'
    soup.html.insert(0, head)
    soup.head.append(meta_charset)
    soup.head.append(meta_equiv)
    soup.head.append(title)

    # Add doctype
    final = '<!DOCTYPE html>\n' + soup.prettify()
    return (title_text, final)
def main():
    args = parse_args()
    if args.verbose:
        set_logging_level('DEBUG')
    if args.debug:
        LNODE.activate()
    target = args.path[0]
    LOG.debug("Target: " + target)

    if target.startswith('http') or target.startswith('www'):
        is_url = True
        url = target
    else:
        is_url = False
        url = None

    if is_url:
        req = urllib.urlopen(target)
        content = req.read()
        ucontent = unicode(content, 'utf-8')
    else:
        ucontent = codecs.open(target, "r", "utf-8").read()

    doc = Article(ucontent, url=url, fragment=args.fragment)
    if args.browser:
        fg, pathname = mkstemp(suffix='.html')
        out = codecs.open(pathname, 'w', 'utf-8')
        out.write(doc.readable)
        out.close()
        webbrowser.open(pathname)
    else:
        # Wrap sys.stdout into a StreamWriter to allow writing unicode.
        sys.stdout = codecs.getwriter(
            locale.getpreferredencoding())(sys.stdout)
        sys.stdout.write(doc.readable)
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path) as file:
        return Article(file.read())
def __init__(self, html_content, tokenizer, url=None):
    super(HtmlParser, self).__init__(tokenizer)
    self._article = Article(html_content, url)
def test_title_loads():
    """Verify we can fetch the title of the parsed article"""
    doc = Article(load_snippet('document_min.html'))
    assert doc._original_document.title == 'Min Document Title'
def test_no_annotations():
    article = Article("<div><p>This is text with no annotations</p></div>")
    annotated_text = article.main_text
    assert annotated_text == [(("This is text with no annotations", None), )]
def test_empty():
    article = Article("")
    annotated_text = article.main_text
    assert annotated_text == []