def read(self, url=None, html=None, soup=None, autocue=default_autocue):
    """Parse a document, score its tags with the autocue, and select the
    highest-scoring tag as the news container.

    Exactly one of ``soup``, ``html`` or ``url`` should be given; they are
    checked in that order of precedence. When ``url`` is given the page is
    fetched with requests.

    Returns True on success, False when no document could be obtained or
    when ``_get_soup`` rejects the input with a TypeError.
    """
    self.autocue = autocue
    if soup is not None:
        self.soup = soup
    elif html is not None:
        try:
            self.soup = self._get_soup(html)
        except TypeError:
            return False
    elif url is not None:
        html = requests.get(url).content
        try:
            self.soup = self._get_soup(html)
        except TypeError:
            return False
    else:
        self.soup = None
    # No input at all: fail explicitly instead of raising AttributeError
    # on self.soup.find_all() below.
    if self.soup is None:
        return False
    self.make_urls_absolute(self.soup, url)
    self.autocue.execute(self.soup, PRE_TRAVERSAL)
    # We work our way up the DOM: find_all() yields tags in document
    # order, so iterating in reverse visits children before ancestors.
    for tag in reversed(self.soup.find_all()):
        if tag.name == 'p':
            evaluate_as = EVAL_PARAGRAPH
        else:
            # If the tag carries substantial text of its own (a direct
            # text node longer than 10 characters), evaluate it as a
            # paragraph rather than a mere container.
            evaluate_as = EVAL_CONTAINER
            for child in tag.children:
                if isinstance(child, NavigableString):
                    if len(unicode(child).strip()) > 10:
                        evaluate_as = EVAL_PARAGRAPH
                        break  # one qualifying text node is enough
        self.autocue.execute(tag, evaluate_as)
    self.autocue.execute(self.soup, POST_TRAVERSAL)
    self.news_container = get_tag_with_max_score(self.soup)
    return True
def read(self, url=None, html=None, soup=None, autocue=default_autocue):
    """Parse a document, score its tags with the autocue, and select the
    highest-scoring tag as the news container.

    Exactly one of ``soup``, ``html`` or ``url`` should be given; they are
    checked in that order of precedence. When ``url`` is given the page is
    fetched with requests, ``self.last_url`` records the final URL after
    redirects, and ``self.is_html`` records whether the response was
    served as text/html.

    Returns True on success, False when no document could be obtained or
    when ``_get_soup`` rejects the input with a TypeError.
    """
    self.autocue = autocue
    self.last_url = ''
    if soup is not None:
        self.soup = soup
    elif html is not None:
        try:
            self.soup = self._get_soup(html)
        except TypeError:
            return False
    elif url is not None:
        r = requests.get(url)
        # .get() instead of indexing: servers may omit the Content-Type
        # header entirely, which would otherwise raise KeyError here.
        self.is_html = 'text/html' in r.headers.get('content-type', '')
        html = r.content
        # r.url is the final URL after any redirects were followed.
        self.last_url = r.url
        try:
            self.soup = self._get_soup(html)
        except TypeError:
            return False
    else:
        self.soup = None
    # No input at all: fail explicitly instead of raising AttributeError
    # on self.soup.find_all() below.
    if self.soup is None:
        return False
    self.make_urls_absolute(self.soup, url)
    self.autocue.execute(self.soup, PRE_TRAVERSAL)
    # We work our way up the DOM: find_all() yields tags in document
    # order, so iterating in reverse visits children before ancestors.
    for tag in reversed(self.soup.find_all()):
        if tag.name == 'p':
            evaluate_as = EVAL_PARAGRAPH
        else:
            # If the tag carries substantial text of its own (a direct
            # text node longer than 10 characters), evaluate it as a
            # paragraph rather than a mere container.
            evaluate_as = EVAL_CONTAINER
            for child in tag.children:
                if isinstance(child, NavigableString):
                    if len(unicode(child).strip()) > 10:
                        evaluate_as = EVAL_PARAGRAPH
                        break  # one qualifying text node is enough
        self.autocue.execute(tag, evaluate_as)
    self.autocue.execute(self.soup, POST_TRAVERSAL)
    self.news_container = get_tag_with_max_score(self.soup)
    return True
def read(self, url=None, html=None, soup=None, autocue=default_autocue):
    """Parse a document, score its tags with the autocue, and select the
    highest-scoring tag as the news container.

    Exactly one of ``soup``, ``html`` or ``url`` should be given; they are
    checked in that order of precedence. When ``url`` is given the page is
    fetched with requests.

    Returns True on success, False when no document could be obtained or
    when ``_get_soup`` rejects the input with a TypeError. (The sibling
    variants of this method already use this True/False contract; this
    variant previously let the TypeError propagate and returned None.)
    """
    self.autocue = autocue
    if soup is not None:
        self.soup = soup
    elif html is not None:
        try:
            self.soup = self._get_soup(html)
        except TypeError:
            return False
    elif url is not None:
        html = requests.get(url).content
        try:
            self.soup = self._get_soup(html)
        except TypeError:
            return False
    else:
        self.soup = None
    # No input at all: fail explicitly instead of raising AttributeError
    # on self.soup.find_all() below.
    if self.soup is None:
        return False
    self.make_urls_absolute(self.soup, url)
    self.autocue.execute(self.soup, PRE_TRAVERSAL)
    # We work our way up the DOM: find_all() yields tags in document
    # order, so iterating in reverse visits children before ancestors.
    for tag in reversed(self.soup.find_all()):
        if tag.name == 'p':
            evaluate_as = EVAL_PARAGRAPH
        else:
            # If the tag carries substantial text of its own (a direct
            # text node longer than 10 characters), evaluate it as a
            # paragraph rather than a mere container.
            evaluate_as = EVAL_CONTAINER
            for child in tag.children:
                if isinstance(child, NavigableString):
                    if len(unicode(child).strip()) > 10:
                        evaluate_as = EVAL_PARAGRAPH
                        break  # one qualifying text node is enough
        self.autocue.execute(tag, evaluate_as)
    self.autocue.execute(self.soup, POST_TRAVERSAL)
    self.news_container = get_tag_with_max_score(self.soup)
    return True