def auto_excerpt(self):
    """
    Attempts to detect the text of this page (ignoring all navigation and
    other clutter), returning a list of strings. Each string represents a
    paragraph.
    """
    # Imports are deferred to call time so the heavy ebdata machinery is only
    # loaded when an excerpt is actually requested.
    from ebdata.textmining.treeutils import make_tree
    tree = make_tree(self.html)
    if self.seed.rss_full_entry:
        # The feed already carries the full entry body -- no clutter-stripping
        # heuristics needed, just flatten the HTML into paragraphs.
        from ebdata.templatemaker.textlist import html_to_paragraph_list
        paras = html_to_paragraph_list(tree)
    else:
        if self.seed.strip_noise:
            # Diff this page against a "companion" page from the same site to
            # remove shared template/navigation markup in place.
            from ebdata.templatemaker.clean import strip_template
            try:
                html2 = self.companion_page().html
            except IndexError:
                # No companion page available; continue with the unstripped tree.
                pass
            else:
                tree2 = make_tree(html2)
                strip_template(tree, tree2)
        if self.seed.guess_article_text:
            from ebdata.templatemaker.articletext import article_text
            paras = article_text(tree)
        else:
            from ebdata.templatemaker.textlist import html_to_paragraph_list
            paras = html_to_paragraph_list(tree)
    return paras
def main(url): if not url: print "No url provided" sys.exit() #url = 'http://newstatesman.com/politics/2013/10/russell-brand-on-revolution' #h = getHTML(url) html = UnicodeRetriever().fetch_data(url) tree = make_tree(html) lines = article_text(tree) file_type = magic.from_buffer(html, mime=True) print "File Type: %s" % file_type #print html url_obj = urlparse(url) if not url_obj.path: print "URL is top-level" if len(lines) < 1: print "URL is top-level" soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES) #print get_attribute(html, 'img', url) img = get_attribute(soup, 'img', url) title = get_attribute(soup, 'title', url) desc = get_attribute(soup, 'description', lines) print "Title: %s" % title print "Desc: %s" % desc print "IMG: %s" % img
def main(url): if not url: print "No url provided" sys.exit() #url = 'http://newstatesman.com/politics/2013/10/russell-brand-on-revolution' #h = getHTML(url) html = UnicodeRetriever().fetch_data(url) tree = make_tree(html) lines = article_text(tree) file_type = magic.from_buffer(html, mime=True) print "File Type: %s" % file_type #print html url_obj = urlparse(url) if not url_obj.path: print "URL is top-level" if len(lines)<1: print "URL is top-level" soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES) #print get_attribute(html, 'img', url) img = get_attribute(soup, 'img', url) title = get_attribute(soup, 'title', url) desc = get_attribute(soup, 'description', lines) print "Title: %s" % title print "Desc: %s" % desc print "IMG: %s" % img
def get_printer_friendly_page(self, html, url): """ Parses the given detail page and returns the printer-friendly page, or None if not found. """ print_link = printer_friendly_link(make_tree(html)) if print_link: print_link = urlparse.urljoin(url, print_link) try: return self.get_article_page(print_link) except Exception, e: self.logger.debug('Error retrieving printer-friendly page %s: %s', url, e) return None
def get_printer_friendly_page(self, html, url): """ Parses the given detail page and returns the printer-friendly page, or None if not found. """ print_link = printer_friendly_link(make_tree(html)) if print_link: print_link = urlparse.urljoin(url, print_link) try: return self.get_article_page(print_link) except Exception, e: self.logger.debug( 'Error retrieving printer-friendly page %s: %s', url, e) return None
def assertMakeTree(self, html, expected): import warnings # Note, warnings.catch_warnings() should work but has no effect here? warnings.simplefilter('ignore', UnicodeWarning) got = etree.tostring(make_tree(html), method='html') self.assertEqual(got, expected)
def assertConverts(self, html, expected):
    """Assert that html_to_paragraph_list() yields *expected* for *html*."""
    tree = make_tree(html)
    paragraphs = html_to_paragraph_list(tree)
    self.assertEqual(paragraphs, expected)
def setAttributesForUrl(pgConn, url_id, html_str=None, url_core=None):
    """
    Downloads the HTML if not found in DB
    pulls out the title, description, thumbnail, etc
    saves this meta data to postgres
    """
    printMsg = True
    html = None
    url = None
    soup = None
    lines = []
    if not pgConn:
        if printMsg:
            print '[setAttributesForUrl]: Error - No postgres connection'
        return False
    # Resolve the URL: prefer the caller-supplied core, else look it up by id.
    if url_core:
        url = url_core
    if not url:
        row = get_url(pgConn, url_id)
        if row:
            url = row['url']
    if url:
        # A very short path (fewer than 5 chars) is treated as a front page,
        # not an article, and discarded.
        url_obj = urlparse(url)
        if len(url_obj.path) < 5:
            url = None
    if url:
        # Resolve the HTML: caller-supplied string, then DB cache, then a
        # live fetch.
        if html_str:
            html = html_str
        if not html:
            html_row = get_html(pgConn, url_id)
            if html_row:
                html = html_row['html']
            elif url:
                html = getHTML(url)
    if html:
        tree = make_tree(html)
        lines = article_text(tree)
        soup = cleanSoup(BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES))
        # No extractable paragraphs means the page is useless to us.
        if len(lines) < 1:
            html = None
    if not html or not url or not soup:
        if printMsg:
            print '[setAttributesForUrl]: Error - no html returned %s' % url
        delete_url(pgConn, url_id)  # not sure we need to do this
        return False
    # get thumbnail
    # TODO: check to see if this is working correctly
    thumbnail = get_attribute(soup, 'img', url)
    if not len(thumbnail):
        if printMsg:
            print '[setAttributesForUrl]: Warning - no thumbnail returned - %s' % url
    # get title
    title = get_attribute(soup, 'title')
    if title is None:
        title = ''
    if not len(title):
        if printMsg:
            print '[setAttributesForUrl]: Warning - no title returned - %s' % url
    # Collapse runs of whitespace in the title.
    title = " ".join(title.strip().split())
    # get description
    description = get_attribute(soup, 'description', lines)
    if description == 'error':
        #delete_url(pgConn, url_id)
        description = None
    if description is None:
        description = ''
    try:
        description = " ".join(description.strip().split())
    except:
        # NOTE(review): bare except with a no-op assignment -- silently keeps
        # a non-string description as-is.
        description = description
    if printMsg:
        print ""
        print "-----------------------------------------"
        print "URL ID: %s" % url_id
        print "Title: %s" % title
        print "Desc: %s" % description
        print "IMG: %s" % thumbnail
        print "-----------------------------------------"
    if not len(description):
        if printMsg:
            print '[setAttributesForUrl]: Warning - no description returned - %s' % url
    # save
    if pgConn:
        save_url_attributes(pgConn, url_id, title, description, thumbnail)
    # NOTE(review): returns False even on success -- callers apparently
    # ignore the return value.
    return False
def urlValidation(pgConn, url_long, hosts=[]): # TODO: fix periodically not returning the expanded URL url_parts = getURLCore(pgConn, url_long) # returns list or None valid = False url_core = None html = None if not url_parts: print "[addURL]: %s - Error - not able to find a core: %s" % (getCurrentTime(), url_long) return url_core, html if url_parts and len(url_parts): url_core = url_parts[0] url_netloc = url_parts[1] if url_core and url_netloc: #validate here to only the media outlets in hosts if len(hosts): net_loc_lower = url_netloc.lower().strip() ext = tldextract.extract(net_loc_lower) normalized_url_netloc = "%s.%s" % (ext.domain, ext.suffix) for host in hosts: host_parts = host.split('/') #speigal.de/international #if not valid: if host_parts[0] == normalized_url_netloc or host_parts[0] == normalized_url_netloc: if len(host_parts) == 2: if host in url_core: valid = True else: valid = True if valid: break else: valid = True if not valid: print "[addURL]: %s - Error - url is not in the whitelist of hosts - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc) if valid: valid = is_url_valid(url_core) if not valid: print "[addURL]: %s - Error - url seems to be an image, video or audio file - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc) # got this far need to check if there is quality text on the page html = None if valid: html = getHTML(url_core) if html: tree = make_tree(html) lines = article_text(tree) if len(lines) < 1: valid = False else: valid = False if not valid: print "[addURL]: %s - Error - there seems to be no paragraphs to read - Core: %s - Net: %s" % (getCurrentTime(), url_core, url_netloc) if not valid: url_core = None html = None return url_core, html
def urlValidation(pgConn, url_long, hosts=[]): # TODO: fix periodically not returning the expanded URL url_parts = getURLCore(pgConn, url_long) # returns list or None valid = False url_core = None html = None if not url_parts: print "[addURL]: %s - Error - not able to find a core: %s" % ( getCurrentTime(), url_long) return url_core, html if url_parts and len(url_parts): url_core = url_parts[0] url_netloc = url_parts[1] if url_core and url_netloc: #validate here to only the media outlets in hosts if len(hosts): net_loc_lower = url_netloc.lower().strip() ext = tldextract.extract(net_loc_lower) normalized_url_netloc = "%s.%s" % (ext.domain, ext.suffix) for host in hosts: host_parts = host.split('/') #speigal.de/international #if not valid: if host_parts[0] == normalized_url_netloc or host_parts[ 0] == normalized_url_netloc: if len(host_parts) == 2: if host in url_core: valid = True else: valid = True if valid: break else: valid = True if not valid: print "[addURL]: %s - Error - url is not in the whitelist of hosts - Core: %s - Net: %s" % ( getCurrentTime(), url_core, url_netloc) if valid: valid = is_url_valid(url_core) if not valid: print "[addURL]: %s - Error - url seems to be an image, video or audio file - Core: %s - Net: %s" % ( getCurrentTime(), url_core, url_netloc) # got this far need to check if there is quality text on the page html = None if valid: html = getHTML(url_core) if html: tree = make_tree(html) lines = article_text(tree) if len(lines) < 1: valid = False else: valid = False if not valid: print "[addURL]: %s - Error - there seems to be no paragraphs to read - Core: %s - Net: %s" % ( getCurrentTime(), url_core, url_netloc) if not valid: url_core = None html = None return url_core, html
def assertMakeTree(self, html, expected):
    """Assert that make_tree(html) serializes back to the *expected* HTML."""
    serialized = etree.tostring(make_tree(html), method='html')
    self.assertEqual(serialized, expected)
# NOTE(review): the statements below, up to "return final_sections", are the
# tail of an enclosing definition (apparently article_text_sections) whose
# header is outside this view -- indentation reconstructed accordingly.
    to_delete = []
    for i, paragraph in enumerate(section):
        if paragraph.lower() in ignored_paragraphs:
            # Remember boilerplate paragraphs for removal.
            to_delete.append(i)
        elif is_punctuated(paragraph) and len(paragraph) >= MIN_CHARS_IN_PARAGRAPH:
            count += 1
    percent_punctuated = decimal.Decimal(count) / decimal.Decimal(len(section))
    # Keep the section only if enough paragraphs look like real sentences.
    if count >= NUM_PARAGRAPHS_SAFE_GUESS or (count >= MIN_NUM_PUNCTUATED and percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
        for i in reversed(to_delete):  # Delete in reverse so that index order is preserved.
            del section[i]
        final_sections.append(section)
    return final_sections

def article_text(tree):
    """
    Simple wrapper around article_text_sections() that "flattens" sections
    into a single section.
    """
    result = []
    for section in article_text_sections(tree):
        result.extend(section)
    return result

if __name__ == "__main__":
    # Ad-hoc CLI: fetch the URL given as argv[1] and print its article text.
    from ebdata.retrieval import UnicodeRetriever
    from ebdata.textmining.treeutils import make_tree
    import sys
    html = UnicodeRetriever().fetch_data(sys.argv[1])
    lines = article_text(make_tree(html))
    print lines
def setAttributesForUrl(pgConn, url_id, html_str=None, url_core=None): """ Downloads the HTML if not found in DB pulls out the title, description, thumbnail, etc saves this meta data to postgres """ printMsg = True html = None url = None soup = None lines = [] if not pgConn: if printMsg: print '[setAttributesForUrl]: Error - No postgres connection' return False if url_core: url = url_core if not url: row = get_url(pgConn, url_id) if row: url = row['url'] if url: url_obj = urlparse(url) if len(url_obj.path) < 5: url = None if url: if html_str: html = html_str if not html: html_row = get_html(pgConn, url_id) if html_row: html = html_row['html'] elif url: html = getHTML(url) if html: tree = make_tree(html) lines = article_text(tree) soup = cleanSoup( BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)) if len(lines) < 1: html = None if not html or not url or not soup: if printMsg: print '[setAttributesForUrl]: Error - no html returned %s' % url delete_url(pgConn, url_id) # not sure we need to do this return False # get thumbnail # TODO: check to see if this is working correctly thumbnail = get_attribute(soup, 'img', url) if not len(thumbnail): if printMsg: print '[setAttributesForUrl]: Warning - no thumbnail returned - %s' % url # get title title = get_attribute(soup, 'title') if title is None: title = '' if not len(title): if printMsg: print '[setAttributesForUrl]: Warning - no title returned - %s' % url title = " ".join(title.strip().split()) # get description description = get_attribute(soup, 'description', lines) if description == 'error': #delete_url(pgConn, url_id) description = None if description is None: description = '' try: description = " ".join(description.strip().split()) except: description = description if printMsg: print "" print "-----------------------------------------" print "URL ID: %s" % url_id print "Title: %s" % title print "Desc: %s" % description print "IMG: %s" % thumbnail print "-----------------------------------------" if 
not len(description): if printMsg: print '[setAttributesForUrl]: Warning - no description returned - %s' % url # save if pgConn: save_url_attributes(pgConn, url_id, title, description, thumbnail) return False
def assertPreprocesses(self, html, expected, **kwargs):
    """
    Assert that preprocess(make_tree(html), **kwargs), serialized back to
    HTML, equals *expected*.
    """
    import warnings
    with warnings.catch_warnings():
        # Bug fix: catch_warnings() alone changes no filters, so the context
        # manager was a no-op. Installing the filter inside the context
        # suppresses the UnicodeWarning (as the sibling assertMakeTree helper
        # does) and is automatically restored on exit.
        warnings.simplefilter('ignore', UnicodeWarning)
        tree = make_tree(html)
        got = etree.tostring(preprocess(tree, **kwargs), method='html')
        self.assertEqual(got, expected)
# NOTE(review): the statements below, up to "return final_sections", are the
# tail of an enclosing definition (apparently article_text_sections) whose
# header is outside this view -- indentation reconstructed accordingly.
    # Keep the section only if enough paragraphs look like real sentences.
    if count >= NUM_PARAGRAPHS_SAFE_GUESS or (
            count >= MIN_NUM_PUNCTUATED and
            percent_punctuated >= MIN_PERCENTAGE_PUNCTUATED):
        for i in reversed(
                to_delete
        ):  # Delete in reverse so that index order is preserved.
            del section[i]
        final_sections.append(section)
    return final_sections

def article_text(tree):
    """
    Simple wrapper around article_text_sections() that "flattens" sections
    into a single section.
    """
    result = []
    for section in article_text_sections(tree):
        result.extend(section)
    return result

if __name__ == "__main__":
    # Ad-hoc CLI: fetch the URL given as argv[1] and print its article text.
    from ebdata.retrieval import UnicodeRetriever
    from ebdata.textmining.treeutils import make_tree
    import sys
    html = UnicodeRetriever().fetch_data(sys.argv[1])
    lines = article_text(make_tree(html))
    print lines
def assertPreprocesses(self, html, expected, **kwargs):
    """Assert preprocess() output, serialized as HTML, matches *expected*."""
    processed = preprocess(make_tree(html), **kwargs)
    serialized = etree.tostring(processed, method='html')
    self.assertEqual(serialized, expected)