def extract_normed_body(html):
    """Return the serialized body of *html* after ``drop_ignore_trees``.

    The markup is first passed through ``drop_ignore_trees``, the result is
    parsed with ``fromstring``, and the parsed document's ``body`` is
    serialized with ``tostring`` and converted with ``to_unicode``.

    :param html: markup to process (handed straight to ``drop_ignore_trees``).
    :returns: whatever ``to_unicode`` produces for the serialized body.
    """
    # NOTE: ``drop_ignore_trees`` is applied to the raw markup *before*
    # parsing.  An earlier variant parsed first and then ran
    # ``html.dom.drop_ignore_trees`` on the DOM; per the original author's
    # note that variant could not exclude the ignore tags for
    # "summary/test/resources/html/music-visualizer-progress.html".
    cleaned_markup = drop_ignore_trees(html)
    body_element = fromstring(cleaned_markup).body
    serialized_body = tostring(body_element)
    return to_unicode(serialized_body)
# NOTE(review): the collapsed source line carried three orphaned statements
# ahead of ``extract`` -- the tail of a method whose ``def`` header is not
# visible here.  They cannot execute at module level (``self`` is undefined
# and ``return`` is illegal outside a function), so they are preserved
# verbatim below for whoever restores the enclosing method:
#
#     content = self.content
#     data = fromstring(content)
#     return [item.attrib for item in data.xpath('//img')]


# Fetches resources for many schemes using a variety of protocols: instead of
# 'http:' the URI may use 'ftp:', 'file:', etc.
def extract(html=None, uri=None, config=None):
    """Build an ``Article`` from markup given directly or fetched from *uri*.

    :param html: markup to wrap.  When it is not ``None`` it is used as-is
        and *uri* is never fetched.
    :param uri: location to read with ``urllib.urlopen`` when *html* is
        ``None``.
    :param config: not read by this function; accepted only so existing
        callers keep working.  Defaults to ``None`` rather than a shared
        mutable ``{}``.
    :returns: ``Article(to_unicode(data))`` on success, or ``False`` when the
        fetch fails (a diagnostic is printed first).  When both *html* and
        *uri* are ``None``, ``None`` is handed to ``to_unicode`` unchanged.
    """
    data = html
    if data is None and uri is not None:
        # ``print(<single string>)`` emits identical text on Python 2 and 3.
        # The double space after each label reproduces what the original
        # ``print 'label: ', value`` statements wrote.
        try:
            response = urllib.urlopen(uri)
            try:
                data = response.read()
            finally:
                # Release the connection/file handle even if read() fails.
                response.close()
        except urllib2.HTTPError as e:
            print('The server couldn\'t fulfill the request.')
            print('Error code:  %s' % (e.code,))
            print('Reason:  %s' % (e.reason,))
            return False
        except urllib2.URLError as e:
            # URLError carries only ``reason``; reading ``code`` here would
            # raise AttributeError from inside the handler.
            print('We failed to reach a server.')
            print('Reason:  %s' % (e.reason,))
            return False
        except IOError as e:
            # IOError exposes ``errno``/``strerror`` (not ``code``/``reason``).
            # Fall back to the exception text when ``strerror`` is unset.
            reason = e.strerror if e.strerror is not None else e
            print('We failed to fetch local file.')
            print('Error code:  %s' % (e.errno,))
            print('Reason:  %s' % (reason,))
            return False
    return Article(to_unicode(data))