def parse_article(fn):
    """Print the normalized URL of every ``<img>`` found in the HTML file *fn*.

    The file is parsed with ``etree.HTMLParser``.  For each ``img`` element
    the ``src`` attribute is passed through ``layout.norm_ext_img_url`` and
    the result is printed.  If normalizing or printing raises
    ``UnicodeEncodeError``, the file name and the raw ``src`` value are
    written to stderr and processing continues with the next image.

    Args:
        fn: Path of the HTML file to parse.
    """
    parser = etree.HTMLParser()
    # Open in binary mode so the parser does the decoding itself, and use a
    # context manager so the handle is closed as soon as parsing finishes.
    with open(fn, 'rb') as html_file:
        tree = etree.parse(html_file, parser)
    root = tree.getroot()
    for img in root.xpath('.//img'):
        src = img.get('src')  # None when the element has no src attribute
        try:
            # Single-argument parenthesized form prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(layout.norm_ext_img_url(src))
        except UnicodeEncodeError:
            sys.stderr.write('UnicodeError %s %r \n' % (fn, src))
def parse_css(css, cssimagedir):
    """Download the images referenced by *css* and rewrite it to local URLs.

    Every ``url(...)`` reference in *css* that is not a ``data:`` URI (plus
    the entries of ``extra``) is normalized with
    ``layout.norm_ext_img_url``, downloaded, and stored in *cssimagedir*
    under the name given by ``layout.ext_img_url2fn(url, keep_ext=False)``.
    After a successful download the reference in the stylesheet text is
    replaced by ``layout.ext_img_url2local_cssimg_url(url)``.

    A download that fails with ``urllib2.URLError`` is reported on stdout and
    skipped; its reference is left untouched and no file is created for it.

    Args:
        css: Stylesheet text to scan and rewrite.
        cssimagedir: Directory in which downloaded images are written.

    Returns:
        The stylesheet text with successfully downloaded references replaced.
    """
    # NOTE(review): `extra` is not defined inside this function; presumably a
    # module-level list of additional image URLs -- confirm against the file.
    urls = [u for u in re.findall(r'url\((.*?)\)', css)
            if not u.startswith('data:')] + extra
    for u in urls:
        url = layout.norm_ext_img_url(u)
        lurl = layout.ext_img_url2local_cssimg_url(url)
        fn = layout.ext_img_url2fn(url, keep_ext=False)
        ofn = os.path.join(cssimagedir, fn)
        # Fetch before touching the filesystem so a failed download does not
        # leave an empty file behind.
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
        except urllib2.URLError as e:
            print('ERR %s %s' % (e, url))
            continue
        # Image payloads are binary: write them untranslated and close the
        # handle deterministically.
        with open(ofn, 'wb') as out:
            out.write(data)
        css = css.replace(u, lurl)
    # Hand the rewritten stylesheet back; otherwise the replacements made
    # above would be lost when the function exits.
    return css