def __call__(self, values):
    new_values = []
    for v in arg_to_iter(values):
        if isinstance(v, (str, unicode)):
            v = remove_entities(v).strip()
        new_values.append(int(v))
    return new_values
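# Usage sketch for the integer-coercing processor above. Hedged: the
# IntProcessor wrapper class is hypothetical (the snippet only shows
# __call__), and the helpers are assumed to be Scrapy's arg_to_iter and
# w3lib's remove_entities.
from scrapy.utils.misc import arg_to_iter
from w3lib.html import remove_entities


class IntProcessor(object):
    def __call__(self, values):
        new_values = []
        for v in arg_to_iter(values):
            if isinstance(v, (str, unicode)):
                # decode entities such as &#52; before int() conversion
                v = remove_entities(v).strip()
            new_values.append(int(v))
        return new_values


proc = IntProcessor()
proc([u' 42 ', u'&#49;&#48;0', 7])  # -> [42, 100, 7]
proc(u'5')                          # scalar input is wrapped -> [5]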
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace. For example:

    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed

    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed

    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored

    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text

    >>> t(u"only &pound;42")
    u'only \\xa342'
    """
    chunks = _process_markup(
        region,
        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
        lambda tag: u' ')
    text = u''.join(chunks)
    return _WS.sub(u' ', text).strip()
def mklink(url, anchortext=None, nofollow=False):
    # base_href and htmlpage are closed over from the enclosing
    # link-extraction function's scope
    url = url.strip()
    fullurl = urljoin(base_href,
                      remove_entities(url, encoding=htmlpage.encoding))
    return Link(fullurl.encode(htmlpage.encoding), text=anchortext,
                nofollow=nofollow)
def __call__(self, values):
    new_values = []
    for v in arg_to_iter(values):
        if isinstance(v, (str, unicode)):
            v = remove_entities(v).strip()
            v = v.lower() == "true"
        else:
            v = bool(v)
        new_values.append(v)
    return new_values
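# Brief usage sketch for the boolean processor above; BoolProcessor is a
# hypothetical wrapper mirroring the IntProcessor sketch earlier, with
# arg_to_iter/remove_entities again assumed from Scrapy and w3lib.
from scrapy.utils.misc import arg_to_iter
from w3lib.html import remove_entities


class BoolProcessor(object):
    def __call__(self, values):
        new_values = []
        for v in arg_to_iter(values):
            if isinstance(v, (str, unicode)):
                # only the (case-insensitive) string "true" is truthy
                v = remove_entities(v).strip().lower() == "true"
            else:
                v = bool(v)
            new_values.append(v)
        return new_values


BoolProcessor()([u'true', u'&#116;rue', u'false', 1, 0])
# -> [True, True, False, True, False]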
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = self.base_url if self.base_url else response_url
    clean_url = lambda u: urljoin_rfc(
        base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(
        remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text))
                    for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
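# The method above depends on module-level helpers (linkre, clean_link) that
# are not shown in this snippet. The definitions below are illustrative
# assumptions to make the pipeline concrete; only the w3lib imports are real.
import re

from w3lib.html import remove_entities, remove_tags, replace_escape_chars
from w3lib.url import urljoin_rfc

# Illustrative stand-in: captures the href value, the rest of the opening
# tag, and the anchor text (three groups, as the unpacking above expects).
linkre = re.compile(
    r'<a\s[^>]*?href=["\']?([^"\'\s>]+)["\']?([^>]*)>(.*?)</a>',
    re.DOTALL | re.IGNORECASE)


def clean_link(link_text):
    """Strip leading/trailing whitespace, quotes and newlines from a link."""
    return link_text.strip("\t\r\n '\"")


html = '<a href="/about.html">\n <b>About us</b> </a>'
url, _, text = linkre.findall(html)[0]
urljoin_rfc('http://example.com/',
            remove_entities(clean_link(url.decode('utf-8'))))
# -> 'http://example.com/about.html'
replace_escape_chars(remove_tags(text.decode('utf-8'))).strip()
# -> u'About us'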
def image_url(txt):
    """Convert text to a url.

    This is quite conservative, since relative urls are supported.

    Example:

    >>> image_url('')
    >>> image_url('   ')
    >>> image_url(' \\n\\n  ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using
    the following policies:

    * if the regex contains a named group called "extract" that will be
      returned
    * if the regex contains multiple numbered groups, all those will be
      returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is
      returned
    """
    if isinstance(regex, basestring):
        regex = re.compile(regex)
    try:
        strings = [regex.search(text).group('extract')]  # named group
    except (AttributeError, IndexError):
        # no match (AttributeError) or no group named "extract" (IndexError)
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)
    if isinstance(text, unicode):
        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [remove_entities(unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
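# Quick demonstration of the three extraction policies above; assumes the
# function is in scope with the stdlib re module imported, `flatten` from
# scrapy.utils.python and w3lib's remove_entities.
extract_regex(r'name: (?P<extract>\w+)', u'name: foo name: bar')
# named group "extract" wins -> [u'foo']
extract_regex(r'(\w+)=(\w+)', u'a=1 b=2')
# numbered groups, flattened -> [u'a', u'1', u'b', u'2']
extract_regex(r'\d+', u'price 42 and 7')
# no groups: whole matches -> [u'42', u'7']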
def _extract_links(self, response_text, response_url, response_encoding):
    base_url = urljoin_rfc(response_url, self.base_url) \
        if self.base_url else response_url
    clean_url = lambda u: urljoin_rfc(
        base_url, remove_entities(clean_link(u.decode(response_encoding))))
    clean_text = lambda t: replace_escape_chars(
        remove_tags(t.decode(response_encoding))).strip()
    links_text = linkre.findall(response_text)
    urlstext = set([(clean_url(url), clean_text(text))
                    for url, _, text in links_text])
    return [Link(url, text) for url, text in urlstext]
def get_meta_refresh(response):
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is a number
    with the delay in seconds (or zero if not present) and url is a string
    with the absolute url to redirect to.

    If no meta redirect is found, (None, None) is returned.
    """
    if response not in _metaref_cache:
        body_chunk = remove_comments(
            remove_entities(response.body_as_unicode()[0:4096]))
        match = META_REFRESH_RE.search(body_chunk)
        if match:
            interval = float(match.group('int'))
            url = safe_url_string(match.group('url').strip(' "\''))
            url = urljoin_rfc(response.url, url)
            _metaref_cache[response] = (interval, url)
        else:
            _metaref_cache[response] = (None, None)
    return _metaref_cache[response]
def get_meta_refresh(response):
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is a number
    with the delay in seconds (or zero if not present) and url is a string
    with the absolute url to redirect to.

    If no meta redirect is found, (None, None) is returned.
    """
    if response not in _metaref_cache:
        body_chunk = remove_comments(
            remove_entities(response.body_as_unicode()[0:4096]))
        for match1 in META_TAG_RE.finditer(body_chunk):
            params = {}
            for match2 in META_TAG_ATTRS_RE.finditer(match1.group(1)):
                params[match2.group("key")] = match2.group("value")
            if params.get("http-equiv") == "refresh":
                match = META_CONTENT_RE.search(params.get("content", ""))
                if match:
                    interval = float(match.group("int"))
                    url = urljoin_rfc(
                        response.url,
                        safe_url_string((match.group("url") or "").strip(' "\'')))
                    _metaref_cache[response] = (interval, url)
                    return (interval, url)
        _metaref_cache[response] = (None, None)
    return _metaref_cache[response]
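# To make the moving parts concrete, a self-contained sketch of the
# regex-driven core that both get_meta_refresh variants implement. The
# META_REFRESH_RE pattern and the inputs below are illustrative assumptions
# (the modules' actual patterns are not shown); the w3lib imports are real.
import re

from w3lib.html import remove_comments, remove_entities
from w3lib.url import safe_url_string, urljoin_rfc

META_REFRESH_RE = re.compile(
    r'<meta[^>]*?http-equiv\s*=\s*["\']?refresh["\']?[^>]*?'
    r'content\s*=\s*["\']?(?P<int>\d+(?:\.\d+)?)\s*;\s*url=(?P<url>[^"\'>]+)',
    re.IGNORECASE)

body = u'<html><head><meta http-equiv="refresh" content="5; url=/home"></head>'
chunk = remove_comments(remove_entities(body[0:4096]))
m = META_REFRESH_RE.search(chunk)
if m:
    interval = float(m.group('int'))
    url = urljoin_rfc('http://example.com/',
                      safe_url_string(m.group('url').strip(' "\'')))
    # (interval, url) -> (5.0, 'http://example.com/home')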
def __call__(self, values):
    return [remove_entities(v) for v in arg_to_iter(values)]
def iterlinks(htmlpage):
    """Iterate over the links in the given HtmlPage.

    For example:

    >>> from scrapely.htmlpage import HtmlPage
    >>> p = HtmlPage(body=u"Please visit <a href='http://scrapinghub.com/'>Scrapinghub</a>")
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/', text=u'Scrapinghub', fragment='', nofollow=False)

    >>> p = HtmlPage(body=u"Go <a href='home.html'>Home</a>")
    >>> iterlinks(p).next()
    Link(url='home.html', text=u'Home', fragment='', nofollow=False)

    When a url is specified, absolute urls are made:

    >>> p.url = 'http://scrapinghub.com/'
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/home.html', text=u'Home', fragment='', nofollow=False)

    Base href attributes in the page are respected

    >>> p.body = u"<html><head><base href='myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/myproject/index.html', text=u'project', fragment='', nofollow=False)
    >>> p.body = u"<html><head><base href='http://scrape.io/myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrape.io/myproject/index.html', text=u'project', fragment='', nofollow=False)

    Frameset and iframe urls are extracted

    >>> p = HtmlPage(body=u"<html><frameset><frame src=frame1.html><frame src=frame2.html></frameset><iframe src='iframe.html'/></html>")
    >>> [l.url for l in iterlinks(p)]
    ['frame1.html', 'frame2.html', 'iframe.html']

    As are meta refresh tags:

    >>> p = HtmlPage(body=u"<html><head><meta http-equiv='refresh' content='5;url=http://example.com/' />")
    >>> iterlinks(p).next().url
    'http://example.com/'

    nofollow is set to True if the link has a rel='nofollow' attribute:

    >>> p = HtmlPage(body=u"<a href='somewhere.html' rel='nofollow'>somewhere</a>")
    >>> list(iterlinks(p))
    [Link(url='somewhere.html', text=u'somewhere', fragment='', nofollow=True)]

    It does not require well-formed HTML and behaves similarly to many browsers

    >>> p = HtmlPage(body=u"<a href='foo'>foo <a href=bar>bar</a><a href='baz'/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo', text=u'foo ', fragment='', nofollow=False), Link(url='bar', text=u'bar', fragment='', nofollow=False), Link(url='baz', text=u'baz', fragment='', nofollow=False)]

    Leading and trailing whitespace should be removed, including in base href

    >>> p = HtmlPage(body=u"<head><base href=' foo/ '/></head><a href='bar '/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo/bar', text=u'baz', fragment='', nofollow=False)]

    Test standard onclick links

    >>> p = HtmlPage(url="http://www.example.com", body=u"<html><td onclick=window.open('page.html?productid=23','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=23', text=None, fragment='', nofollow=False)]
    >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#' onclick=window.open('page.html?productid=24','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)]
    >>> p = HtmlPage(body=u"<html><div onclick=window.location.href='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm'>")
    >>> list(iterlinks(p))
    [Link(url='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm', text=None, fragment='', nofollow=False)]

    Onclick with no href

    >>> p = HtmlPage("http://www.example.com", body=u"<html><a onclick=window.open('page.html?productid=24','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)]

    Don't generate a link when the target is an anchor

    >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#section1' >")
    >>> list(iterlinks(p))
    []

    Extract links from <link> tags in page header

    >>> p = HtmlPage("http://example.blogspot.com/", body=u"<html><head><link rel='me' href='http://www.blogger.com/profile/987372' /></head><body>This is my body!</body></html>")
    >>> list(iterlinks(p))
    [Link(url='http://www.blogger.com/profile/987372', text=None, fragment='', nofollow=False)]
    """
    base_href = remove_entities(htmlpage.url, encoding=htmlpage.encoding)

    def mklink(url, anchortext=None, nofollow=False):
        url = url.strip()
        fullurl = urljoin(base_href,
                          remove_entities(url, encoding=htmlpage.encoding))
        return Link(fullurl.encode(htmlpage.encoding), text=anchortext,
                    nofollow=nofollow)

    # iter to quickly scan only tags
    tag_iter = (t for t in htmlpage.parsed_body if isinstance(t, HtmlTag))

    # parse body
    astart = ahref = None
    nofollow = False
    for nexttag in tag_iter:
        tagname = nexttag.tag
        attributes = nexttag.attributes
        if tagname == 'a' and (nexttag.tag_type == HtmlTagType.CLOSE_TAG or
                attributes.get('href') and
                not attributes.get('href', '').startswith('#')):
            if astart:
                yield mklink(ahref, htmlpage.body[astart:nexttag.start],
                             nofollow)
                astart = ahref = None
                nofollow = False
            href = attributes.get('href')
            if href:
                ahref = href
                astart = nexttag.end
                nofollow = attributes.get('rel') == u'nofollow'
        elif tagname == 'head':
            # scan ahead until end of head section
            for nexttag in tag_iter:
                tagname = nexttag.tag
                if (tagname == 'head' and
                        nexttag.tag_type == HtmlTagType.CLOSE_TAG) or \
                        tagname == 'body':
                    break
                if tagname == 'base':
                    href = nexttag.attributes.get('href')
                    if href:
                        joined_base = urljoin(htmlpage.url, href.strip(),
                                              htmlpage.encoding)
                        base_href = remove_entities(
                            joined_base, encoding=htmlpage.encoding)
                elif tagname == 'meta':
                    attrs = nexttag.attributes
                    if attrs.get('http-equiv') == 'refresh':
                        m = _META_REFRESH_CONTENT_RE.search(
                            attrs.get('content', ''))
                        if m:
                            target = m.group('url')
                            if target:
                                yield mklink(target)
                elif tagname == 'link':
                    href = nexttag.attributes.get('href')
                    if href:
                        yield mklink(href)
        elif tagname == 'area':
            href = attributes.get('href')
            if href:
                nofollow = attributes.get('rel') == u'nofollow'
                yield mklink(href, attributes.get('alt', ''), nofollow)
        elif tagname in ('frame', 'iframe'):
            target = attributes.get('src')
            if target:
                yield mklink(target)
        elif 'onclick' in attributes:
            match = _ONCLICK_LINK_RE.search(attributes["onclick"] or "")
            if not match:
                continue
            target = match.group("url")
            nofollow = attributes.get('rel') == u'nofollow'
            yield mklink(target, nofollow=nofollow)
    if astart:
        yield mklink(ahref, htmlpage.body[astart:])
def iterlinks(htmlpage):
    """Iterate over the links in the given HtmlPage.

    For example:

    >>> from scrapely.htmlpage import HtmlPage
    >>> p = HtmlPage(body=u"Please visit <a href='http://scrapinghub.com/'>Scrapinghub</a>")
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/', text=u'Scrapinghub', fragment='', nofollow=False)

    >>> p = HtmlPage(body=u"Go <a href='home.html'>Home</a>")
    >>> iterlinks(p).next()
    Link(url='home.html', text=u'Home', fragment='', nofollow=False)

    When a url is specified, absolute urls are made:

    >>> p.url = 'http://scrapinghub.com/'
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/home.html', text=u'Home', fragment='', nofollow=False)

    Base href attributes in the page are respected

    >>> p.body = u"<html><head><base href='myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/myproject/index.html', text=u'project', fragment='', nofollow=False)
    >>> p.body = u"<html><head><base href='http://scrape.io/myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrape.io/myproject/index.html', text=u'project', fragment='', nofollow=False)

    Frameset and iframe urls are extracted

    >>> p = HtmlPage(body=u"<html><frameset><frame src=frame1.html><frame src=frame2.html></frameset><iframe src='iframe.html'/></html>")
    >>> [l.url for l in iterlinks(p)]
    ['frame1.html', 'frame2.html', 'iframe.html']

    As are meta refresh tags:

    >>> p = HtmlPage(body=u"<html><head><meta http-equiv='refresh' content='5;url=http://example.com/' />")
    >>> iterlinks(p).next().url
    'http://example.com/'

    nofollow is set to True if the link has a rel='nofollow' attribute:

    >>> p = HtmlPage(body=u"<a href='somewhere.html' rel='nofollow'>somewhere</a>")
    >>> list(iterlinks(p))
    [Link(url='somewhere.html', text=u'somewhere', fragment='', nofollow=True)]

    It does not require well-formed HTML and behaves similarly to many browsers

    >>> p = HtmlPage(body=u"<a href='foo'>foo <a href=bar>bar</a><a href='baz'/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo', text=u'foo ', fragment='', nofollow=False), Link(url='bar', text=u'bar', fragment='', nofollow=False), Link(url='baz', text=u'baz', fragment='', nofollow=False)]

    Leading and trailing whitespace should be removed, including in base href

    >>> p = HtmlPage(body=u"<head><base href=' foo/ '/></head><a href='bar '/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo/bar', text=u'baz', fragment='', nofollow=False)]

    Test standard onclick links

    >>> p = HtmlPage(url="http://www.example.com", body=u"<html><td onclick=window.open('page.html?productid=23','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=23', text=None, fragment='', nofollow=False)]
    >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#' onclick=window.open('page.html?productid=24','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)]
    >>> p = HtmlPage(body=u"<html><div onclick=window.location.href='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm'>")
    >>> list(iterlinks(p))
    [Link(url='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm', text=None, fragment='', nofollow=False)]

    Onclick with no href

    >>> p = HtmlPage("http://www.example.com", body=u"<html><a onclick=window.open('page.html?productid=24','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)]

    Don't generate a link when the target is an anchor

    >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#section1' >")
    >>> list(iterlinks(p))
    []
    """
    base_href = remove_entities(htmlpage.url, encoding=htmlpage.encoding)

    def mklink(url, anchortext=None, nofollow=False):
        url = url.strip()
        fullurl = urljoin(base_href,
                          remove_entities(url, encoding=htmlpage.encoding))
        return Link(fullurl.encode(htmlpage.encoding), text=anchortext,
                    nofollow=nofollow)

    # iter to quickly scan only tags
    tag_iter = (t for t in htmlpage.parsed_body if isinstance(t, HtmlTag))

    # parse body
    astart = ahref = None
    nofollow = False
    for nexttag in tag_iter:
        tagname = nexttag.tag
        attributes = nexttag.attributes
        if tagname == 'a' and (nexttag.tag_type == HtmlTagType.CLOSE_TAG or
                attributes.get('href') and
                not attributes.get('href', '').startswith('#')):
            if astart:
                yield mklink(ahref, htmlpage.body[astart:nexttag.start],
                             nofollow)
                astart = ahref = None
                nofollow = False
            href = attributes.get('href')
            if href:
                ahref = href
                astart = nexttag.end
                nofollow = attributes.get('rel') == u'nofollow'
        elif tagname == 'head':
            # scan ahead until end of head section
            for nexttag in tag_iter:
                tagname = nexttag.tag
                if (tagname == 'head' and
                        nexttag.tag_type == HtmlTagType.CLOSE_TAG) or \
                        tagname == 'body':
                    break
                if tagname == 'base':
                    href = nexttag.attributes.get('href')
                    if href:
                        joined_base = urljoin(htmlpage.url, href.strip(),
                                              htmlpage.encoding)
                        base_href = remove_entities(
                            joined_base, encoding=htmlpage.encoding)
                elif tagname == 'meta':
                    attrs = nexttag.attributes
                    if attrs.get('http-equiv') == 'refresh':
                        m = _META_REFRESH_CONTENT_RE.search(
                            attrs.get('content', ''))
                        if m:
                            target = m.group('url')
                            if target:
                                yield mklink(target)
        elif tagname == 'area':
            href = attributes.get('href')
            if href:
                nofollow = attributes.get('rel') == u'nofollow'
                yield mklink(href, attributes.get('alt', ''), nofollow)
        elif tagname in ('frame', 'iframe'):
            target = attributes.get('src')
            if target:
                yield mklink(target)
        elif 'onclick' in attributes:
            match = _ONCLICK_LINK_RE.search(attributes["onclick"] or "")
            if not match:
                continue
            target = match.group("url")
            nofollow = attributes.get('rel') == u'nofollow'
            yield mklink(target, nofollow=nofollow)
    if astart:
        yield mklink(ahref, htmlpage.body[astart:])
def test_remove_entities(self):
    # make sure it always returns unicode
    assert isinstance(remove_entities('no entities'), unicode)
    assert isinstance(remove_entities('Price: &pound;100!'), unicode)
    # regular conversions
    self.assertEqual(remove_entities(u'As low as &#163;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(remove_entities('As low as &pound;100!'),
                     u'As low as \xa3100!')
    self.assertEqual(
        remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
        u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
    # keep some entities
    self.assertEqual(
        remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>',
                        keep=['lt', 'amp']),
        u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
    # illegal entities
    self.assertEqual(
        remove_entities('a &lt; b &illegal; c &#12345678; six',
                        remove_illegal=False),
        u'a < b &illegal; c &#12345678; six')
    self.assertEqual(
        remove_entities('a &lt; b &illegal; c &#12345678; six',
                        remove_illegal=True),
        u'a < b  c  six')
    self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')
    # check browser hack for numeric character references in the 80-9F range
    self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'),
                     u'x\u2122y')
    # encoding
    self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'),
                     u'x\u2122\u2122\u2122y')
def image_url(txt):
    """Convert text to a url.

    This is quite conservative, since relative urls are supported.

    Example:

    >>> image_url('')
    >>> image_url('   ')
    >>> image_url(' \\n\\n  ')
    >>> image_url('foo-bar.jpg')
    ['foo-bar.jpg']
    >>> image_url('/images/main_logo12.gif')
    ['/images/main_logo12.gif']
    >>> image_url("http://www.image.com/image.jpg")
    ['http://www.image.com/image.jpg']
    >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
    ['http://www.domain.com/path1/path2/path3/image.jpg']
    >>> image_url("/path1/path2/path3/image.jpg")
    ['/path1/path2/path3/image.jpg']
    >>> image_url("path1/path2/image.jpg")
    ['path1/path2/image.jpg']
    >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
    ['http://www.site.com/path1/path2/image.jpg']
    >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
    ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
    >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
    ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
    >>> image_url('../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg')
    ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
    >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
    ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
    >>> image_url('http://www.site.com/image.php')
    ['http://www.site.com/image.php']
    >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom)')
    ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
    """
    txt = url(txt)
    imgurl = None
    if txt:
        # check if the text is style content
        m = _CSS_IMAGERE.search(txt)
        txt = m.groups()[0] if m else txt
        parsed = urlparse.urlparse(txt)
        path = None
        m = _IMAGE_PATH_RE.search(parsed.path)
        if m:
            path = m.group()
        elif parsed.query:
            m = _GENERIC_PATH_RE.search(parsed.path)
            if m:
                path = m.group()
        if path is not None:
            parsed = list(parsed)
            parsed[2] = path
            imgurl = urlparse.urlunparse(parsed)
        if not imgurl:
            imgurl = txt
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
def iterlinks(htmlpage):
    """Iterate over the links in the given HtmlPage.

    For example:

    >>> from scrapely.htmlpage import HtmlPage
    >>> p = HtmlPage(body=u"Please visit <a href='http://scrapinghub.com/'>Scrapinghub</a>")
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/', text=u'Scrapinghub', fragment='', nofollow=False)

    >>> p = HtmlPage(body=u"Go <a href='home.html'>Home</a>")
    >>> iterlinks(p).next()
    Link(url='home.html', text=u'Home', fragment='', nofollow=False)

    When a url is specified, absolute urls are made:

    >>> p.url = 'http://scrapinghub.com/'
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/home.html', text=u'Home', fragment='', nofollow=False)

    Base href attributes in the page are respected

    >>> p.body = u"<html><head><base href='myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/myproject/index.html', text=u'project', fragment='', nofollow=False)
    >>> p.body = u"<html><head><base href='http://scrape.io/myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrape.io/myproject/index.html', text=u'project', fragment='', nofollow=False)

    Frameset and iframe urls are extracted

    >>> p = HtmlPage(body=u"<html><frameset><frame src=frame1.html><frame src=frame2.html></frameset><iframe src='iframe.html'/></html>")
    >>> [l.url for l in iterlinks(p)]
    ['frame1.html', 'frame2.html', 'iframe.html']

    As are meta refresh tags:

    >>> p = HtmlPage(body=u"<html><head><meta http-equiv='refresh' content='5;url=http://example.com/' />")
    >>> iterlinks(p).next().url
    'http://example.com/'

    nofollow is set to True if the link has a rel='nofollow' attribute:

    >>> p = HtmlPage(body=u"<a href='somewhere.html' rel='nofollow'>somewhere</a>")
    >>> list(iterlinks(p))
    [Link(url='somewhere.html', text=u'somewhere', fragment='', nofollow=True)]

    It does not require well-formed HTML and behaves similarly to many browsers

    >>> p = HtmlPage(body=u"<a href='foo'>foo <a href=bar>bar</a><a href='baz'/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo', text=u'foo ', fragment='', nofollow=False), Link(url='bar', text=u'bar', fragment='', nofollow=False), Link(url='baz', text=u'baz', fragment='', nofollow=False)]

    Leading and trailing whitespace should be removed, including in base href

    >>> p = HtmlPage(body=u"<head><base href=' foo/ '/></head><a href='bar '/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo/bar', text=u'baz', fragment='', nofollow=False)]
    """
    base_href = remove_entities(htmlpage.url, encoding=htmlpage.encoding)

    def mklink(url, anchortext=None, nofollow=False):
        url = url.strip()
        fullurl = urljoin_rfc(base_href,
                              remove_entities(url, encoding=htmlpage.encoding),
                              htmlpage.encoding)
        return Link(fullurl, text=anchortext, nofollow=nofollow)

    # iter to quickly scan only tags
    tag_iter = (t for t in htmlpage.parsed_body if isinstance(t, HtmlTag))

    # parse body
    astart = ahref = None
    nofollow = False
    for nexttag in tag_iter:
        tagname = nexttag.tag
        attributes = nexttag.attributes
        if tagname == "a":
            if astart:
                yield mklink(ahref, htmlpage.body[astart:nexttag.start],
                             nofollow)
                astart = ahref = None
                nofollow = False
            if nexttag.tag_type != HtmlTagType.CLOSE_TAG:
                href = attributes.get("href")
                if href and not href.startswith("#"):
                    ahref = href
                    astart = nexttag.end
                    nofollow = attributes.get("rel") == u"nofollow"
        elif tagname == "head":
            # scan ahead until end of head section
            for nexttag in tag_iter:
                tagname = nexttag.tag
                if (tagname == "head" and
                        nexttag.tag_type == HtmlTagType.CLOSE_TAG) or \
                        tagname == "body":
                    break
                if tagname == "base":
                    href = nexttag.attributes.get("href")
                    if href:
                        joined_base = urljoin_rfc(htmlpage.url, href.strip(),
                                                  htmlpage.encoding)
                        base_href = remove_entities(
                            joined_base, encoding=htmlpage.encoding)
                elif tagname == "meta":
                    attrs = nexttag.attributes
                    if attrs.get("http-equiv") == "refresh":
                        m = _META_REFRESH_CONTENT_RE.search(
                            attrs.get("content", ""))
                        if m:
                            target = m.group("url")
                            if target:
                                yield mklink(target)
        elif tagname == "area":
            href = attributes.get("href")
            if href:
                nofollow = attributes.get("rel") == u"nofollow"
                yield mklink(href, attributes.get("alt", ""), nofollow)
        elif tagname in ("frame", "iframe"):
            target = attributes.get("src")
            if target:
                yield mklink(target)
        elif "onclick" in attributes:
            # FIXME: extract URLs in onclick and add doctests
            pass
    if astart:
        yield mklink(ahref, htmlpage.body[astart:])
def parse_product(self, response):
    soup = BeautifulSoup(response.body)
    if not soup.find('div', attrs={'class': 'product'}):
        retry_request = _retry_page(response)
        if retry_request:
            yield retry_request
        else:
            self.log("Error parsing page, couldn't extract product name: %s"
                     % response.url)
        return

    main_name = soup.find('div', attrs={'class': 'product'}).h1.text
    main_name = remove_entities(main_name)
    brand_el = soup.find(
        lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
    brand = brand_el.findNextSibling('td').text.strip() if brand_el else ''
    # breadcrumb trail, skipping the first two crumbs
    cat_names = [span.a.text
                 for span in soup.find(
                     'div', attrs={'class': 'breadcrumbtrail'}
                 ).span.findAll('span') if span.a][2:]
    image_url = soup.find('img', {'itemprop': 'image'})
    image_url = image_url['src'] if image_url else None

    table = soup.find('table', id='responsive-table')
    options = soup.findAll('div', attrs={'class': 'option'})
    if table:
        # one product variant per table row
        for row in table.findAll('tr'):
            # Skip head row
            if not row.td:
                continue
            name = row.find('span', attrs={'class': 'name'}).text
            name = remove_entities(name)
            if not _main_name_in_opt_name(main_name, name):
                name = main_name + ' ' + name
            identifier = row.find('span', attrs={'class': 'codenumber'})
            if not identifier:
                self.errors.append(
                    "Identifier not found for products on page: %s"
                    % response.url)
                continue
            identifier = identifier.text
            price = row.find(_is_price_tag).text
            real_price = extract_price(price)
            # tiered shipping cost by price band
            if real_price < 15:
                shipping_cost = 3
            elif real_price < 40:
                shipping_cost = 4
            elif real_price < 130:
                shipping_cost = 7
            else:
                shipping_cost = None
            loader = ProductLoaderWithNameStrip(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('price', price)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
    elif options:
        # variants are encoded as <select> option combinations
        main_id = response.url.split('.')[-2].split('p-')[-1]
        price = soup.find('span', attrs={'class': 'inctax'}).span.text
        real_price = extract_price(price)
        if real_price < 15:
            shipping_cost = 3
        elif real_price < 40:
            shipping_cost = 4
        elif real_price < 130:
            shipping_cost = 7
        else:
            shipping_cost = None
        results = {}
        for opt in options:
            opt_name = opt.label.span.text
            results[opt_name] = []
            for subopt in opt.select.findAll('option'):
                subopt_name = subopt.text
                subopt_value = _soup_el_get_attr(subopt, 'value')
                if subopt_value == '0':
                    continue
                results[opt_name].append({
                    'id': remove_entities(subopt_name).replace('"', ''),
                    'name': opt_name + ': ' + subopt_name,
                })
        # cartesian product of all option groups (itertools.product)
        for opt_tuple in product(*results.values()):
            name = _build_opt_name(main_name, opt_tuple)
            identifier = _build_opt_id(main_id, opt_tuple)
            loader = ProductLoaderWithNameStrip(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('price', price)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
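# The price-banded shipping cost appears twice in parse_product above. A
# small helper like this hypothetical one would keep the tiers in one place;
# the thresholds are taken directly from the snippet.
def _shipping_cost(real_price):
    """Return the shipping cost for a price band, or None for >= 130."""
    if real_price < 15:
        return 3
    elif real_price < 40:
        return 4
    elif real_price < 130:
        return 7
    return None

# e.g. shipping_cost = _shipping_cost(extract_price(price)) in both branches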