def mklink(url, anchortext=None, nofollow=False): url = url.strip() fullurl = urljoin(base_href, replace_entities(url, encoding=htmlpage.encoding)) return Link(fullurl.encode(htmlpage.encoding), text=anchortext, nofollow=nofollow)
def _clean_field_str(self, field, clean_html=False, optional=False, max_length=None): """Generic clean method for string field.""" value = self._as_str(field) if max_length: value = value[:max_length] if clean_html: value = replace_entities(remove_tags(value)) if not value and not optional: raise ValueError('field is not optional' " or can't be converted to a string") return value
def iterlinks(htmlpage): """Iterate through the links in the HtmlPage passed For example: >>> from scrapely.htmlpage import HtmlPage >>> p = HtmlPage(body=u"Please visit <a href='http://scrapinghub.com/'>Scrapinghub</a>") >>> iterlinks(p).next() Link(url='http://scrapinghub.com/', text=u'Scrapinghub', fragment='', nofollow=False) >>> p = HtmlPage(body=u"Go <a href='home.html'>Home</a>") >>> iterlinks(p).next() Link(url='home.html', text=u'Home', fragment='', nofollow=False) When a url is specified, absolute urls are made: >>> p.url = 'http://scrapinghub.com/' >>> iterlinks(p).next() Link(url='http://scrapinghub.com/home.html', text=u'Home', fragment='', nofollow=False) Base href attributes in the page are respected >>> p.body = u"<html><head><base href='myproject/'/></head><body>see my <a href='index.html'>project</a></body>" >>> iterlinks(p).next() Link(url='http://scrapinghub.com/myproject/index.html', text=u'project', fragment='', nofollow=False) >>> p.body = u"<html><head><base href='http://scrape.io/myproject/'/></head><body>see my <a href='index.html'>project</a></body>" >>> iterlinks(p).next() Link(url='http://scrape.io/myproject/index.html', text=u'project', fragment='', nofollow=False) Frameset and iframe urls are extracted >>> p = HtmlPage(body=u"<html><frameset><frame src=frame1.html><frame src=frame2.html></frameset><iframe src='iframe.html'/></html>") >>> [l.url for l in iterlinks(p)] ['frame1.html', 'frame2.html', 'iframe.html'] As are meta refresh tags: >>> p = HtmlPage(body=u"<html><head><meta http-equiv='refresh' content='5;url=http://example.com/' />") >>> iterlinks(p).next().url 'http://example.com/' nofollow is set to True if the link has a rel='nofollow' attribute: >>> p = HtmlPage(body=u"<a href='somewhere.html' rel='nofollow'>somewhere</a>") >>> list(iterlinks(p)) [Link(url='somewhere.html', text=u'somewhere', fragment='', nofollow=True)] It does not require well formed HTML and behaves similar to many browsers >>> p = HtmlPage(body=u"<a href='foo'>foo <a href=bar>bar</a><a href='baz'/>baz") >>> list(iterlinks(p)) [Link(url='foo', text=u'foo ', fragment='', nofollow=False), Link(url='bar', text=u'bar', fragment='', nofollow=False), Link(url='baz', text=u'baz', fragment='', nofollow=False)] Leading and trailing whitespace should be removed, including in base href >>> p = HtmlPage(body=u"<head><base href=' foo/ '/></head><a href='bar '/>baz") >>> list(iterlinks(p)) [Link(url='foo/bar', text=u'baz', fragment='', nofollow=False)] Test standard onclick links >>> p = HtmlPage(url="http://www.example.com", body=u"<html><td onclick=window.open('page.html?productid=23','win2') >") >>> list(iterlinks(p)) [Link(url='http://www.example.com/page.html?productid=23', text=None, fragment='', nofollow=False)] >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#' onclick=window.open('page.html?productid=24','win2') >") >>> list(iterlinks(p)) [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)] >>> p = HtmlPage(body=u"<html><div onclick=window.location.href='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm'>") >>> list(iterlinks(p)) [Link(url='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm', text=None, fragment='', nofollow=False)] Onclick with no href >>> p = HtmlPage("http://www.example.com", body=u"<html><a onclick=window.open('page.html?productid=24','win2') >") >>> list(iterlinks(p)) [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)] Dont generate link when target is an anchor >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#section1' >") >>> list(iterlinks(p)) [] Extract links from <link> tags in page header >>> p = HtmlPage("http://example.blogspot.com/", body=u"<html><head><link rel='me' href='http://www.blogger.com/profile/987372' /></head><body>This is my body!</body></html>") >>> list(iterlinks(p)) [Link(url='http://www.blogger.com/profile/987372', text=None, fragment='', nofollow=False)] """ base_href = replace_entities(htmlpage.url, encoding=htmlpage.encoding) def mklink(url, anchortext=None, nofollow=False): url = url.strip() fullurl = urljoin(base_href, replace_entities(url, encoding=htmlpage.encoding)) return Link(fullurl.encode(htmlpage.encoding), text=anchortext, nofollow=nofollow) # iter to quickly scan only tags tag_iter = (t for t in htmlpage.parsed_body if isinstance(t, HtmlTag)) # parse body astart = ahref = None nofollow = False for nexttag in tag_iter: tagname = nexttag.tag attributes = nexttag.attributes if tagname == 'a' and (nexttag.tag_type == HtmlTagType.CLOSE_TAG or attributes.get('href') \ and not attributes.get('href', '').startswith('#')): if astart: yield mklink(ahref, htmlpage.body[astart:nexttag.start], nofollow) astart = ahref = None nofollow = False href = attributes.get('href') if href: ahref = href astart = nexttag.end nofollow = attributes.get('rel') == u'nofollow' elif tagname == 'head': # scan ahead until end of head section for nexttag in tag_iter: tagname = nexttag.tag if (tagname == 'head' and \ nexttag.tag_type == HtmlTagType.CLOSE_TAG) or \ tagname == 'body': break if tagname == 'base': href = nexttag.attributes.get('href') if href: joined_base = urljoin(htmlpage.url, href.strip(), htmlpage.encoding) base_href = replace_entities(joined_base, encoding=htmlpage.encoding) elif tagname == 'meta': attrs = nexttag.attributes if attrs.get('http-equiv') == 'refresh': m = _META_REFRESH_CONTENT_RE.search(attrs.get('content', '')) if m: target = m.group('url') if target: yield mklink(target) elif tagname == 'link': href = nexttag.attributes.get('href') if href: yield mklink(href) elif tagname == 'area': href = attributes.get('href') if href: nofollow = attributes.get('rel') == u'nofollow' yield mklink(href, attributes.get('alt', ''), nofollow) elif tagname in ('frame', 'iframe'): target = attributes.get('src') if target: yield mklink(target) elif 'onclick' in attributes: match = _ONCLICK_LINK_RE.search(attributes["onclick"] or "") if not match: continue target = match.group("url") nofollow = attributes.get('rel') == u'nofollow' yield mklink(target, nofollow=nofollow) if astart: yield mklink(ahref, htmlpage.body[astart:])