def test_extraction_using_single_values(self): """Test the extractor's behaviour among different situations""" lx = SgmlLinkExtractor(allow="sample") self.assertEqual( [link for link in lx.extract_links(self.response)], [ Link(url="http://example.com/sample1.html", text=u""), Link(url="http://example.com/sample2.html", text=u"sample 2"), Link(url="http://example.com/sample3.html", text=u"sample 3 text"), ], ) lx = SgmlLinkExtractor(allow="sample", deny="3") self.assertEqual( [link for link in lx.extract_links(self.response)], [ Link(url="http://example.com/sample1.html", text=u""), Link(url="http://example.com/sample2.html", text=u"sample 2"), ], ) lx = SgmlLinkExtractor(allow_domains="google.com") self.assertEqual( [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")] ) lx = SgmlLinkExtractor(deny_domains="example.com") self.assertEqual( [link for link in lx.extract_links(self.response)], [Link(url="http://www.google.com/something", text=u"")] )
def test_tags(self):
    html = """<html><area href="sample1.html"></area><a href="sample2.html">sample 2</a><img src="sample2.jpg"/></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)

    lx = SgmlLinkExtractor(tags=None)
    self.assertEqual(lx.extract_links(response), [])

    lx = SgmlLinkExtractor()
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    lx = SgmlLinkExtractor(tags="area")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])

    lx = SgmlLinkExtractor(tags="a")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
    ])

    lx = SgmlLinkExtractor(tags=("a", "img"), attrs=("href", "src"),
                           deny_extensions=())
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
    ])
def test_attrs(self):
    lx = SgmlLinkExtractor(attrs="href")
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    lx = SgmlLinkExtractor(attrs=("href", "src"), tags=("a", "area", "img"),
                           deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])

    lx = SgmlLinkExtractor(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])

    # the second tag deliberately uses "ref" instead of "href",
    # so only the area link should be extracted
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs=("href",))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def parse(self, response): print "IN PARSE!" # inspect_response(response,self) links=SgmlLinkExtractor( allow=('https://www.coursera.org/course/\w+'), ) print "TAMANHO:",len(links.extract_links(response)) for link in links.extract_links(response): # print link yield Request(link.url,callback=self.parse_item)
def test_deny_extensions(self):
    html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
    response = HtmlResponse("http://example.org/", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.org/page.html', text=u'asd'),
    ])
    lx = SgmlLinkExtractor(deny_extensions="jpg")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.org/page.html', text=u'asd'),
    ])
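# A minimal sketch (not from the original tests), assuming the old
# scrapy.linkextractor module that shipped alongside SgmlLinkExtractor:
# deny_extensions *replaces* the default ignored-extension list, so
# extending the defaults means passing them plus your own entries.
from scrapy.linkextractor import IGNORED_EXTENSIONS

lx = SgmlLinkExtractor(deny_extensions=IGNORED_EXTENSIONS + ['apk'])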
def parse_start_url(self, response):
    if not hasattr(response, 'encoding'):
        setattr(response, 'encoding', 'text/html;charset=UTF-8')
    target_le = SgmlLinkExtractor(
        allow=r'/cn/products/products_detail.asp\?Catalog_id=\w+')
    links = target_le.extract_links(response)
    if links:
        return [Request(url=link.url,
                        cookies=self.forged_cookie,
                        callback=self.parse_item)
                for link in links]
    else:
        general_le = SgmlLinkExtractor(allow=())
        return [Request(url=link.url, cookies=self.forged_cookie)
                for link in general_le.extract_links(response)]
def test_urls_type(self):
    '''Test that the resulting urls are regular strings and not unicode objects'''
    lx = SgmlLinkExtractor()
    self.assertTrue(all(isinstance(link.url, str)
                        for link in lx.extract_links(self.response)))
def parseL2(self, response):
    # forums - links to lists and to threads
    s2 = SgmlLinkExtractor(
        restrict_xpaths=['//table[@class="forums-list"]/tr/td/a'])
    links = s2.extract_links(response)
    for l in links:
        yield Request(l.url, callback=self.parseL3)
    self.scrapeTheadURL(response)
def parse(self, response):
    # title page
    hxs = HtmlXPathSelector(response)
    s1 = SgmlLinkExtractor(restrict_xpaths=['//a[@class="title"]'])
    links = s1.extract_links(response)
    for l in links:
        yield Request(l.url, callback=self.parseL2)
class LinkScraper:
    """A scraper to find all URLs in a page"""

    def __init__(self):
        self._link_extractor = SgmlLinkExtractor()

    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links"""
        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError:
            # page was poorly formatted, oh well
            _linkscraper_logger.error(
                'Exception encountered when link extracting page')
            return []

        # add these links to our Url item
        urls = []
        for link in links:
            url = ScrapedUrl()
            url['url'] = link.url
            url['domain'] = UrlUtility.get_domain(link.url)
            url['last_visited'] = datetime(1, 1, 1)
            if url not in urls:
                urls.append(url)
        return urls
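# A minimal usage sketch (not from the original source): wiring the
# LinkScraper above into a spider callback. MySpider and its start URL
# are hypothetical; LinkScraper.parse() returns a list of ScrapedUrl items.
class MySpider(BaseSpider):
    name = 'linkscraper_demo'
    start_urls = ['http://example.com/']

    def __init__(self, *a, **kw):
        super(MySpider, self).__init__(*a, **kw)
        self._scraper = LinkScraper()

    def parse(self, response):
        # yields a ScrapedUrl item for every link found on the page
        return self._scraper.parse(response)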
def parse_hospital_active_doctor(self, response):
    """This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLU0Jq1rbc1P6dS2aO/daifu.htm
    @returns items 14 14
    @returns requests 20 100
    @scrapes _name hospital specialty title reply2wCount
    """
    hxs = HtmlXPathSelector(response)
    city = response.meta['city']
    area = response.meta['area']
    print "$$$ current city: %s area: %s" % (city[0], area[0])

    # Sample:
    # http://www.haodf.com/hospital/DE4roiYGYZwXhYmS30yF9V0wc/DE4rO-XCoLUE-578VWVmvC3uh7/daifu.htm
    linkExtractor = SgmlLinkExtractor(allow=(r"/hospital/\S+/\S+/daifu.htm",),
                                      unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        request = Request(link.url, callback=self.parse_hospital_active_doctor)
        request.meta['city'] = response.meta['city']
        request.meta['area'] = response.meta['area']
        yield request

    hospital = hxs.select("/html/body/div[3]/div/a[3]/text()").extract()[0]
    print hospital
    specialty = hxs.select("//div[@class='subnav']/a/text()").re(r'(\S+)\s+(\S+)')[0]
    print specialty

    docLinks = hxs.select(
        "//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]")
    #docLinks = hxs.select("//table[@id='doc_list_index']/tr")
    for doc in docLinks:
        l = XPathItemLoader(ActiveDoctorItem(), doc)
        docNames = doc.select(
            "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
        ).extract()
        if len(docNames) != 0:
            print docNames[0]
            l.add_xpath(
                '_name',
                "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()")
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_value('city', response.meta['city'])
            l.add_value('area', response.meta['area'])
            title = doc.select("./td[@class='tda']/li/text()").re(r'\S+')
            if len(title) == 1:
                l.add_value('title', title[0])
            l.add_xpath('count_ReplyInTwoWeeks',
                        u"./td[@class='td_hf']/div[contains(text(), '近2周回复咨询')]/span/text()")
            l.add_xpath('count_ReplyTotal',
                        u"./td[@class='td_hf']/div[contains(text(), '总共回复')]/span/text()")
            l.add_xpath('count_Calls',
                        u"./td[@class='td_hf']/div[contains(text(), '已接听电话咨询')]/span/text()")
            ret = l.load_item()
            #print ret
            yield ret
def test_encoded_url_in_restricted_xpath(self):
    body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
    response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
    lx = SgmlLinkExtractor(restrict_xpaths="//div")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB',
             fragment='', nofollow=False),
    ])
def parse(self, response):
    metadata = response.meta['userdata']

    # handle the regular categories
    link_extractor = SgmlLinkExtractor(restrict_xpaths=('//div[@class="linksList"]//a'))
    links = link_extractor.extract_links(response)
    for link in links:
        m = copy.deepcopy(metadata)
        url = link.url
        cat_title = link.text
        cat_name = cat_title.lower()
        m['tags_mapping']['category-0'] = [{'title': cat_title, 'name': cat_name}]
        gender = cm.guess_gender(cat_name)
        if gender:
            m['gender'] = [gender]
        yield Request(url=url, callback=self.parse_cat, errback=self.onerr,
                      meta={'userdata': m})

    # handle the region-specific extras
    region = metadata['region']
    if region == 'jp':
        extra_urls = [
            'http://www.paulsmith.co.jp/shop/gifts/products',
            'http://www.paulsmith.co.jp/shop/reserve/products',
            'http://www.paulsmith.co.jp/shop/sales/products',
            'http://www.paulsmith.co.jp/shop/paulsmithcollection/products',
        ]
    else:
        extra_urls = [
            'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-her' % region,
            'http://www.paulsmith.co.uk/%s-en/shop/valentines-day-gifts/valentines-day-gifts-for-him' % region,
        ]
    for url in extra_urls:
        m = copy.deepcopy(metadata)
        yield Request(url=url, callback=self.parse_cat, errback=self.onerr,
                      meta={'userdata': m})
def parse_session_hash(self, response):
    extractor = SgmlLinkExtractor(
        allow=r'/w/valikko\.jsp', tags='frame', attrs=('src',))
    link = extractor.extract_links(response)[0]
    query = urlparse.urlparse(link.url).query
    params = urlparse.parse_qs(query)
    return params['MD5avain'][0]
def parseThread(self, response):
    print('inside a thread')
    hxs = HtmlXPathSelector(response)
    filename = "xxx" + response.url.split("/")[-2][1:]
    with open(filename, 'a') as f:
        for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
            msgID = entry.select('span/@id').extract()[0]
            msgDate = entry.select('h4/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            msgText = ' '.join(entry.select('span/text()').extract()).encode('ascii', 'ignore').replace('\n', '')
            try:
                mgAuthor = entry.select('h3/span/a/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except IndexError:
                mgAuthor = 'none'
            try:
                msgTitle = entry.select('h3/strong/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except IndexError:
                msgTitle = 'none'
            f.write('msgID:' + msgID + '\n')
            f.write('msgTitle:' + msgTitle + '\n')
            f.write('mgAuthor:' + mgAuthor + '\n')
            f.write('msgDate:' + msgDate + '\n')
            f.write('msgText:' + msgText + '\n\n')

    s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
    links = s.extract_links(response)
    if len(links) > 0:
        print 'going to the next page'
        yield Request(links[0].url, callback=self.parseThread)
class FollowAllSpider(BaseSpider):

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        # strip a leading "www." prefix; note that str.lstrip('www.')
        # would strip the *characters* w, ., m, not the prefix
        hostname = urlparse(url).hostname
        if hostname.startswith('www.'):
            hostname = hostname[len('www.'):]
        self.allowed_domains = [hostname]
        self.link_extractor = SgmlLinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        self.log("I am at: " + response.url)
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url, size=str(len(response.body)),
                    referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = HtmlXPathSelector(response).select("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0]
                       for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
def test_base_url_with_restrict_xpaths(self):
    html = """<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
    <body><p><a href="item/12.html">Item 12</a></p>
    </body></html>"""
    response = HtmlResponse("http://example.org/somepage/index.html", body=html)
    lx = SgmlLinkExtractor(restrict_xpaths="//p")
    self.assertEqual(lx.extract_links(response),
                     [Link(url='http://otherdomain.com/base/item/12.html',
                           text='Item 12')])
def test_restrict_xpaths_concat_in_handle_data(self):
    """html entities cause SGMLParser to call handle_data hook twice"""
    body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
    response = HtmlResponse("http://example.org", body=body, encoding='gb18030')
    lx = SgmlLinkExtractor(restrict_xpaths="//div")
    self.assertEqual(lx.extract_links(response),
                     [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                           fragment='', nofollow=False)])
def parse_cat(self, response):
    metadata = response.meta['userdata']

    # handle the products on this page first
    link_extractor = SgmlLinkExtractor(
        restrict_xpaths=('//div[@class="category-products"]//h2//a'))
    links = link_extractor.extract_links(response)
    for link in links:
        m = copy.deepcopy(metadata)
        yield Request(url=link.url, callback=self.parse_details,
                      errback=self.onerr, meta={'userdata': m})

    # then handle pagination
    link_extractor = SgmlLinkExtractor(restrict_xpaths=('//li[@class="next"]//a'))
    links = link_extractor.extract_links(response)
    if links:
        next_page_url = links[0].url
        m = copy.deepcopy(metadata)
        yield Request(url=next_page_url, callback=self.parse_cat,
                      errback=self.onerr, meta={'userdata': m})
def parse(self, response):
    link_extractor = SgmlLinkExtractor(restrict_xpaths=('//div[@id="banners"]'))
    links = link_extractor.extract_links(response)
    metadata = response.meta['userdata']
    for link in links:
        m = copy.deepcopy(metadata)
        yield Request(url=link.url, callback=self.parse_cat,
                      errback=self.onerr, meta={'userdata': m})
def extractLinks(self, response, **extra):
    """Extract links; `extra` is passed through to SgmlLinkExtractor"""
    link_extractor = SgmlLinkExtractor(**extra)
    links = link_extractor.extract_links(response)
    log.msg('links extracted from %s: %s' % (response.url, len(links)),
            level=log.DEBUG)
    return links
def parseL3(self, response):
    # like model specific
    self.scrapeTheadURL(response)
    # multipage
    s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
    links = s.extract_links(response)
    if len(links) > 0:
        yield Request(links[0].url, callback=self.parseL3)
def test_restrict_xpaths(self):
    lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]',))
    self.assertEqual(
        [link for link in lx.extract_links(self.response)],
        [
            Link(url="http://example.com/sample1.html", text=u""),
            Link(url="http://example.com/sample2.html", text=u"sample 2"),
        ],
    )
def extract_links(self, response, **extra):  # {{{
    """Extract links from response

    extra - passed to SgmlLinkExtractor
    """
    link_extractor = SgmlLinkExtractor(**extra)
    links = link_extractor.extract_links(response)
    return links
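# A minimal usage sketch (not from the original source): the **extra
# kwargs of the wrapper above are forwarded verbatim to SgmlLinkExtractor,
# so callers can pass any of its filtering options; the parse_product
# callback and the allow/restrict_xpaths values are hypothetical.
def parse(self, response):
    for link in self.extract_links(response,
                                   allow=(r'/products/\d+\.html',),
                                   restrict_xpaths=('//div[@id="content"]',),
                                   unique=True):
        yield Request(link.url, callback=self.parse_product)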
class BaseSiteSpider(CrawlSpider):

    def __init__(self, **kw):
        super(BaseSiteSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain')
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.process = kw.get('process')
        self.deny = [re.compile(x) for x in kw.get('deny', [])]
        # strip a leading "www." prefix; note that str.lstrip('www.')
        # would strip the *characters* w, ., m, not the prefix
        hostname = urlparse(url).hostname
        if hostname.startswith('www.'):
            hostname = hostname[len('www.'):]
        self.allowed_domains = [hostname]
        self.link_extractor = SgmlLinkExtractor()
        # must exist because _set_new_cookies() uses it
        self.cookies_seen = set()

    def clean_up(self):
        pass

    def setup(self):
        pass

    def start_requests(self):
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _should_follow(self, url):
        for pattern in self.deny:
            if pattern.search(url) is not None:
                return False
        return True

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0]
                       for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = (x for x in self.link_extractor.extract_links(response)
                     if self._should_follow(x.url))
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r
def test_link_nofollow(self):
    html = """
    <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
    <a href="about.html">About us</a>
    """
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual([link for link in lx.extract_links(response)], [
        Link(url='http://example.org/page.html?action=print',
             text=u'Printer-friendly page', nofollow=True),
        Link(url='http://example.org/about.html', text=u'About us',
             nofollow=False),
    ])
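# A minimal sketch (not from the original tests) building on the test
# above: Link objects carry the nofollow flag, so a spider can skip
# rel="nofollow" links before scheduling requests; using self.parse as
# the callback is an assumption.
def parse(self, response):
    lx = SgmlLinkExtractor()
    for link in lx.extract_links(response):
        if not link.nofollow:
            yield Request(link.url, callback=self.parse)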
def parse_type(self, response):
    link_extractor = SgmlLinkExtractor(
        restrict_xpaths=('//div[@class="inner-nav-content"]//a'))
    links = link_extractor.extract_links(response)
    if links:
        results = self.parse_cat(response)
    else:
        results = self.parse_list(response)
    for result in results:
        yield result
def parse(self, response):
    # changed to parse() so the whole home page gets crawled
    lx = SgmlLinkExtractor()
    urls = lx.extract_links(response)
    noworder = 0
    for oneurl in urls:
        noworder += 1
        # bind the current loop values via default arguments so each
        # callback sees its own order value
        yield scrapy.Request(
            oneurl.url,
            callback=lambda response, crawllevel=1, order=noworder, loopstr='':
                self.parse_text(response, crawllevel, order, loopstr))
def parse(self, response):
    metadata = response.meta['userdata']
    m = metadata
    link_extractor = SgmlLinkExtractor(
        restrict_xpaths=('//div[@class="shared_header"]//li/a'))
    links = link_extractor.extract_links(response)
    enter_url = links[0].url
    yield Request(url=enter_url, callback=self.parse_type,
                  errback=self.onerr, meta={'userdata': m})
def parse(self, response): """ This function parses a sample response. Some contracts are mingled with this docstring. @url http://www.chunyuyisheng.com/clinics/1/doctors @returns items 0 0 @returns requests 500 100000 """ hxs = HtmlXPathSelector(response) listlinkExtractor = SgmlLinkExtractor(allow=(r"/clinics/\d+/doctors(|\?page=\d+)",), unique=True) list_links = listlinkExtractor.extract_links(response) for link in list_links: yield Request(link.url, callback=self.parse) docdetail_linkExtractor = SgmlLinkExtractor(allow=(r"/doctor/clinic_web_\w+$",), unique=True) docdetail_links = docdetail_linkExtractor.extract_links(response) for link in docdetail_links: yield Request(link.url, callback=self.parse_doctor_detail)
def parse_list(self, response):
    link_extractor = SgmlLinkExtractor(
        restrict_xpaths=('//div[@class="product_grid"]//a'))
    links = link_extractor.extract_links(response)
    metadata = response.meta['userdata']
    for link in links:
        m = copy.deepcopy(metadata)
        yield Request(url=link.url, callback=self.parse_details,
                      errback=self.onerr, meta={'userdata': m})
def parse_faculty_detail(self, response):
    """This function parses a sample response. Some contracts are mingled
    with this docstring.

    @url http://www.haodf.com/faculty/DE4rO-XCoLU0Jq1rbc1P6dS2aO.htm
    @returns items 21 21
    @returns requests 3 3
    @scrapes _name specialty title shortDesc
    """
    hxs = HtmlXPathSelector(response)
    linkExtractor = SgmlLinkExtractor(
        allow=(r"/faculty/\S+/menzhen.htm\?orderby",), unique=True)
    links = linkExtractor.extract_links(response)
    for link in links:
        yield Request(link.url, callback=self.parse_faculty_detail)

    specialty = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[3]/text()").extract()
    hospital = hxs.select(
        "/html/body/div[3]/div/div[2]/div/a[2]/text()").extract()

    docLinks = hxs.select(
        "//table[@id='doc_list_index']/tr[descendant::td[contains(@class, 'tda')]]")
    #docLinks = hxs.select("//table[@id='doc_list_index']/tr")
    for doc in docLinks:
        l = XPathItemLoader(DoctorItem(), doc)
        docNames = doc.select(
            "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()"
        ).extract()
        if len(docNames) != 0:
            print docNames[0]
            l.add_xpath(
                '_name',
                "./td[@class='tda']/li/a[contains(@href, 'http://www.haodf.com/doctor/')]/text()")
            l.add_value('specialty', specialty)
            l.add_value('hospital', hospital)
            l.add_xpath('title', "./td[@class='tda']/li/p[1]/text()")
            l.add_xpath('acadamicDegree', "./td[@class='tda']/li/p[2]/text()")
            l.add_xpath('shortDesc', "./td[@class='tdb']/text()")
            # clinic time: todo
            ret = l.load_item()
            #print ret
            yield ret
def parse(self, response):
    if not self.book:
        log.msg("novel not found in the database; creating it")
        log.msg("fetching the novel's title and author")
        hxs = Selector(response)
        join = Join("")
        _book = hxs.xpath(self.xpath_book).extract()
        if self.author:
            _author = hxs.xpath(self.author).extract()
            author = join(_author)
        else:
            author = ""
        book = join(_book) or None
        log.msg("inserting the novel into the database")
        novel = Novel(name=book,
                      author=author,
                      spider_class=self.config,
                      start_url=self.url,
                      interval=10,
                      alias=slugify(book))
        if self.category:
            category = Category.objects.get(self.category)
            novel.category = category
        else:
            category = Category.objects.all()
            novel.category = category[0]
        novel.save()
        log.msg("novel inserted successfully")
        self.book = novel

    if self.chapter_list:
        log.msg(u"start fetching the chapter list")
        _sgml = SgmlLinkExtractor(restrict_xpaths=self.chapter_list)
        links = _sgml.extract_links(response)
        links = [Link(url=l.url, text=l.text, regex=self.regex) for l in links]
        log.msg(u"chapter list fetched successfully")
        if self.regex:
            log.msg("sorting the chapter list")
            links = sorted(links, key=get_cid)
            log.msg("chapter list sorted")
        for n, link in enumerate(links, start=1):
            _q = Collection.objects.filter(
                url_hash=hashlib.sha1(link.url).hexdigest())
            if not _q:
                yield Request(url=link.url, callback=self._parse,
                              meta=dict(link=link, num=n))
    else:
        log.msg(u"no chapter-list XPATH configured; please fix the config file")
        return
def parse_testfile(self, response):
    lx = SgmlLinkExtractor()
    urls = lx.extract_links(response)
    readed = 0
    notreaded = 0
    for oneurl in urls:
        handle = OpenMD5File(oneurl.url, 'rb')
        if handle is False:
            notreaded += 1
        else:
            readed += 1
            handle.close()
    print readed, notreaded
def parse(self, response):
    print('inside a thread')
    hxs = HtmlXPathSelector(response)
    filename_ = response.url.split("/")[-2][1:]
    # use os.path.join instead of the fragile "\data\..." string
    # concatenation (backslash-escape prone, Windows-only)
    filename = os.path.abspath(os.path.join(databasePath, 'data', filename_))
    dumpFilePath = os.path.abspath(os.path.join(databasePath, 'dump', filename_))
    try:
        a = response.meta['page']
    except KeyError:
        a = 0
        os.mkdir(dumpFilePath)

    with open(filename, 'a') as f:
        # header
        forumTitle = hxs.select('//div[@class="module forums"]/h2/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
        extraInfo = hxs.select('//div[@class="module forums discussion tid"]/h4/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
        f.write("title:" + forumTitle + "\n")
        f.write("extraInfo:" + extraInfo + "\n")
        f.write(response.url + "\n")
        f.write(filename + "\n")
        f.write(dumpFilePath + "\n\n")

    with open(os.path.join(dumpFilePath, str(a) + '.html'), 'a') as fd:
        fd.write(response.body)

    with open(filename, 'a') as f:
        for entry in hxs.select('//div[contains(@class,"forums-thread")]'):
            msgID = entry.select('span/@id').extract()[0]
            msgDate = entry.select('h4/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            msgText = ' '.join(entry.select('span/text()').extract()).encode('ascii', 'ignore').replace('\n', '')
            try:
                mgAuthor = entry.select('h3/span/a/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except IndexError:
                mgAuthor = 'none'
            try:
                msgTitle = entry.select('h3/strong/text()').extract()[0].encode('ascii', 'ignore').replace('\n', '')
            except IndexError:
                msgTitle = 'none'
            f.write('msgID:' + msgID + '\n')
            f.write('msgTitle:' + msgTitle + '\n')
            f.write('mgAuthor:' + mgAuthor + '\n')
            f.write('msgDate:' + msgDate + '\n')
            f.write('msgText:' + msgText + '\n\n')

    s = SgmlLinkExtractor(restrict_xpaths=['//li[contains(@class, "next")]'])
    links = s.extract_links(response)
    if len(links) > 0:
        print 'going to the next page'
        r = Request(googc + links[0].url, callback=self.parse)
        r.meta['page'] = a + 1
        yield r
class StockHeXun(Spider):

    name = "hexun"
    allowed_domains = ["stock.hexun.com"]
    child_link = re.compile(ur'http://stock.hexun.com/\d+-\d+-\d+/\d+.html')
    start_urls = ["http://stock.hexun.com/"]

    def __init__(self, *arg, **kw):
        super(StockHeXun, self).__init__(*arg, **kw)
        self.link_extract = SgmlLinkExtractor()

    def parse(self, response):
        if response.url in self.start_urls:
            links = self.link_extract.extract_links(response)
            for x in links:
                match = self.child_link.match(x.url)
                if match:
                    if x.url not in BLOOM_FILTER:
                        BLOOM_FILTER.add(x.url)
                        yield Request(x.url, callback=self.parse)
        else:
            item = WenkrItem()
            title = response.xpath('//div[@id="artibodyTitle"]/h1/text()').extract()
            if len(title) == 0:
                title = response.xpath('//head/title//text()').extract()[0]
            else:
                title = title[0]
            item['title'] = title
            item['content'] = self.make_content(
                response.xpath('//div[@id="artibody"]//*//text()').extract())
            item['category'] = '股票'
            author = response.xpath('//span[@id="author_baidu"]/font/text()').extract()
            if len(author) > 0:
                item['author'] = author[0]
            else:
                item['author'] = u'和讯'
            item['tags'] = response.xpath('//meta[@name="keywords"]/@content').extract()[0]
            if len(item['content']) > 80:
                yield item

    def make_content(self, ps):
        content = []
        for p in ps:
            if len(p) > 5:
                content.append(' %s' % p)
        return '\r\n'.join(content)
def parse_leagues(self, response):
    sx = SgmlLinkExtractor(allow=[
        r'http://www.sportingbet.com/sports-football/'
        r'[A-Za-z0-9-]+/1-102-\d+.html'
    ])
    league_links = sx.extract_links(response)

    # remove unwanted links; linkFilter returns True for links to filter out
    league_links = [link for link in league_links
                    if not linkFilter(self.name, link.url)]

    # extract eventClassId from link.url with a regex
    eventClassIdList = []
    for link in league_links:
        matches = re.findall(
            r'http://www.sportingbet.com/sports-football/'
            r'[A-Za-z0-9-]+/1-102-(\d+?).html', link.url)
        if matches:
            eventClassIdList.append(matches[0])

    base_url = 'http://www.sportingbet.com/services/CouponTemplate.mvc/GetCoupon'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://www.sportingbet.com/sports-football/0-102-410.html',
        'X-Requested-With': 'XMLHttpRequest',
        'Host': 'www.sportingbet.com',
    }
    # cookies = response.meta['cookies']
    for id in eventClassIdList:
        # build the GET query string
        GETstr = '?couponAction=EVENTCLASSCOUPON&'
        GETstr += 'sportIds=102&'
        GETstr += 'marketTypeId=&'
        GETstr += 'eventId=&'
        GETstr += 'bookId=&'
        GETstr += 'eventClassId=' + str(id) + '&'
        GETstr += 'sportId=102&'
        GETstr += 'eventTimeGroup=ETG_NextFewHours_0_0'
        # make the request
        yield Request(url=base_url + GETstr, headers=headers,
                      meta={'eventClassId': str(id)},
                      callback=self.pre_parse_Data)
def parse_brands(self, response):
    lx = SgmlLinkExtractor(restrict_xpaths=('//td[@valign="top"]'),
                           allow=(r'\S+\.com'), unique=True)
    links = lx.extract_links(response)
    brands_all = set(sorted(link.text for link in links))
    self.log(u'Extracted {} brands.'.format(len(brands_all)), scrapy.log.DEBUG)

    # traverse through all the pages to get all products
    #brands_alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
    #                    'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
    #                    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    brands_alphabets = ['A']
    for alpha in brands_alphabets:
        yield Request(self.url_view_items + str(alpha), callback=self.items_list)
def test_process_value(self):
    """Test the process_value argument"""
    html = """
    <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
    <a href="/about.html">About us</a>
    """
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=html, encoding="windows-1252")

    def process_value(value):
        m = re.search(r"javascript:goToPage\('(.*?)'", value)
        if m:
            return m.group(1)

    lx = SgmlLinkExtractor(process_value=process_value)
    self.assertEqual(lx.extract_links(response),
                     [Link(url="http://example.org/other/page.html",
                           text="Link text")])
def parse_image_list_page(self, response):
    hxs = HtmlXPathSelector(response)
    selector = SgmlLinkExtractor(
        allow=(r'photos/\d+',),
        restrict_xpaths="//div[@class='gallery-list-wrapper page-block']",
        unique=True)
    next_page_link = SgmlLinkExtractor(
        allow=(r'shop/\d+/photos(\?pg=\d+)*',),
        restrict_xpaths="//a[@class='NextPage']",
        unique=True)

    # prepare cookies
    cookies = {}
    if 'Set-Cookie' in response.headers:
        for eq in response.headers['Set-Cookie'].split(';'):
            # split only on the first '=' so values containing '=' survive
            k, v = eq.strip().split('=', 1)
            cookies[k] = v

    requests = []
    # follow the next page
    for link in next_page_link.extract_links(response):
        req = Request(link.url, cookies=cookies,
                      callback=self.parse_image_list_page)
        requests.append(req)
    # follow each image link
    for link in selector.extract_links(response):
        req = Request(link.url, cookies=cookies, callback=self.extract_image)
        requests.append(req)

    for req in requests:
        yield req