def parse(self, response):
    """Visit every brand link on the start page and schedule each brand page.

    For each anchor matched by the brand XPath, the shared browser driver is
    sent back to ``self.start_urls[0]``, helper JS libraries are injected,
    and the anchor is clicked through JavaScript. The URL the browser lands
    on is stored as the brand's ``url``.

    Yields:
        Each ``Brand`` item as soon as its URL is known, then, once all
        brands are processed, one ``Request`` per collected brand with
        ``self.parse_separate_brand`` as callback.

    Raises:
        AssertionError: if no brand links are found in ``response``.
        SystemExit: if the browser ends up on an anti-spider page or never
            leaves the start page after a click attempt.
    """
    items = []
    sel = Selector(response)
    brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
    brands_links = sel.xpath(brands_xpath)
    # Raised explicitly (same exception type as the former ``assert``) so the
    # check is not stripped when Python runs with ``-O``.
    if not brands_links:
        raise AssertionError(
            'cant find brands on this page: {}'.format(response.url))

    # Normalise the start URL to text once; it may be a byte string
    # (Python 2 ``str``) or already text, and only bytes need decoding.
    start_url = self.start_urls[0]
    if isinstance(start_url, bytes):
        start_url = start_url.decode('utf-8')

    # ``count`` is the 1-based position of the brand link. It must advance
    # with the loop, otherwise every iteration clicks the first link.
    for count, brand in enumerate(brands_links, 1):
        # easton: avoid taobao's anti spider
        sleep(5)
        driver = JSMiddleware.get_driver()
        driver.get(self.start_urls[0])
        item = Brand()
        item['name'] = brand.xpath('@title').extract()[0]
        try:
            # chrome and PhantomJS can't click an invisible link through the
            # WebDriver element API, but JavaScript can, so inject the helper
            # libraries and click from script.
            script = "var s=window.document.createElement('script');\
            s.src='{}';\
            window.document.head.appendChild(s); \
            var s=window.document.createElement('script');\
            s.src='{}';\
            window.document.head.appendChild(s);".format(
                inject_jsfile_path1, inject_jsfile_path2)
            driver.execute_script(script)
            sleep(1)  # wait browser loads js libs
            # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()
            # The JS result array is 0-based, hence ``count - 1``.
            script = '$(document).xpath("' + brands_xpath + '")[' + str(
                count - 1) + '].click()'
            driver.execute_script(script)
        except WebDriverException:
            # if can't click
            logger.error(traceback.format_exc())
            logger.error(' '.join([
                'click failed.', 'count:',
                str(count), item['name'],
                driver.current_url, '\n'
            ]))
        else:
            item['url'] = driver.current_url
            items.append(item)
            yield item
        finally:
            # if jump to login page, or pop the login window
            current_url = driver.current_url
            if 'anti_Spider' in current_url or \
                    current_url.startswith(start_url):
                logger.critical(
                    'cant go ahead crawling, please manually pass the verification'
                )
                sys.exit(0)

    for i in items:
        yield Request(i['url'], callback=self.parse_separate_brand)
def parse(self, response):
    """Visit every brand link on the start page and schedule each brand page.

    For each anchor matched by the brand XPath, the shared browser driver is
    sent back to ``self.start_urls[0]``, helper JS libraries are injected,
    and the anchor is clicked through JavaScript. The URL the browser lands
    on is stored as the brand's ``url``.

    Yields:
        Each ``Brand`` item as soon as its URL is known, then, once all
        brands are processed, one ``Request`` per collected brand with
        ``self.parse_separate_brand`` as callback.

    Raises:
        AssertionError: if no brand links are found in ``response``.
        SystemExit: if the browser ends up on an anti-spider page or never
            leaves the start page after a click attempt.
    """
    items = []
    sel = Selector(response)
    brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
    brands_links = sel.xpath(brands_xpath)
    # Raised explicitly (same exception type as the former ``assert``) so the
    # check is not stripped when Python runs with ``-O``.
    if not brands_links:
        raise AssertionError(
            'cant find brands on this page: {}'.format(response.url))

    # Normalise the start URL to text once; it may be a byte string
    # (Python 2 ``str``) or already text, and only bytes need decoding.
    start_url = self.start_urls[0]
    if isinstance(start_url, bytes):
        start_url = start_url.decode('utf-8')

    # ``count`` is the 1-based position of the brand link. It must advance
    # with the loop, otherwise every iteration clicks the first link.
    for count, brand in enumerate(brands_links, 1):
        # easton: avoid taobao's anti spider
        sleep(5)
        driver = JSMiddleware.get_driver()
        driver.get(self.start_urls[0])
        item = Brand()
        item['name'] = brand.xpath('@title').extract()[0]
        try:
            # chrome and PhantomJS can't click an invisible link through the
            # WebDriver element API, but JavaScript can, so inject the helper
            # libraries and click from script.
            script = "var s=window.document.createElement('script');\
            s.src='{}';\
            window.document.head.appendChild(s); \
            var s=window.document.createElement('script');\
            s.src='{}';\
            window.document.head.appendChild(s);".format(inject_jsfile_path1, inject_jsfile_path2)
            driver.execute_script(script)
            sleep(1)  # wait browser loads js libs
            # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()
            # The JS result array is 0-based, hence ``count - 1``.
            script = '$(document).xpath("' + brands_xpath + '")[' + str(count - 1) + '].click()'
            driver.execute_script(script)
        except WebDriverException:
            # if can't click
            logger.error(traceback.format_exc())
            logger.error(' '.join(['click failed.', 'count:', str(count), item['name'], driver.current_url, '\n']))
        else:
            item['url'] = driver.current_url
            items.append(item)
            yield item
        finally:
            # if jump to login page, or pop the login window
            current_url = driver.current_url
            if 'anti_Spider' in current_url or \
                    current_url.startswith(start_url):
                logger.critical('cant go ahead crawling, please manually pass the verification')
                sys.exit(0)

    for i in items:
        yield Request(i['url'], callback=self.parse_separate_brand)
def process_request(self, request, spider):
    """Fetch ``request.url`` with the browser driver instead of the downloader.

    Returns an ``HtmlResponse`` built from the driver's page source and
    final URL, or whatever ``self._retry`` returns when the driver lands on
    a URL containing ``anti_Spider``.
    """
    # TODO: easton: currently use PhantomJS to render all requests; a
    # conditional trigger such as ``request.meta.get('js')`` may be wanted.
    if DEBUG:
        # easton: only chrome needs open the page before real one, and must
        # in the same domain
        self.get_driver().get('https://www.taobao.com/about/')
    load_cookies(self.get_driver())
    self.get_driver().get(request.url)
    # easton: phantomjs stored other domain's cookie, serious -- so cookies
    # are deliberately not stored back here.

    hit_anti_spider = 'anti_Spider' in self.get_driver().current_url
    if not hit_anti_spider:
        page_html = self.get_driver().page_source
        return HtmlResponse(
            self.get_driver().current_url,
            body=page_html,
            encoding='utf-8',
            request=request,
        )

    logger.error('anti_Spider got you, gonna retry')
    sleep(1)
    return self._retry(request, 'anti_Spider', spider)