def parse(self, response):
    """Yield a ``Brand`` item per brand link, then a ``Request`` per item.

    For every ``<a>`` under ``div#J_NavCommonRowItems_0`` in ``response``,
    the driver returned by ``JSMiddleware.get_driver()`` reloads
    ``self.start_urls[0]``, two helper scripts are injected into the page
    and the link is clicked through JavaScript. The URL the browser shows
    afterwards is stored as the item's ``url``.

    :param response: response whose markup contains the brand links
    :return: generator yielding ``Brand`` items as they are resolved and,
        after the loop, one ``Request`` per resolved item with
        ``self.parse_separate_brand`` as callback
    :raises AssertionError: when no brand link is found in ``response``
    :raises SystemExit: through ``sys.exit(0)`` when, after a click attempt,
        the browser URL contains ``anti_Spider`` or still starts with the
        start URL
    """
    items = []
    sel = Selector(response)
    brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
    brands_links = sel.xpath(brands_xpath)
    assert len(brands_links) > 0, \
        'cant find brands on this page: {}'.format(response.url)

    start_url = self.start_urls[0]
    # The browser URL is compared as text, so decode only when the start
    # URL is a byte string (calling .decode() on text fails otherwise).
    start_url_text = start_url
    if isinstance(start_url_text, bytes):
        start_url_text = start_url_text.decode('utf-8')

    # Loop-invariant: appends two <script> tags that load the helper libs
    # the click script below relies on.
    inject_script = (
        "var s=window.document.createElement('script');"
        "s.src='{}';"
        "window.document.head.appendChild(s);"
        "var s=window.document.createElement('script');"
        "s.src='{}';"
        "window.document.head.appendChild(s);"
    ).format(inject_jsfile_path1, inject_jsfile_path2)

    # ``count`` is the 1-based position of the brand link. It previously
    # never advanced, so every iteration clicked the first link.
    for count, brand in enumerate(brands_links, 1):
        # easton: avoid taobao's anti spider
        sleep(5)
        driver = JSMiddleware.get_driver()
        driver.get(start_url)
        item = Brand()
        item['name'] = brand.xpath('@title').extract()[0]
        try:
            # chrome and PhantomJS can't click an invisible link, but a
            # script running inside the page can
            driver.execute_script(inject_script)
            sleep(1)  # wait until the browser has loaded the js libs
            # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()`
            click_script = '$(document).xpath("{}")[{}].click()'.format(
                brands_xpath, count - 1)
            driver.execute_script(click_script)
        except WebDriverException:
            # the click failed: log it and continue with the next brand
            logger.error(traceback.format_exc())
            logger.error(' '.join([
                'click failed.', 'count:', str(count), item['name'],
                driver.current_url, '\n'
            ]))
        else:
            item['url'] = driver.current_url
            items.append(item)
            yield item
        finally:
            # if we jumped to the login page, or the login window popped up
            current_url = driver.current_url
            if ('anti_Spider' in current_url
                    or current_url.startswith(start_url_text)):
                logger.critical(
                    'cant go ahead crawling, please manually pass the verification'
                )
                sys.exit(0)

    for i in items:
        yield Request(i['url'], callback=self.parse_separate_brand)
def parse(self, response):
    """Resolve each brand link to its landing URL and schedule its crawl.

    Each ``<a>`` below ``div#J_NavCommonRowItems_0`` of ``response`` is
    handled in turn: the driver from ``JSMiddleware.get_driver()`` loads
    ``self.start_urls[0]``, two helper scripts are injected, and the link
    at the current position is clicked via JavaScript. The browser's URL
    after the click becomes the ``url`` field of the yielded ``Brand``.

    :param response: response containing the brand links
    :return: generator of ``Brand`` items followed by one ``Request`` per
        resolved item (callback ``self.parse_separate_brand``)
    :raises AssertionError: if ``response`` contains no brand link
    :raises SystemExit: via ``sys.exit(0)`` if the browser URL contains
        ``anti_Spider`` or still starts with the start URL after a click
        attempt
    """
    items = []
    sel = Selector(response)
    brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
    brands_links = sel.xpath(brands_xpath)
    assert len(brands_links) > 0, \
        'cant find brands on this page: {}'.format(response.url)

    # Text form of the start URL for comparison with the browser URL; only
    # byte strings are decoded, because text objects may not support
    # .decode().
    start_url_text = self.start_urls[0]
    if isinstance(start_url_text, bytes):
        start_url_text = start_url_text.decode('utf-8')

    # Built once: the snippet does not change between brands.
    load_libs_script = (
        "var s=window.document.createElement('script');"
        "s.src='{}';"
        "window.document.head.appendChild(s);"
        "var s=window.document.createElement('script');"
        "s.src='{}';"
        "window.document.head.appendChild(s);"
    ).format(inject_jsfile_path1, inject_jsfile_path2)

    # enumerate() advances the 1-based ``count``; the counter used to stay
    # at 1, which made every iteration click the first brand.
    for count, brand in enumerate(brands_links, 1):
        # easton: avoid taobao's anti spider
        sleep(5)
        driver = JSMiddleware.get_driver()
        driver.get(self.start_urls[0])
        item = Brand()
        item['name'] = brand.xpath('@title').extract()[0]
        try:
            # chrome and PhantomJS can't click an invisible link, but js
            # executed in the page can
            driver.execute_script(load_libs_script)
            sleep(1)  # wait for the browser to load the js libs
            # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()`
            click_script = (
                '$(document).xpath("' + brands_xpath + '")['
                + str(count - 1) + '].click()'
            )
            driver.execute_script(click_script)
        except WebDriverException:
            # can't click: report and move on to the next brand
            logger.error(traceback.format_exc())
            # get_driver must be *called* here; reading ``current_url`` off
            # the function object raised AttributeError inside the handler
            logger.error(' '.join([
                'click failed.', 'count:', str(count), item['name'],
                driver.current_url, '\n'
            ]))
        else:
            item['url'] = driver.current_url
            items.append(item)
            yield item
        finally:
            # jumped to the login page, or the login window popped up
            landed_on = driver.current_url
            blocked = ('anti_Spider' in landed_on
                       or landed_on.startswith(start_url_text))
            if blocked:
                logger.critical('cant go ahead crawling, please manually pass the verification')
                sys.exit(0)

    for i in items:
        yield Request(i['url'], callback=self.parse_separate_brand)
def parse_js_obj_g_page_config(self, response):
    """
    Extract and parse the ``g_page_config`` JavaScript object from a page.

    The first body line mentioning ``g_page_config`` is located, the text
    between its first ``{`` and last ``}`` is re-wrapped in braces and
    parsed with ``ijson``. An empty object is parsed when no line matches.

    :return: the parsed js obj g_page_config
    :rtype: dict
    :raises CloseSpider: when either anti-spider check reports a hit
    """
    if self.detect_anti_spider_from_response(response):
        logger.critical(anti_spider_breakpoit_msg)
        # sys.exit() would not quit here, it is just an exception
        raise CloseSpider(anti_spider_breakpoit_msg)

    # the variable name comes from the taobao search result page
    # (https://s.taobao.com/search)
    config_line = next(
        (ln for ln in response.body.split('\n') if 'g_page_config' in ln),
        None,
    )
    if config_line is None:
        payload = ''
    else:
        # keep only what sits between the outermost braces on that line
        payload = config_line.split('{', 1)[1].rsplit('}', 1)[0]

    parsed_objects = list(ijson.items(StringIO('{' + payload + '}'), ''))
    page_config = parsed_objects[0]

    if self.detect_anti_spider_from_js_obj(page_config, response):
        logger.critical(anti_spider_breakpoit_msg)
        raise CloseSpider(anti_spider_breakpoit_msg)
    return page_config
def process_response(self, request, response, spider):
    """Pass ``response`` through unless the anti-spider check flags it.

    :param request: the request that produced ``response`` (unused here)
    :param response: the response to inspect
    :param spider: the spider the response is for (unused here)
    :return: ``response`` unchanged when the check does not flag it
    :raises CloseSpider: when
        ``TaobaoSpider_2.detect_anti_spider_from_response`` reports a hit
    """
    flagged = TaobaoSpider_2.detect_anti_spider_from_response(response)
    if not flagged:
        return response

    logger.critical(anti_spider_breakpoit_msg)
    # NOTE(review): Scrapy documents CloseSpider for spider callbacks;
    # confirm that raising it from a middleware hook stops the crawl.
    raise CloseSpider(anti_spider_breakpoit_msg)