Example #1
0
    def parse(self, response):
        """Click every brand link on the start page and schedule its crawl.

        For each ``<a>`` matched under ``J_NavCommonRowItems_0`` the start
        page is reloaded in the driver returned by
        ``JSMiddleware.get_driver()``, two helper scripts are injected, and
        the link is clicked through JavaScript. The URL the browser lands on
        is stored as the brand's ``url``.

        Yields:
            One ``Brand`` item per successfully clicked link, followed (after
            all links were tried) by one ``Request`` per collected item with
            ``self.parse_separate_brand`` as callback.

        Raises:
            AssertionError: if the response contains no brand links.
            SystemExit: if, after a click attempt, the browser is on an
                ``anti_Spider`` URL or still on the start page.
        """
        items = []
        sel = Selector(response)
        brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
        brands_links = sel.xpath(brands_xpath)
        assert len(
            brands_links) > 0, 'cant find brands on this page: {}'.format(
                response.url)

        # Normalise the start URL to text once; only byte strings need
        # decoding (a text string has no usable .decode on Python 3).
        start_url = self.start_urls[0]
        if isinstance(start_url, bytes):
            start_url = start_url.decode('utf-8')

        # `index` is the 0-based position of the link in the XPath result, so
        # each iteration clicks its own link rather than always the first.
        for index, brand in enumerate(brands_links):
            # easton: avoid taobao's anti spider
            sleep(5)
            JSMiddleware.get_driver().get(self.start_urls[0])

            item = Brand()
            item['name'] = brand.xpath('@title').extract()[0]
            try:
                # chrome and PhantomJS can't click an invisible link, but
                # JavaScript can, so load the helper libs and click via JS.
                script = "var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s); \
                    var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s);".format(
                    inject_jsfile_path1, inject_jsfile_path2)
                JSMiddleware.get_driver().execute_script(script)
                sleep(1)  # wait browser loads js libs
                # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()
                script = '$(document).xpath("' + brands_xpath + '")[' + str(
                    index) + '].click()'
                JSMiddleware.get_driver().execute_script(script)
            except WebDriverException:
                # The click could not be performed: log and move on.
                logger.error(traceback.format_exc())
                logger.error(' '.join([
                    'click failed.', 'count:',
                    str(index + 1), item['name'],
                    JSMiddleware.get_driver().current_url, '\n'
                ]))
            else:
                item['url'] = JSMiddleware.get_driver().current_url
                items.append(item)
                yield item
            finally:
                # If we were sent to the anti-spider page, or never left the
                # start page (presumably a login window popped up), crawling
                # cannot continue without manual intervention.
                current_url = JSMiddleware.get_driver().current_url
                if 'anti_Spider' in current_url or \
                        current_url.startswith(start_url):
                    logger.critical(
                        'cant go ahead crawling, please manually pass the verification'
                    )
                    sys.exit(0)

        for collected in items:
            yield Request(collected['url'], callback=self.parse_separate_brand)
Example #2
0
    def parse(self, response):
        """Visit each brand link found on the start page and crawl the brand.

        Every ``<a>`` under ``J_NavCommonRowItems_0`` is handled in turn: the
        start page is reloaded in the driver returned by
        ``JSMiddleware.get_driver()``, two helper scripts are injected, and
        the link is clicked via JavaScript. The resulting browser URL becomes
        the brand's ``url``.

        Yields:
            A ``Brand`` item for each link that could be clicked, then, once
            the loop is finished, a ``Request`` for each collected item with
            ``self.parse_separate_brand`` as callback.

        Raises:
            AssertionError: if no brand links are found in the response.
            SystemExit: if, after a click attempt, the browser URL contains
                ``anti_Spider`` or still starts with the start URL.
        """
        items = []
        sel = Selector(response)
        brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
        brands_links = sel.xpath(brands_xpath)
        assert len(brands_links) > 0, 'cant find brands on this page: {}'.format(response.url)

        # Decode only when the configured start URL is a byte string; calling
        # .decode on a text string fails on Python 3.
        start_url = self.start_urls[0]
        if isinstance(start_url, bytes):
            start_url = start_url.decode('utf-8')

        # enumerate() supplies the 0-based link position so that every
        # iteration clicks a different link (the counter used to stay fixed).
        for index, brand in enumerate(brands_links):
            # easton: avoid taobao's anti spider
            sleep(5)
            JSMiddleware.get_driver().get(self.start_urls[0])

            item = Brand()
            item['name'] = brand.xpath('@title').extract()[0]
            try:
                # chrome and PhantomJS can't click an invisible link, but
                # JavaScript can, so load the helper libs and click via JS.
                script = "var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s); \
                    var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s);".format(inject_jsfile_path1, inject_jsfile_path2)
                JSMiddleware.get_driver().execute_script(script)
                sleep(1) # wait browser loads js libs
                # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()
                script = '$(document).xpath("' + brands_xpath + '")[' + str(index) + '].click()'
                JSMiddleware.get_driver().execute_script(script)
            except WebDriverException:
                # The click could not be performed: log and move on.
                logger.error(traceback.format_exc())
                logger.error(' '.join(['click failed.', 'count:', str(index + 1), item['name'], JSMiddleware.get_driver().current_url, '\n']))
            else:
                item['url'] = JSMiddleware.get_driver().current_url
                items.append(item)
                yield item
            finally:
                # If we were sent to the anti-spider page, or never left the
                # start page (presumably a login window popped up), crawling
                # cannot continue without manual intervention.
                current_url = JSMiddleware.get_driver().current_url
                if 'anti_Spider' in current_url or current_url.startswith(start_url):
                    logger.critical('cant go ahead crawling, please manually pass the verification')
                    sys.exit(0)

        for collected in items:
            yield Request(collected['url'], callback=self.parse_separate_brand)
Example #3
0
 def process_request(self, request, spider):
     """Fetch ``request.url`` with the browser driver and wrap the page.

     Returns an ``HtmlResponse`` built from the driver's current URL and
     page source, or the result of ``self._retry`` when the driver ended
     up on a URL containing ``anti_Spider``.
     """
     # TODO: easton: currently every request is rendered through the
     # browser; a conditional trigger such as ``request.meta.get('js')``
     # would probably be preferable.

     # easton: only chrome needs to open a page before the real one, and
     # it must be in the same domain.
     if DEBUG:
         self.get_driver().get('https://www.taobao.com/about/')

     load_cookies(self.get_driver())
     self.get_driver().get(request.url)
     # easton: phantomjs stored other domains' cookies (serious), so the
     # cookies are not written back here (store_cookies stays disabled).

     blocked = 'anti_Spider' in self.get_driver().current_url
     if blocked:
         logger.error('anti_Spider got you, gonna retry')
         sleep(1)
         return self._retry(request, 'anti_Spider', spider)

     # Read the page source first, then the final URL, as two driver calls.
     rendered_html = self.get_driver().page_source
     final_url = self.get_driver().current_url
     return HtmlResponse(
         final_url,
         body=rendered_html,
         encoding='utf-8',
         request=request,
     )
Example #4
0
 def process_request(self, request, spider):
     """Load the requested URL in the browser driver and return its HTML.

     The driver's cookies are loaded before navigating. If the driver is
     redirected to a URL containing ``anti_Spider``, the request is handed
     to ``self._retry``; otherwise the page source is returned as an
     ``HtmlResponse`` addressed by the driver's current URL.
     """
     # TODO: easton: PhantomJS currently renders everything; you probably
     # want a conditional trigger, e.g. ``request.meta.get('js')``.

     if DEBUG:
         # easton: only chrome needs to open a page before the real one,
         # and that page must be in the same domain.
         self.get_driver().get('https://www.taobao.com/about/')

     load_cookies(self.get_driver())
     self.get_driver().get(request.url)
     # easton: phantomjs stored other domains' cookies (serious), which is
     # why store_cookies is not called here.

     if 'anti_Spider' not in self.get_driver().current_url:
         # Normal case: page source is read before the final URL.
         html = self.get_driver().page_source
         landed_url = self.get_driver().current_url
         return HtmlResponse(landed_url, body=html, encoding='utf-8', request=request)

     logger.error('anti_Spider got you, gonna retry')
     sleep(1)
     retry_reason = 'anti_Spider'
     return self._retry(request, retry_reason, spider)