Example #1
0
    def parse(self, response):
        """Click through every brand link on the start page and collect its URL.

        For each anchor matched by ``brands_xpath`` the browser driver reloads
        ``self.start_urls[0]``, two helper scripts are injected into the page,
        and the anchor at the current position is clicked through JavaScript.
        The URL the browser lands on is stored in the ``Brand`` item.

        Yields every ``Brand`` whose click succeeded, followed by one
        ``Request`` per collected brand with ``self.parse_separate_brand`` as
        callback.

        Raises ``AssertionError`` when no brand link is found in ``response``.
        Calls ``sys.exit(0)`` when, after a click attempt, the browser URL
        contains ``'anti_Spider'`` or still starts with the start URL.
        """
        items = []
        sel = Selector(response)
        brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
        brands_links = sel.xpath(brands_xpath)
        assert len(
            brands_links) > 0, 'cant find brands on this page: {}'.format(
                response.url)

        # `count` is the 1-based position of the brand link. It has to advance
        # with the loop, otherwise the first link would be clicked every time.
        for count, brand in enumerate(brands_links, 1):
            # easton: avoid taobao's anti spider
            sleep(5)
            JSMiddleware.get_driver().get(self.start_urls[0])

            item = Brand()
            item['name'] = brand.xpath('@title').extract()[0]
            try:
                # chrome and PhantomJS can't click an invisible link, but
                # JavaScript running inside the page can.
                script = "var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s); \
                    var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s);".format(
                    inject_jsfile_path1, inject_jsfile_path2)
                JSMiddleware.get_driver().execute_script(script)
                sleep(1)  # wait until the browser has loaded the js libs
                # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()
                # The JavaScript array is 0-based, hence `count - 1`.
                script = '$(document).xpath("' + brands_xpath + '")[' + str(
                    count - 1) + '].click()'
                JSMiddleware.get_driver().execute_script(script)
            except WebDriverException:
                # The click failed: log it and go on with the next brand.
                logger.error(traceback.format_exc())
                logger.error(' '.join([
                    'click failed.', 'count:',
                    str(count), item['name'],
                    JSMiddleware.get_driver().current_url, '\n'
                ]))
            else:
                item['url'] = JSMiddleware.get_driver().current_url
                items.append(item)
                yield item
            finally:
                # Redirected to the login page, or the login window popped up.
                if 'anti_Spider' in JSMiddleware.get_driver().current_url or \
                        JSMiddleware.get_driver().current_url.startswith(self.start_urls[0].decode('utf-8')):
                    logger.critical(
                        'cant go ahead crawling, please manually pass the verification'
                    )
                    sys.exit(0)

        for i in items:
            yield Request(i['url'], callback=self.parse_separate_brand)
Example #2
0
    def parse(self, response):
        """Click through every brand link on the start page and collect its URL.

        For each anchor matched by ``brands_xpath`` the browser driver reloads
        ``self.start_urls[0]``, two helper scripts are injected into the page,
        and the anchor at the current position is clicked through JavaScript.
        The URL the browser lands on is stored in the ``Brand`` item.

        Yields every ``Brand`` whose click succeeded, followed by one
        ``Request`` per collected brand with ``self.parse_separate_brand`` as
        callback.

        Raises ``AssertionError`` when no brand link is found in ``response``.
        Calls ``sys.exit(0)`` when, after a click attempt, the browser URL
        contains ``'anti_Spider'`` or still starts with the start URL.
        """
        items = []
        sel = Selector(response)
        brands_xpath = "//div[@id='J_NavCommonRowItems_0']/a"
        brands_links = sel.xpath(brands_xpath)
        assert len(brands_links) > 0, 'cant find brands on this page: {}'.format(response.url)

        # `count` is the 1-based position of the brand link. It has to advance
        # with the loop, otherwise the first link would be clicked every time.
        for count, brand in enumerate(brands_links, 1):
            # easton: avoid taobao's anti spider
            sleep(5)
            JSMiddleware.get_driver().get(self.start_urls[0])

            item = Brand()
            item['name'] = brand.xpath('@title').extract()[0]
            try:
                # chrome and PhantomJS can't click an invisible link, but
                # JavaScript running inside the page can.
                script = "var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s); \
                    var s=window.document.createElement('script');\
                    s.src='{}';\
                    window.document.head.appendChild(s);".format(inject_jsfile_path1, inject_jsfile_path2)
                JSMiddleware.get_driver().execute_script(script)
                sleep(1) # wait until the browser has loaded the js libs
                # chrome supports `$x(//div[@id="J_NavCommonRowItems_0"]/a)[0].click()
                # The JavaScript array is 0-based, hence `count-1`.
                script = '$(document).xpath("' + brands_xpath + '")[' + str(count-1) + '].click()'
                JSMiddleware.get_driver().execute_script(script)
            except WebDriverException:
                # The click failed: log it and go on with the next brand.
                logger.error(traceback.format_exc())
                logger.error(' '.join(['click failed.', 'count:', str(count), item['name'], JSMiddleware.get_driver().current_url, '\n']))
            else:
                item['url'] = JSMiddleware.get_driver().current_url
                items.append(item)
                yield item
            finally:
                # Redirected to the login page, or the login window popped up.
                if 'anti_Spider' in JSMiddleware.get_driver().current_url or \
                        JSMiddleware.get_driver().current_url.startswith(self.start_urls[0].decode('utf-8')):
                    logger.critical('cant go ahead crawling, please manually pass the verification')
                    sys.exit(0)

        for i in items:
            yield Request(i['url'], callback = self.parse_separate_brand)
Example #3
0
    def parse_js_obj_g_page_config(self, response):
        """
        Pull the ``g_page_config`` JavaScript object out of a response body.

        The first body line mentioning ``g_page_config`` is cut down to the
        text between its first ``{`` and its last ``}``, re-wrapped in braces
        and parsed with ijson. When no line matches, ``{}`` is parsed instead.

        :return: the parsed g_page_config object
        :rtype: dict
        :raises CloseSpider: when either anti-spider detector reports a hit
        """
        if self.detect_anti_spider_from_response(response):
            logger.critical(anti_spider_breakpoit_msg)
            raise CloseSpider(anti_spider_breakpoit_msg)

        # The variable name comes from taobao pages such as
        # https://s.taobao.com/search?q=空调
        inner_json = ''
        for text_line in response.body.split('\n'):
            if 'g_page_config' not in text_line:
                continue
            after_first_brace = text_line.split('{', 1)[1]
            inner_json = after_first_brace.rsplit('}', 1)[0]
            break

        # The outer braces were stripped above; put them back before parsing.
        wrapped = StringIO('{' + inner_json + '}')
        parsed = list(ijson.items(wrapped, ''))[0]

        if self.detect_anti_spider_from_js_obj(parsed, response):
            logger.critical(anti_spider_breakpoit_msg)
            raise CloseSpider(anti_spider_breakpoit_msg)
        return parsed
Example #4
0
    def parse_js_obj_g_page_config(self, response):
        """
        Locate and parse the ``g_page_config`` object embedded in the page.

        Scans the response body line by line. The first line containing
        ``g_page_config`` supplies the text between its first ``{`` and last
        ``}``; that text is wrapped in braces again and handed to ijson. If no
        line matches, an empty object (``{}``) is parsed.

        :return: the parsed g_page_config object
        :rtype: dict
        :raises CloseSpider: when the response or the parsed object is flagged
            by the anti-spider detectors
        """
        blocked_response = self.detect_anti_spider_from_response(response)
        if blocked_response:
            logger.critical(anti_spider_breakpoit_msg)
            raise CloseSpider(anti_spider_breakpoit_msg)

        # The variable name comes from taobao pages such as
        # https://s.taobao.com/search?q=空调
        config_text = ''
        body_lines = response.body.split('\n')
        for candidate in body_lines:
            if 'g_page_config' in candidate:
                tail = candidate.split('{', 1)[1]
                config_text = tail.rsplit('}', 1)[0]
                break

        # Restore the braces removed by the split/rsplit pair before parsing.
        json_stream = StringIO(''.join(['{', config_text, '}']))
        parsed_objects = list(ijson.items(json_stream, ''))
        page_config = parsed_objects[0]

        if self.detect_anti_spider_from_js_obj(page_config, response):
            logger.critical(anti_spider_breakpoit_msg)
            raise CloseSpider(anti_spider_breakpoit_msg)
        return page_config
Example #5
0
 def process_response(self, request, response, spider):
     """Pass the response through unless it is flagged as anti-spider.

     Returns ``response`` unchanged when
     ``TaobaoSpider_2.detect_anti_spider_from_response`` reports nothing.
     Otherwise logs ``anti_spider_breakpoit_msg`` at critical level and raises
     ``CloseSpider`` with the same message.
     """
     # NOTE(review): confirm that the framework acts on CloseSpider when it is
     # raised from this hook rather than from a spider callback.
     flagged = TaobaoSpider_2.detect_anti_spider_from_response(response)
     if not flagged:
         return response
     logger.critical(anti_spider_breakpoit_msg)
     raise CloseSpider(anti_spider_breakpoit_msg)
Example #6
0
 def process_response(self, request, response, spider):
     """Return ``response`` as-is, or abort when it looks like an anti-spider page.

     The check is delegated to
     ``TaobaoSpider_2.detect_anti_spider_from_response``. On a hit,
     ``anti_spider_breakpoit_msg`` is logged at critical level and
     ``CloseSpider`` is raised with that message.
     """
     # NOTE(review): verify that raising CloseSpider from this hook actually
     # stops the crawl in the framework version in use.
     is_blocked = TaobaoSpider_2.detect_anti_spider_from_response(response)
     if not is_blocked:
         return response
     logger.critical(anti_spider_breakpoit_msg)
     raise CloseSpider(anti_spider_breakpoit_msg)