Esempio n. 1
0
    def process_request(self, request, spider):
        """Render Selenium-flagged requests in the spider's shared browser.

        Called for each request that goes through the downloader middleware.

        Args:
            request: The request being processed. It is rendered here only
                when ``request.meta['Selenium']`` is truthy.
            spider: The running spider; it must expose a Selenium driver as
                ``spider.browser``.

        Returns:
            ``None`` when the request is not flagged for Selenium, so normal
            processing continues. Otherwise an ``HtmlResponse`` built from
            the browser's current page source.
        """
        use_selenium = request.meta.get('Selenium')
        browser = spider.browser
        # The browser is configured on every call, flagged or not, exactly as
        # the original code did.
        browser.set_page_load_timeout(self.timeout)
        browser.set_window_size(self.width, self.height)
        if not use_selenium:
            return None

        try:
            browser.get(request.url)
            # WebDriverWait has to be instantiated with the driver; calling
            # ``until`` on the class itself always fails.
            # NOTE(review): the original gave the wait no timeout, so the
            # page-load timeout is reused here -- confirm this is intended.
            wait.WebDriverWait(browser, self.timeout).until(
                ec.presence_of_element_located(
                    (By.CLASS_NAME, "framework")))
        except Exception as e:
            # Best effort: report the problem and still hand back whatever
            # the browser managed to load.
            print("Exception is %s" % e)
        else:
            # Fixed extra pause after the awaited element has appeared.
            time.sleep(3)

        # ``until`` returns the located element, not the page, so the body
        # is always taken from the browser's page source.
        body = browser.page_source.encode('utf-8')
        return http.HtmlResponse(request.url,
                                 encoding='utf-8',
                                 body=body,
                                 request=request)
    def process_request(self, request, spider):
        """Load the request URL in a throwaway headless Chrome and return it.

        Called for each request that goes through the downloader middleware.
        A new Chrome instance is started for every request, the page is
        opened, the page's ``onDownloadApk(0)`` script function is invoked,
        and the resulting document is wrapped in an ``HtmlResponse``.

        Args:
            request: The request whose URL is loaded in the browser.
            spider: The running spider (unused).

        Returns:
            An ``HtmlResponse`` for the browser's final URL and page source,
            or ``None`` when loading the page or running the script fails,
            so that normal processing of the request continues.
        """
        import logging

        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        # Presumably value 2 blocks image loading to speed up rendering --
        # verify against the Chrome preference documentation.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)

        # NOTE(review): newer Selenium releases spell this keyword
        # ``options``; kept as-is to match the version this file targets.
        browser = webdriver.Chrome(chrome_options=options)
        try:
            try:
                browser.get(request.url)
                browser.execute_script('onDownloadApk(0)')
            except Exception:
                # Best effort: fall back to the regular downloader, but do
                # not hide the failure completely.
                logging.getLogger(__name__).warning(
                    "Browser rendering failed for %s", request.url,
                    exc_info=True)
                return None

            content = browser.page_source.encode('utf-8')
            url = browser.current_url
        finally:
            # quit() (not close()) also terminates the driver process, and
            # running it in ``finally`` covers the failure paths as well.
            browser.quit()

        return http.HtmlResponse(url=url,
                                 encoding='utf-8',
                                 body=content,
                                 request=request)
Esempio n. 3
0
    def process_request(cls, request, spider):
        """Render PhantomJS-flagged requests with a throwaway PhantomJS driver.

        Args:
            request: The request being processed. It is rendered here only
                when ``request.meta['PhantomJS']`` is truthy.
            spider: The running spider (unused).

        Returns:
            ``None`` when the request is not flagged, otherwise an
            ``HtmlResponse`` holding the UTF-8 encoded page source.
        """
        if not request.meta.get('PhantomJS', False):
            return None

        phantomjs_path = "D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe"

        # Extra HTTP headers PhantomJS should send with the page request.
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        }

        # Copy the default capabilities so the shared dict is not mutated.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        for key, value in headers.items():
            dcap['phantomjs.page.customHeaders.{}'.format(key)] = value

        driver = webdriver.PhantomJS(executable_path=phantomjs_path,
                                     desired_capabilities=dcap)
        try:
            driver.get(request.url)
            content = driver.page_source.encode('utf-8')
        finally:
            # Always shut the PhantomJS process down, even when loading the
            # page raised.
            driver.quit()

        return http.HtmlResponse(request.url,
                                 encoding='utf-8',
                                 body=content,
                                 request=request)
Esempio n. 4
0
    def process_request(cls, request, spider):
        """Render requests carrying a ``PhantomJS`` meta key with PhantomJS.

        Args:
            request: The request being processed. It is rendered here when
                the key ``'PhantomJS'`` is present in ``request.meta``,
                whatever its value.
            spider: The running spider (unused).

        Returns:
            ``None`` when the key is absent, otherwise an ``HtmlResponse``
            holding the UTF-8 encoded page source.
        """
        # ``in`` works on both Python 2 and 3 (``has_key`` is Python 2 only).
        if 'PhantomJS' not in request.meta:
            return None

        # Copy the default capabilities so the shared dict is not mutated.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
        )

        driver = webdriver.PhantomJS(
            executable_path="D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe",
            desired_capabilities=dcap)
        # NOTE(review): the original carried a bare reference to
        # http://car.autohome.com.cn/config/series/script/config-min.js?v=201709041522
        # at this point -- its relevance is not visible from this code.
        try:
            driver.get(request.url)
            content = driver.page_source.encode('utf-8')
        finally:
            # Always shut the PhantomJS process down, even when loading the
            # page raised.
            driver.quit()

        return http.HtmlResponse(request.url,
                                 encoding='utf-8',
                                 body=content,
                                 request=request)
Esempio n. 5
0
 def process_request(self, request, spider):
     """Load the request URL in ``driver`` and return the rendered page.

     ``driver`` is not defined in this method; presumably it is a
     browser driver created elsewhere in the module -- confirm.

     Args:
         request: The request whose URL is opened in the browser.
         spider: The running spider (unused).

     Returns:
         An ``HtmlResponse`` holding the UTF-8 encoded page source.
     """
     driver.get(request.url)
     # Fixed one-second pause before the page source is read.
     time.sleep(1)
     rendered = driver.page_source.encode('utf-8')
     response_kwargs = dict(url=request.url,
                            body=rendered,
                            encoding='utf-8',
                            request=request)
     return http.HtmlResponse(**response_kwargs)
    def process_request(self, request, spider):
        """Render requests carrying a ``PhantomJS`` meta key in Chrome.

        Despite the name of the meta key, the page is loaded with
        ``webdriver.Chrome``.

        Args:
            request: The request being processed. It is rendered here when
                the key ``'PhantomJS'`` is present in ``request.meta``.
            spider: The running spider (unused).

        Returns:
            ``None`` when the key is absent, otherwise an ``HtmlResponse``
            holding the UTF-8 encoded page source.
        """
        # ``dict.has_key`` was removed in Python 3; ``in`` works everywhere.
        if 'PhantomJS' not in request.meta:
            return None

        driver = webdriver.Chrome()
        try:
            driver.get(request.url)
            time.sleep(3)  # pause 3 seconds so the page's JS can finish loading
            content = driver.page_source.encode('utf-8')
        finally:
            # Always shut the browser down, even when loading the page raised.
            driver.quit()

        return http.HtmlResponse(request.url,
                                 encoding='utf-8',
                                 body=content,
                                 request=request)
    def process_request(self, request, spider):
        """Render flagged requests in Chrome after clicking the industry button.

        Despite the name of the meta key, the page is loaded with
        ``webdriver.Chrome``. After the page has loaded, the element
        ``#buttonSelIndustry`` is clicked through the page's ``$`` function
        before the page source is captured.

        Args:
            request: The request being processed. It is rendered here when
                the key ``'PhantomJS'`` is present in ``request.meta``.
            spider: The running spider (unused).

        Returns:
            ``None`` when the key is absent, otherwise an ``HtmlResponse``
            holding the UTF-8 encoded page source.
        """
        # ``dict.has_key`` was removed in Python 3; ``in`` works everywhere.
        if 'PhantomJS' not in request.meta:
            return None

        driver = webdriver.Chrome()
        try:
            driver.get(request.url)
            time.sleep(3)  # pause 3 seconds so the page's JS can finish loading
            # Click the "select industry" button.
            driver.execute_script("$('#buttonSelIndustry').click()")
            content = driver.page_source.encode('utf-8')
        finally:
            # Always shut the browser down, even when a step above raised.
            driver.quit()

        return http.HtmlResponse(request.url,
                                 encoding='utf-8',
                                 body=content,
                                 request=request)
Esempio n. 8
0
    def process_request(cls, request, spider):
        """Render PhantomJS-flagged requests, cutting page loads off at 3 s.

        Args:
            request: The request being processed. It is rendered here only
                when ``request.meta['PhantomJS']`` is truthy.
            spider: The running spider (unused).

        Returns:
            ``None`` when the request is not flagged, otherwise an
            ``HtmlResponse`` with status 200 holding the UTF-8 encoded page
            source, including a partially loaded page when loading was cut
            short.
        """
        if not request.meta.get('PhantomJS', False):
            return None

        url = request.url

        # Copy the default capabilities so the shared dict is not mutated.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        headers = {
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            'Connection': 'keep-alive',
            'Referer': 'http://www.baidu.com/',
        }
        for key, value in headers.items():
            dcap['phantomjs.page.customHeaders.{}'.format(key)] = value

        # NOTE(review): this user agent differs from the ``User-Agent``
        # custom header set above; which one PhantomJS actually sends is not
        # determined by this code -- confirm the intended value.
        dcap["phantomjs.page.settings.userAgent"] = (
            'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
        )

        driver = webdriver.PhantomJS(
            executable_path=
            "D:\\work-path\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe",
            desired_capabilities=dcap)
        try:
            driver.set_page_load_timeout(3)
            try:
                driver.get(request.url)
            except Exception:
                # Best effort: when the load fails or exceeds the timeout,
                # stop it and keep whatever has been rendered so far.
                # ``Exception`` (rather than a bare ``except``) lets
                # KeyboardInterrupt and SystemExit propagate.
                driver.execute_script('window.stop()')

            content = driver.page_source.encode('utf-8')
        finally:
            # Always shut the PhantomJS process down, even when stopping the
            # page or reading its source raised.
            driver.quit()

        return http.HtmlResponse(url,
                                 status=200,
                                 encoding='utf-8',
                                 body=content,
                                 request=request)
Esempio n. 9
0
    def process_request(cls, request, spider):
        """Render PhantomJS-flagged requests using a mobile user agent.

        Args:
            request: The request being processed. It is rendered here only
                when ``request.meta['PhantomJS']`` is truthy.
            spider: The running spider (unused).

        Returns:
            ``None`` when the request is not flagged, otherwise an
            ``HtmlResponse`` holding the UTF-8 encoded page source.
        """
        if not request.meta.get('PhantomJS', False):
            return None

        phantomjs_path = "D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe"

        # Copy the default capabilities so the shared dict is not mutated.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
        )

        driver = webdriver.PhantomJS(executable_path=phantomjs_path,
                                     desired_capabilities=dcap)
        try:
            driver.get(request.url)
            content = driver.page_source.encode('utf-8')
        finally:
            # Always shut the PhantomJS process down, even when loading the
            # page raised.
            driver.quit()

        return http.HtmlResponse(request.url,
                                 encoding='utf-8',
                                 body=content,
                                 request=request)