Beispiel #1
0
 def start_downloader(self, url, args):
     """
     Fetch ``url``, sending search URLs through the geetest cracker.

     When ``url`` contains the marker ``?args=``, the text before the
     first marker is used as the page URL and the URL-unquoted text after
     it as the search word. Both are handed to
     ``IndustryAndCommerceGeetestCrack`` and the text returned by its
     ``crack()`` call is wrapped in a ``Response``. Any other URL is
     delegated unchanged to ``self.download``.

     :param url: target URL, optionally suffixed with ``?args=<word>``
     :param args: download options, forwarded as-is to ``self.download``
     :return: on the cracker path, a ``Response`` with ``_content``,
         ``url`` and ``doc`` set; otherwise whatever ``self.download``
         returns
     """
     if url.find(u'?args=') == -1:
         # Plain URL: nothing to crack, use the regular downloader.
         return self.download(url, args=args)

     # Split on the first marker only, so a search word that itself
     # contains "?args=" cannot break the two-value unpacking.
     real_url, search_word = url.split('?args=', 1)
     # NOTE(review): unicode() decodes with the default codec; confirm
     # that percent-encoded non-ASCII search words decode as intended.
     search_word = unicode(unquote(search_word))
     print('url:  %s' % real_url)
     print('search_word:  %s' % search_word)

     cracker = IndustryAndCommerceGeetestCrack(
         url=real_url,
         search_text=search_word,
         input_id="content",
         search_element_id="search",
         gt_element_class_name="gt_box",
         gt_slider_knob_name="gt_slider_knob",
         result_numbers_xpath='/html/body/div[1]/div[6]/div[1]/span',
         result_list_verify_class='clickStyle')
     # crack() also hands back cookies; they are not used here.
     result, _cookies = cracker.crack()

     if result:
         body = result.encode('utf-8')
     else:
         # An empty result is replaced by a placeholder error page.
         body = u'<html>有异常出现了</html>'.encode('utf-8')

     resp = Response()
     resp._content = body
     resp.url = real_url
     resp.doc = PyQuery(body)
     return resp
Beispiel #2
0
    def download(self, url, args=None):
        """
        Download ``url`` with the backend selected by ``args['tools']``.

        :param url: URL to fetch
        :param args: options dict, or a JSON string encoding one; ``None``
            means no options. Keys read here: ``tools`` (``'requests'`` by
            default, or ``'phantomjs'``), ``method`` (default ``'GET'``),
            ``headers``, ``cookies``, ``proxies``, ``timeout`` (default 30),
            ``params`` (GET only), ``data`` (POST only) and ``js_code``
            (browser backend only).
        :return: response object whose ``doc`` attribute holds the
            PyQuery-parsed body; ``None`` when ``tools`` names neither
            backend
        :raises ConnectionError: requests backend, when the status code
            is not 200
        """
        # ``None`` instead of a ``{}`` default: a dict default is a single
        # object shared by every call of the method.
        if args is None:
            args = {}
        elif isinstance(args, basestring):
            args = json.loads(args)
        tools = args.get('tools', 'requests')
        method = args.get('method', 'GET')

        if tools == "requests":
            self.reqst = requests.Session()
            self.headers = {
                'Accept': 'text/html, application/xhtml+xml, */*',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language':
                'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
                'User-Agent': self.get_user_agent()
            }
            kwargs = {
                'headers': args.get('headers', self.headers),
                'cookies': args.get('cookies', None),
                'proxies': args.get('proxies', None),
                'timeout': args.get('timeout', 30)
            }
            # Only GET and POST get a payload; other verbs are sent bare.
            verb = str(method).upper()
            if verb == 'GET':
                kwargs['params'] = args.get('params', {})
            elif verb == 'POST':
                kwargs['data'] = args.get('data', {})

            resp = self.reqst.request(method=method, url=url, **kwargs)
            if resp.status_code != 200:
                raise ConnectionError("ConnectionError, {0}".format(
                    resp.status_code))
            resp.doc = PyQuery(resp.content)
            return resp

        if tools == 'phantomjs':
            # Despite the option name, the page is rendered with Chrome.
            driver = webdriver.Chrome()
            try:
                print('new get url: %s' % url)
                driver.get(url)
                time.sleep(4)
                # Run JS to mimic user interaction; the default script
                # scrolls the page body down.
                js = args.get('js_code', "var q=document.body.scrollTop=10000")
                driver.execute_script(js)
                time.sleep(5)
                body = driver.page_source.encode('utf-8')
                print(u"访问" + url)
            except Exception as e:
                # Best effort: report the failure and return a placeholder
                # error page instead of raising.
                body = u'<html>有异常出现了</html>'.encode('utf-8')
                print(str(e))
                traceback.print_exc()
            finally:
                try:
                    current_url = driver.current_url
                finally:
                    # quit() ends the whole driver session; close() would
                    # only close the window and leave the driver running.
                    # The inner finally guarantees it runs even when
                    # reading current_url fails.
                    driver.quit()
            resp = Response()
            resp._content = body
            resp.url = current_url
            resp.doc = PyQuery(body)
            return resp

        # Unrecognised ``tools`` value: nothing is downloaded.
        return None