def start_downloader(self, url, args):
    """Fetch ``url`` and return a response object carrying a ``doc`` attribute.

    Two paths are supported:

    * If ``url`` contains the marker ``?args=``, the text before the first
      marker is treated as the real page URL and the (URL-quoted) text after
      it as a search word.  Both are handed to
      ``IndustryAndCommerceGeetestCrack`` and the page text returned by its
      ``crack()`` call is wrapped in a fresh ``Response``.
    * Otherwise the call is forwarded unchanged to ``self.download``.

    :param url: target URL, optionally suffixed with ``?args=<search word>``.
    :param args: download options; only used on the plain-download path,
        where it is passed through to ``self.download``.
    :return: a response object whose ``doc`` attribute is a ``PyQuery``
        document (or whatever ``self.download`` returns on the plain path).
    """
    if u'?args=' not in url:
        return self.download(url, args=args)

    # maxsplit=1: only the first marker separates URL from search word, so a
    # later "?args=" inside the search word no longer breaks the unpacking.
    real_url, search_word = url.split('?args=', 1)
    search_word = unicode(unquote(search_word))
    print('url:  %s' % real_url)
    print('search_word:  %s' % search_word)

    cracker = IndustryAndCommerceGeetestCrack(
        url=real_url,
        search_text=search_word,
        input_id="content",
        search_element_id="search",
        gt_element_class_name="gt_box",
        gt_slider_knob_name="gt_slider_knob",
        result_numbers_xpath='/html/body/div[1]/div[6]/div[1]/span',
        result_list_verify_class='clickStyle')
    # crack() yields (page text, cookies); the cookies are not used here.
    result, _cookies = cracker.crack()

    # A falsy result is replaced by a placeholder page so that callers
    # always receive a parseable document.
    if result:
        body = result.encode('utf-8')
    else:
        body = u'<html>有异常出现了</html>'.encode('utf-8')

    # NOTE(review): status_code is never assigned on this synthesized
    # response -- confirm that callers do not rely on it.
    resp = Response()
    resp._content = body
    resp.url = real_url
    resp.doc = PyQuery(body)
    return resp
def download(self, url, args=None):
    """Download ``url`` with the backend selected by ``args['tools']``.

    :param url: URL to fetch.
    :param args: options as a dict, or a JSON string that decodes to one.
        Keys read here:

        * ``tools``   -- ``'requests'`` (default) or ``'phantomjs'``.
        * ``method``  -- HTTP method for the requests backend (default
          ``'GET'``).
        * ``headers``, ``cookies``, ``proxies``, ``timeout`` -- forwarded to
          the requests call (timeout defaults to 30).
        * ``params`` (GET) / ``data`` (POST) -- request payload.
        * ``js_code`` -- script executed in the browser backend after the
          page has loaded.
    :return: a response object with a ``doc`` attribute holding a
        ``PyQuery`` document, or ``None`` when ``tools`` names no known
        backend.
    :raises ConnectionError: requests backend only, when the HTTP status
        code is not 200.
    """
    # A shared ``{}`` default would be one dict reused across every call.
    if args is None:
        args = {}
    elif isinstance(args, basestring):
        args = json.loads(args)

    tools = args.get('tools', 'requests')
    method = args.get('method', 'GET')

    if tools == "requests":
        # A new session and default header set are stored on the instance
        # on every call.
        self.reqst = requests.Session()
        self.headers = {
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language':
                'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': self.get_user_agent()
        }
        kwargs = {
            'headers': args.get('headers', self.headers),
            'cookies': args.get('cookies', None),
            'proxies': args.get('proxies', None),
            'timeout': args.get('timeout', 30)
        }
        verb = str(method).upper()
        if verb == 'GET':
            kwargs['params'] = args.get('params', {})
        elif verb == 'POST':
            kwargs['data'] = args.get('data', {})

        resp = self.reqst.request(method=method, url=url, **kwargs)
        if resp.status_code != 200:
            raise ConnectionError("ConnectionError, {0}".format(
                resp.status_code))
        resp.doc = PyQuery(resp.content)
        return resp

    if tools == 'phantomjs':
        # The option is still called "phantomjs", but the page is rendered
        # with a Chrome WebDriver.
        driver = webdriver.Chrome()
        try:
            print('new get url: %s' % url)
            driver.get(url)
            time.sleep(4)
            # Run caller-supplied JS to imitate user interaction; the
            # default scrolls the page to the bottom.
            js = args.get('js_code', "var q=document.body.scrollTop=10000")
            driver.execute_script(js)
            time.sleep(5)
            body = driver.page_source.encode('utf-8')
            print(u"访问" + url)
        except Exception as e:
            # Best effort: any browser failure yields a placeholder page
            # instead of propagating to the caller.
            body = u'<html>有异常出现了</html>'.encode('utf-8')
            print(str(e))
            traceback.print_exc()
        finally:
            try:
                current_url = driver.current_url
            except Exception:
                # The browser may already be unusable; fall back to the
                # requested URL rather than skipping the shutdown below.
                current_url = url
            finally:
                # quit() ends the whole driver session, not just the
                # current window as close() does.
                driver.quit()

        # NOTE(review): status_code is never assigned on this synthesized
        # response -- confirm that callers do not rely on it.
        resp = Response()
        resp._content = body
        resp.url = current_url
        resp.doc = PyQuery(body)
        return resp

    # Unknown backend: nothing is downloaded.
    return None