def _verify_get(self, url, times=0, headers=default_headers, refresh_ip=False, timeout=download_timeout):
    """Send a GET through ``self._http_client`` and turn bad statuses into errors.

    The request is sent with a private copy of ``headers`` whose
    ``User-Agent`` is set to ``self._user_agent``.  The dict passed in
    (including the shared module-level default) is never modified, so one
    call cannot leak its User-Agent into later requests.

    :param url: address to fetch.
    :param times: not read by this method; kept so existing callers work.
    :param headers: base request headers (copied, not mutated).
    :param refresh_ip: not read by this method; kept so existing callers work.
    :param timeout: forwarded unchanged to the HTTP client.
    :returns: the client's response object when the status code is 200.
    :raises Error302: status code 302.
    :raises Error403: status code 403.
    :raises Error404: status code 404.
    :raises Error502: status code 502.
    :raises ErrorStatusCode: any other status code (constructed with the code).
    """
    # Copy first: updating the argument in place would also rewrite the
    # shared default dict bound at definition time.
    request_headers = dict(headers or {})
    request_headers['User-Agent'] = self._user_agent

    response = self._http_client.get(url, headers=request_headers, timeout=timeout)
    status = response.status_code

    if status == 200:
        logging.debug(response.headers)
        return response
    if status == 302:
        # The target is only logged, so a missing Location header must not
        # replace the intended Error302 with a KeyError.
        logging.debug("location: %s", response.headers.get('Location'))
        raise Error302()
    if status == 403:
        raise Error403()
    if status == 404:
        raise Error404()
    if status == 502:
        raise Error502()
    raise ErrorStatusCode(status)
def _verify_post(self, url, data=None, json=None, times=0, headers=default_headers, timeout=download_timeout):
    """Send a POST through ``self._http_client`` and turn bad statuses into errors.

    :param url: address to post to.
    :param data: form/body payload forwarded to the client.
    :param json: JSON payload forwarded to the client.
    :param times: not read by this method; kept so existing callers work.
    :param headers: request headers, forwarded exactly as given.
    :param timeout: forwarded unchanged to the HTTP client.
    :returns: the client's response object when the status code is 200.
    :raises Error302: status code 302.
    :raises Error403: status code 403.
    :raises Error404: status code 404.
    :raises Error502: status code 502.
    :raises Error503: status code 503.
    :raises ErrorStatusCode: any other status code (constructed with the code).
    """
    # NOTE: the User-Agent / Proxy-Authorization header injection that once
    # lived here was disabled in the original code; headers go out untouched.
    response = self._http_client.post(url=url, data=data, json=json, headers=headers, timeout=timeout)
    status = response.status_code

    if status == 200:
        logging.debug(response.headers)
        return response
    if status == 302:
        # The target is only logged, so a missing Location header must not
        # replace the intended Error302 with a KeyError.
        logging.debug("location: %s", response.headers.get('Location'))
        raise Error302()
    if status == 403:
        raise Error403()
    if status == 404:
        raise Error404()
    if status == 502:
        raise Error502()
    if status == 503:
        raise Error503()
    raise ErrorStatusCode(status)
def _verify_get(self, url):
    """GET ``url``, trying verification and login before giving up.

    Handling by status code of the first response:

    * 200 - the response is returned.
    * 302 - the ``Location`` header is resolved against
      ``http://qiye.qianzhan.com/`` and handed to ``self.do_verify``.  If
      that succeeds the whole method is run again; otherwise
      ``self.login`` is tried and, on success, the URL is fetched once
      more.  If both fail, ``Error302`` is raised.
    * 403 - ``Error403`` is raised.
    * 404 - ``self.login`` is tried and, on success, the URL is fetched once
      more; otherwise ``Error404`` is raised.
    * anything else - ``ErrorStatusCode`` is raised (without arguments).

    NOTE(review): responses fetched after a successful login are returned
    without a further status check, and the re-run after a successful
    verification has no depth limit.
    """
    response = self._http_client.get(url)
    status = response.status_code

    if status == 200:
        return response

    if status == 302:
        redirect_target = response.headers['Location']
        verify_url = urljoin("http://qiye.qianzhan.com/", redirect_target)
        if self.do_verify(verify_url):
            return self._verify_get(url)
        if self.login():
            return self._http_client.get(url)
        raise Error302()

    if status == 403:
        raise Error403()

    if status == 404:
        if self.login():
            return self._http_client.get(url)
        raise Error404()

    raise ErrorStatusCode()
def _run(self, start=968):
    """Crawl the search page for every pair of entries taken from ``self._txt``.

    For each ``i`` from ``start`` and each ``j >= i`` the search key is
    ``self._txt[i] + self._txt[j]``.  Keys already recorded in Redis are
    skipped; a key is recorded only after ``self._get_search`` returns
    without raising.

    :param start: index of the first outer-loop entry.  The default keeps
        the offset that was previously hard-coded, so existing
        ``self._run()`` calls behave as before; pass another value to resume
        from a different position.
    :raises Error302: re-raised as ``Error302(i, j)`` so the failing
        position is carried in the exception.
    :raises Error403: re-raised as ``Error403(i, j)`` for the same reason.
    """
    for i in range(start, len(self._txt)):
        for j in range(i, len(self._txt)):
            search_key = self._txt[i] + self._txt[j]
            if RedisClient.get_search_key_key(search_key):
                # Already crawled in an earlier run.
                continue

            logging.info(
                "++++++crawl 1000:->i: %d, j: %d, len: %d, search_key: %s",
                i, j, len(self._txt), search_key)

            # areaN is the percent-encoded UTF-8 form of "Beijing" in Chinese.
            url = ("http://qiye.qianzhan.com/search/all/"
                   + urllib.quote(search_key.encode('utf-8'))
                   + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC")

            try:
                self._get_search(url)
                RedisClient.set_search_key_key(search_key)
            except Error302:
                raise Error302(i, j)
            except Error403:
                raise Error403(i, j)
            except Exception as e:
                # Best effort: log the failure (with traceback) and move on
                # to the next key; the key stays unrecorded so a later run
                # will try it again.
                logging.exception(
                    "_get_search:->i: %d, j: %d, len: %d, search_key: %s, %s",
                    i, j, len(self._txt), search_key, e.message)
def _verify_post(self, url, data=None, json=None, times=0):
    """POST through ``self._http_client``, retrying on ``HttpClientError``.

    Each ``HttpClientError`` raised while sending the request or checking
    its status increments ``times``; the request is repeated while
    ``times`` stays below 3, i.e. at most three attempts when called with
    the default ``times=0``.  The last error is re-raised with its original
    traceback.

    :param url: address to post to.
    :param data: form/body payload, passed positionally to the client.
    :param json: JSON payload, passed positionally to the client.
    :param times: number of attempts already used.
    :returns: the client's response object when the status code is 200.
    :raises Error302: status code 302.
    :raises Error403: status code 403.
    :raises Error404: status code 404.
    :raises Error502: status code 502.
    :raises Error503: status code 503.
    :raises ErrorStatusCode: any other status code (constructed with the code).
    :raises HttpClientError: when the attempts are exhausted.
    """
    while True:
        try:
            response = self._http_client.post(url, data, json)
            status = response.status_code

            if status == 200:
                return response
            if status == 302:
                # The target is only logged, so a missing Location header
                # must not replace the intended Error302 with a KeyError.
                logging.debug("location: %s", response.headers.get('Location'))
                raise Error302()
            if status == 403:
                raise Error403()
            if status == 404:
                raise Error404()
            if status == 502:
                raise Error502()
            if status == 503:
                raise Error503()
            raise ErrorStatusCode(status)
        except HttpClientError:
            times += 1
            if times >= 3:
                # Bare raise keeps the traceback of the failing attempt.
                raise
def _verify_get(self, url, **kwargs):
    """GET ``url`` through ``self._http_client`` and turn bad statuses into errors.

    ``allow_redirects`` defaults to ``False`` unless the caller supplies it;
    all keyword arguments are forwarded to the client.

    :param url: address to fetch.
    :param kwargs: extra keyword arguments for the client's ``get``.
    :returns: the client's response object when the status code is 200.
    :raises Error302: status code 302.
    :raises Error403: status code 403.
    :raises Error404: status code 404.
    :raises ErrorStatusCode: any other status code (constructed without
        arguments).
    """
    kwargs.setdefault("allow_redirects", False)
    response = self._http_client.get(url, **kwargs)
    status = response.status_code

    if status == 200:
        return response
    if status == 302:
        # The target is only logged, so a missing Location header must not
        # replace the intended Error302 with a KeyError.
        logging.debug("location: %s", response.headers.get('Location'))
        raise Error302()
    if status == 403:
        raise Error403()
    if status == 404:
        raise Error404()
    raise ErrorStatusCode()
def _check_response(self, response):
    """Return ``response`` if its status is 200, otherwise raise a matching error.

    :param response: object exposing ``status_code`` and ``headers``.
    :returns: the same ``response`` object when the status code is 200.
    :raises Error302: status code 302.
    :raises Error403: status code 403.
    :raises Error404: status code 404.
    :raises Error502: status code 502.
    :raises Error503: status code 503.
    :raises ErrorStatusCode: any other status code (constructed with the code).
    """
    status = response.status_code

    if status == 200:
        logging.debug(response.headers)
        return response
    if status == 302:
        # The target is only logged, so a missing Location header must not
        # replace the intended Error302 with a KeyError.
        logging.debug("location: %s", response.headers.get('Location'))
        raise Error302()
    if status == 403:
        raise Error403()
    if status == 404:
        raise Error404()
    if status == 502:
        raise Error502()
    if status == 503:
        raise Error503()
    raise ErrorStatusCode(status)
def _run(self):
    """Crawl the search page for every company returned by ``ZhaopinDB``.

    Each item's ``company_name`` is used as the search key.  Keys already
    recorded in Redis are skipped; a key is recorded only after
    ``self._get_search`` returns without raising.

    :raises Error302: a fresh ``Error302()`` when the crawl step raises one.
    :raises Error403: a fresh ``Error403()`` when the crawl step raises one.
    """
    for company in ZhaopinDB.get_companys():
        search_key = company['company_name']
        if RedisClient.get_search_key_detail_key(search_key):
            # Already crawled in an earlier run.
            continue

        logging.info("++++++crawl zhaopin:->search_key: %s" % search_key)

        quoted_key = urllib.quote(search_key.encode('utf-8'))
        # areaN is the percent-encoded UTF-8 form of "Beijing" in Chinese.
        url = ("http://qiye.qianzhan.com/search/all/"
               + quoted_key
               + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC")

        try:
            self._get_search(url)
            RedisClient.set_search_key_detail_key(search_key)
        except Error302:
            raise Error302()
        except Error403:
            raise Error403()