def get_geeTest(geeTest_params): """返回极验验证的三个验证参数""" url = 'http://api.geetest.com/ajax.php' headers = { 'Host': 'api.geetest.com', 'Connection': 'keep-alive', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': url_requests.random_userAgent(), 'Referer': 'http://www.gsxt.gov.cn/index.html' } if not geeTest_params: return None response = url_requests.get(url=url, params=geeTest_params, headers=headers, proxies=proxies) json_string = response.content.split('(')[1][:-1] data = json.loads(json_string) if data['message'] != 'success': return None geeTest = {} geeTest['geetest_validate'] = data.get('validate') geeTest['geetest_seccode'] = data.get('validate') + '|jordan' return geeTest
def get_headers(): '''随机获取userAgent''' headers = { 'Host': 'www.gsxt.gov.cn', 'Referer': 'http://www.gsxt.gov.cn/corp-query-search-1.html', 'User-Agent': url_requests.random_userAgent() } return headers
def get_result(name, geeTest): """传入企业名称及极验验证通过的参数返回企业列表""" url = 'http://www.gsxt.gov.cn/corp-query-search-1.html' headers = { 'Host': 'www.gsxt.gov.cn', 'Origin': 'http://www.gsxt.gov.cn', 'Connection': 'keep-alive', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': url_requests.random_userAgent(), 'Referer': 'http://www.gsxt.gov.cn/index.html' } geeTest.update({ 'searchword': name, 'tab': 'ent_tab', 'token': '49092658' # 伪装验证码 }) response = url_requests.post(url=url, data=geeTest, headers=headers, proxies=proxies) soup = BeautifulSoup(response.content, 'lxml') result_count = soup.find('span', class_='search_result_span1') if int(result_count.text) == 0: print '未查询到相关企业' return [] search_list = soup.find_all('a', class_='search_list_item db') result_items = [] for each_search in search_list: search_name = each_search.find('h1', class_='f20') search_name = search_name.text.replace('\n', '')\ .replace('\r', '')\ .replace('\t', '')\ .strip() detail_link = 'http://www.gsxt.gov.cn' + each_search['href'] item = {} item['company'] = search_name item['detail'] = detail_link result_items.append(item) return result_items
def get_path(gt): """获取验证码所需要用到的path参数""" url = 'http://api.geetest.com/gettype.php' headers = { 'Host': 'api.geetest.com', 'User-Agent': url_requests.random_userAgent(), 'Referer': 'http://www.gsxt.gov.cn/index.html' } params = {'gt': gt, 'callback': 'geetest_' + str(int(time.time() * 1000))} response = url_requests.get(url=url, params=params, headers=headers, proxies=proxies) res = json.loads(response.content.split('(')[1][:-1]) data = res.get('data', {}) path = data.get('path', '/static/js/geetest.5.7.0.js') return path
def get_four(gt, challenge, path): """更新gt和challenge,同时获取被剪掉的滑快和完整图的图片地址,组合成字典返回""" url = 'http://api.geetest.com/get.php' headers = { 'Host': 'api.geetest.com', 'Connection': 'keep-alive', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': url_requests.random_userAgent(), 'Referer': 'http://www.gsxt.gov.cn/index.html' } params = { 'gt': gt, 'challenge': challenge, 'product': 'popup', 'offline': 'false', 'protocol': '', 'type': 'slide', 'path': path, 'callback': 'geetest_%s' % str(int(time.time() * 1000)) } response = url_requests.get(url=url, params=params, headers=headers, proxies=proxies) json_content = response.content.split('(')[1][:-1] data = json.loads(json_content) query_string = {} query_string['gt'] = data.get('gt') query_string['challenge'] = data.get('challenge') query_string['gb'] = data.get('bg') query_string['fullbg'] = data.get('fullbg') return query_string
def get_gt_challenge(): """获取gt和challenge参数,滑快验证码需要用到的。""" url = 'http://www.gsxt.gov.cn/SearchItemCaptcha' headers = { 'Host': 'www.gsxt.gov.cn', 'Connection': 'keep-alive', 'Accept': 'application/json, text/javascript, */*; q=0.01', 'X-Requested-With': 'XMLHttpRequest', 'User-Agent': url_requests.random_userAgent(), 'Referer': 'http://www.gsxt.gov.cn/index.html' } params = {'v': int(time.time() * 1000)} response = url_requests.get(url=url, params=params, headers=headers, proxies=proxies) data = json.loads(response.content) gt = data.get('gt') challenge = data.get('challenge') return gt, challenge