import logging
import random
import time

import requests
from bs4 import BeautifulSoup


def retry_crawl(url, isProxy):
    response = None
    logging.error('Crawl failed! Retrying page {}'.format(url))
    for i in range(spider_retry_num):
        logging.error('Retry attempt {}'.format(i + 1))
        try:
            if isProxy:
                proxy = _proxy()
                print('Using proxy {} to fetch page {}'.format(proxy, url))
                response = requests.get(url, headers=get_proxy_headers(proxy),
                                        proxies=proxy, timeout=spider_timeout)
            else:
                response = requests.get(url, headers=get_headers(), timeout=spider_timeout)
        except (requests.exceptions.ProxyError,
                requests.exceptions.ConnectTimeout) as e:
            logging.error(e)
            continue
        soup = BeautifulSoup(response.text, 'lxml')
        com_all_info = soup.find_all(class_='m_srchList')
        _response = response.text
        if len(com_all_info) > 0:
            break
        # elif '<script>window.location.href=' in _response:
        #     # Verification link shown when Qichacha flags frequent requests as a crawler
        #     verify_url = re.findall("<script>window.location.href='(.*?)';</script>", _response)[0]
        #     print('Qichacha flagged this crawler for frequent requests; open this link manually to verify: {}'.format(verify_url))
        #     # verify(verify_url)
        #     time.sleep(20)
        else:
            logging.error('================= Unexpected response =================')
            logging.error(response.text)
        time.sleep(random.randint(crawl_interval_mintime, crawl_interval_maxtime))
    return response
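# Usage sketch (illustrative, not part of the crawler): retry_crawl depends on
# module-level settings -- spider_retry_num, spider_timeout, and the crawl
# interval bounds -- plus a get_headers() helper, all assumed to live elsewhere
# in the project. The placeholder values and the stub below exist only to make
# the example self-contained.
spider_retry_num = 3          # assumed: maximum number of re-crawl attempts
spider_timeout = 10           # assumed: per-request timeout in seconds
crawl_interval_mintime = 3    # assumed: lower bound of the random sleep (seconds)
crawl_interval_maxtime = 8    # assumed: upper bound of the random sleep (seconds)


def get_headers():
    # assumed stub: the real project likely rotates User-Agents
    return {'User-Agent': 'Mozilla/5.0'}


if __name__ == '__main__':
    resp = retry_crawl('https://www.qichacha.com/search?key=test', isProxy=False)
    if resp is not None:
        print(resp.status_code)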
with open(enterprise_search_file, encoding='utf-8') as f:
    enterprise_list = f.readlines()
print('Checking the input file for duplicates......')
# remove_repeat is defined elsewhere in the project; a sketch follows below.
_enterprise_list = remove_repeat(enterprise_list)
print('Total enterprises after deduplication: {}'.format(len(_enterprise_list)))
# Increase the number of connection retries
requests.adapters.DEFAULT_RETRIES = 5
# Disable keep-alive so surplus connections are released promptly
s = requests.session()
s.headers['Connection'] = 'close'
for i, name in enumerate(_enterprise_list, start=1):
    if is_proxy:
        try:
            _proxy()
        except Exception as e:
            print('======================== Start the IP proxy program first =======================')
            break
    # Result set for this query
    data_list = []
    # Error set for this query
    error_data_list = []
    start_url = base_url + str(name)
    # print(start_url)
    try:
        print('Crawling company #{} =========================== {}'.format(i, name))
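# remove_repeat is not shown in this section. A minimal sketch of what it might
# look like, assuming it strips whitespace, drops blank lines, and deduplicates
# while preserving the original order of the input file:
def remove_repeat(lines):
    """Deduplicate enterprise names, keeping first occurrences in order."""
    seen = set()
    result = []
    for line in lines:
        name = line.strip()
        if name and name not in seen:
            seen.add(name)
            result.append(name)
    return result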
# Scratch code for testing cookie and proxy generation:
# for i in range(100):
#     generateCookie()
#     print('cookie=============={}'.format(cookies_local))
# generateProxyCookie(_proxy())
# print('Waiting one minute...')
# time.sleep(60)
# interval_time = time.time() - start_time
# print(interval_time // 60)
# print(os.path.join(os.getcwd(), phantomjs_driver))
# print(os.path.join(os.getcwd(), log_dir + r'\ghostdriver.log'))
# print(ua.random)
num = 0
for i in range(100):
    url = 'https://www.qichacha.com/user_login'
    start_url = 'https://www.qichacha.com/search?key=安徽宝光特钢集团万里电力铁塔有限公司'
    proxy_ip = _proxy()
    uag = ua.random
    print(proxy_ip)
    print(uag)
    # get_proxy_headers(proxy_ip)
    # headers = {
    #     'Host': 'www.qichacha.com',  # needs changing
    #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #     'Accept-Encoding': 'gzip, deflate',
    #     'Accept-Language': 'en-US,en;q=0.5',
    #     # 'Connection': 'keep-alive',
    #     'User-Agent': uag
    # }
    # try:
    #     with requests.Session() as r:
    #         r.get(url, headers=headers, proxies=proxy_ip, timeout=20)
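# _proxy() is used throughout but never defined in this section. It appears to
# ask a locally running proxy-pool service for a fresh address and return it in
# the `proxies` mapping that requests expects. A hedged sketch -- the pool URL
# and the response shape are assumptions, not the project's actual API:
def _proxy():
    raw = requests.get('http://127.0.0.1:5010/get/', timeout=5).json()['proxy']
    return {'http': 'http://' + raw, 'https': 'https://' + raw}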
def craw(url, key_word, x):
    User_Agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'  # unused; the headers below carry their own User-Agent
    # `referer` renamed from `re` to avoid shadowing the re module
    # if x == 0:
    #     referer = 'http://www.qichacha.com/search?key=' + key_word
    # else:
    #     referer = 'https://www.qichacha.com/search?key={}#p:{}&'.format(key_word, x - 1)
    referer = 'https://www.qichacha.com/search?key=' + key_word
    headers = {
        'Host': 'www.qichacha.com',
        'Connection': 'keep-alive',
        'Accept': 'text/html, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': referer,
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'UM_distinctid=16d4d781a883e4-0d488cc5eb0bd7-3c375c0f-1fa400-16d4d781a897c2; _uab_collina=156896278000246547487356; acw_tc=73dc082815689627805371616ebe547185c282e64a38ad472dcfa708fb; zg_did=%7B%22did%22%3A%20%2216d4d781c73a19-0fb78f64911642-3c375c0f-1fa400-16d4d781c74a96%22%7D; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1569726533,1570519913,1570523672,1570582899; QCCSESSID=lk9l0os8brt74vg77ma7af10c7; CNZZDATA1254842228=747979137-1568960098-https%253A%252F%252Fsp0.baidu.com%252F%7C1570580514; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201570582899020%2C%22updated%22%3A%201570585759977%2C%22info%22%3A%201570519912754%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22cuid%22%3A%20%220096bdea6a9eec62a4a96030c7eee5f4%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1570585760',
    }
    try:
        # response = requests.get(url, headers=headers)
        response = requests.get(url, headers=headers, proxies=_proxy(), timeout=5)
        if response.status_code != 200:
            response.encoding = 'utf-8'
            print(response.status_code)
            print('ERROR')
        soup = BeautifulSoup(response.text, 'lxml')
    except Exception as e:
        print('Even the request was blocked -- what is Qichacha playing at???')
        print('{} {}'.format(e, repr(e)))
        return  # bail out: soup is undefined past this point
    try:
        com_all_info = soup.find_all(class_='m_srchList')[0].tbody
        com_all_info_array = com_all_info.select('tr')
        print('Scraping data; do not open the Excel file while this runs')
        for i in range(len(com_all_info_array)):
            cells = com_all_info_array[i].select('td')
            # .replace() removes label prefixes; str.strip(chars) would strip characters, not a prefix
            temp_g_name = cells[2].select('.ma_h1')[0].text                                                # company name
            temp_g_tag = cells[2].select('.search-tags')[0].text                                           # company tags
            temp_r_name = cells[2].select('p')[0].a.text                                                   # legal representative
            temp_g_money = cells[2].select('p')[0].select('span')[0].text.replace('注册资本:', '').strip()  # registered capital
            temp_g_date = cells[2].select('p')[0].select('span')[1].text.replace('成立日期:', '').strip()   # registration date
            temp_r_email = cells[2].select('p')[1].text.split('\n')[1].strip().replace('邮箱:', '')         # email
            temp_r_phone = cells[2].select('p')[1].select('.m-l')[0].text.replace('电话:', '')              # phone
            temp_g_addr = cells[2].select('p')[2].text.strip().replace('地址:', '')                         # address
            temp_g_state = cells[3].select('.nstatus.text-success-lt.m-l-xs')[0].text.strip()              # company status
            g_name_list.append(temp_g_name)
            g_tag_list.append(temp_g_tag)
            r_name_list.append(temp_r_name)
            g_money_list.append(temp_g_money)
            g_date_list.append(temp_g_date)
            r_email_list.append(temp_r_email)
            r_phone_list.append(temp_r_phone)
            g_addr_list.append(temp_g_addr)
            g_state_list.append(temp_g_state)
    except Exception as e:
        print('Looks like access was denied... please try again later...')
        print('{} {}'.format(e, repr(e)))
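# The per-field lists (g_name_list and friends) are module-level accumulators,
# and the 'do not open the Excel file' hint suggests they are flushed to a
# spreadsheet after crawling. A minimal sketch of that export step, assuming
# pandas is available; the file name and column labels are illustrative only:
import pandas as pd


def save_to_excel(path='qichacha_result.xlsx'):
    """Assemble the accumulator lists into one sheet; keep the file closed while crawling."""
    frame = pd.DataFrame({
        'company': g_name_list,
        'tags': g_tag_list,
        'legal_person': r_name_list,
        'registered_capital': g_money_list,
        'founded': g_date_list,
        'email': r_email_list,
        'phone': r_phone_list,
        'address': g_addr_list,
        'status': g_state_list,
    })
    frame.to_excel(path, index=False)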