Example #1
def retry_crawl(url, isProxy):
    # Re-fetch the page up to spider_retry_num times, optionally through a proxy,
    # until the search result list ('m_srchList') appears in the response.
    response = None
    logging.error('Crawl error! Retrying page {}'.format(url))
    for i in range(spider_retry_num):
        logging.error('Retry attempt {}'.format(i + 1))
        try:
            if isProxy:
                proxy = _proxy()
                print('Using proxy {} to fetch page {}'.format(proxy, url))
                response = requests.get(url, headers=get_proxy_headers(proxy), proxies=proxy, timeout=spider_timeout)
            else:
                response = requests.get(url, headers=get_headers(), timeout=spider_timeout)
        except requests.exceptions.ProxyError as e:
            # logging.exception(e)
            logging.error(e)
            continue
        except requests.exceptions.ConnectTimeout as e:
            # logging.exception(e)
            logging.error(e)
            continue
        soup = BeautifulSoup(response.text, 'lxml')
        com_all_info = soup.find_all(class_='m_srchList')
        _response = response.text
        if len(com_all_info) > 0:
            break
        # elif '<script>window.location.href=' in _response:  # verification link shown when requests are too frequent
        #     verify_url = re.findall("<script>window.location.href='(.*?)';</script>", _response)[0]
        #     print('Qichacha flagged the crawler for frequent requests; open this link manually to verify: {}'.format(verify_url))
        #     # verify(verify_url)
        #     time.sleep(20)
        else:
            logging.error('================= Unexpected response =================')
            logging.error(response.text)
        time.sleep(random.randint(crawl_interval_mintime, crawl_interval_maxtime))
    return response
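
retry_crawl depends on module-level settings and helpers defined elsewhere in the project (spider_retry_num, spider_timeout, crawl_interval_mintime, crawl_interval_maxtime, get_headers, get_proxy_headers and _proxy). A minimal sketch of what those pieces might look like, written here only as an assumption so the snippet can run on its own:

import logging
import random
import time

import requests
from bs4 import BeautifulSoup

# Assumed configuration values; the real project keeps these in a settings module.
spider_retry_num = 5        # how many times a failed page is re-fetched
spider_timeout = 10         # per-request timeout in seconds
crawl_interval_mintime = 3  # lower bound of the random pause between attempts
crawl_interval_maxtime = 8  # upper bound of the random pause between attempts


def get_headers():
    # Hypothetical headers for plain (non-proxied) requests.
    return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}


def get_proxy_headers(proxy):
    # Hypothetical headers for proxied requests; simply reuses the plain headers here.
    return get_headers()


def _proxy():
    # Hypothetical proxy provider: ask a local proxy-pool service for one address
    # and return it in the dict format that requests expects for `proxies=`.
    ip_port = requests.get('http://127.0.0.1:5010/get/', timeout=5).text.strip()
    return {'http': 'http://' + ip_port, 'https': 'https://' + ip_port}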
Example #2
 f = open(enterprise_search_file, encoding='utf-8')
 enterprise_list = f.readlines()
 print('Starting duplicate check on the input file...')
 _enterprise_list = remove_repeat(enterprise_list)
 print('Total enterprises after de-duplication ============= {}'.format(len(_enterprise_list)))
 f.close()
 # Increase the number of connection retries
 requests.adapters.DEFAULT_RETRIES = 5
 # Close idle connections (disable keep-alive)
 s = requests.session()
 s.keep_alive = False
 i = 0
 for name in _enterprise_list:
     if is_proxy:
         try:
             _proxy()
         except Exception as e:
             print('======================== Please start the IP proxy program first =======================')
             break
     # Result set for the query
     data_list = []
     # Error result set for the query
     error_data_list = []
     i += 1
     start_url = base_url + str(name)
     # print(start_url)
     try:
         print("正在抓取第{}个公司==========================={}".format(
             i, name))
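
This loop calls remove_repeat() to de-duplicate the company names read from enterprise_search_file, but that helper is not shown. An assumed, minimal implementation that drops blank lines and duplicates while preserving the original order:

def remove_repeat(enterprise_list):
    # Assumed helper: strip whitespace, skip empty lines, and keep only the
    # first occurrence of each company name so the input order is preserved.
    seen = set()
    result = []
    for name in enterprise_list:
        name = name.strip()
        if name and name not in seen:
            seen.add(name)
            result.append(name)
    return result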
Example #3
 # for i in range(100):
 # generateCookie()
 # print('cookie=============={}'.format(cookies_local))
 # generateProxyCookie(_proxy())
 # print('Waiting one minute...')
 # time.sleep(60)
 # interval_time = time.time() - start_time
 # print(interval_time//60)
 # print(os.path.join(os.getcwd(), phantomjs_driver))
 # print(os.path.join(os.getcwd(), log_dir + r'\ghostdriver.log'))
 # print(ua.random)
 num = 0
 for i in range(100):
     url = 'https://www.qichacha.com/user_login'
     start_url = 'https://www.qichacha.com/search?key=安徽宝光特钢集团万里电力铁塔有限公司'
     proxy_ip = _proxy()
     uag = ua.random
     print(proxy_ip)
     print(uag)
     # get_proxy_headers(proxy_ip)
     # headers = {
     #     'Host': "www.qichacha.com",  # needs to be adjusted
     #     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
     #     "Accept-Encoding": "gzip, deflate",
     #     "Accept-Language": "en-US,en;q=0.5",
     #     # "Connection": "keep-alive",
     #     "User-Agent": uag
     # }
     # try:
     #     with requests.Session() as r:
     #         r.get(url, headers=headers, proxies=proxy_ip, timeout=20)
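
Example #3 is mostly scratch code for trying out proxies and random User-Agents against the Qichacha search page. A hedged sketch of how the commented-out session check could be completed, assuming ua is a fake_useragent.UserAgent instance and _proxy() returns a requests-style proxies dict (both assumptions, mirroring the commented lines above):

import requests
from fake_useragent import UserAgent

ua = UserAgent()


def check_proxy(proxy_ip, search_url):
    # Send a single proxied request with a random User-Agent and report
    # whether the proxy can reach the search page within the timeout.
    headers = {
        'Host': 'www.qichacha.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'User-Agent': ua.random,
    }
    try:
        with requests.Session() as s:
            resp = s.get(search_url, headers=headers, proxies=proxy_ip, timeout=20)
        return resp.status_code == 200
    except requests.exceptions.RequestException:
        return False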
Example #4
def craw(url, key_word, x):
    # Fetch one Qichacha search page for key_word and parse the company rows
    # into the global result lists; x was the page index in the paginated
    # version that is commented out below.
    User_Agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'

    #  if x == 0:
    #  re = 'http://www.qichacha.com/search?key='+key_word
    #  else:
    #  re = 'https://www.qichacha.com/search?key={}#p:{}&'.format(key_word,x-1)
    re = 'https://www.qichacha.com/search?key=' + key_word
    headers = {
        'Host': 'www.qichacha.com',
        'Connection': 'keep-alive',
        'Accept': 'text/html, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': re,
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'UM_distinctid=16d4d781a883e4-0d488cc5eb0bd7-3c375c0f-1fa400-16d4d781a897c2; _uab_collina=156896278000246547487356; acw_tc=73dc082815689627805371616ebe547185c282e64a38ad472dcfa708fb; zg_did=%7B%22did%22%3A%20%2216d4d781c73a19-0fb78f64911642-3c375c0f-1fa400-16d4d781c74a96%22%7D; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1569726533,1570519913,1570523672,1570582899; QCCSESSID=lk9l0os8brt74vg77ma7af10c7; CNZZDATA1254842228=747979137-1568960098-https%253A%252F%252Fsp0.baidu.com%252F%7C1570580514; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201570582899020%2C%22updated%22%3A%201570585759977%2C%22info%22%3A%201570519912754%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22cuid%22%3A%20%220096bdea6a9eec62a4a96030c7eee5f4%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1570585760',
    }

    try:
        # response = requests.get(url, headers=headers)
        response = requests.get(url,
                                headers=headers,
                                proxies=_proxy(),
                                timeout=5)
        if response.status_code != 200:
            response.encoding = 'utf-8'
            print(response.status_code)
            print('ERROR')
        soup = BeautifulSoup(response.text, 'lxml')
    except Exception as e:
        print('Even the request is blocked... is Qichacha trying to defy the heavens???')
        print(str(Exception) + ' ' + str(e) + ' ' + repr(e))
    try:
        com_all_info = soup.find_all(class_='m_srchList')[0].tbody
        com_all_info_array = com_all_info.select('tr')
        print('Starting to scrape data; please do not open the Excel file')
        for i in range(0, len(com_all_info_array)):
            # try:
            temp_g_name = com_all_info_array[i].select('td')[2].select('.ma_h1')[0].text  # company name
            temp_g_tag = com_all_info_array[i].select('td')[2].select('.search-tags')[0].text  # company tags
            temp_r_name = com_all_info_array[i].select('td')[2].select('p')[0].a.text  # legal representative name
            temp_g_money = com_all_info_array[i].select('td')[2].select('p')[0].select('span')[0].text.strip('注册资本:')  # registered capital
            temp_g_date = com_all_info_array[i].select('td')[2].select('p')[0].select('span')[1].text.strip('成立日期:')  # registration date
            temp_r_email = com_all_info_array[i].select('td')[2].select('p')[1].text.split('\n')[1].strip().strip('邮箱:')  # legal representative email
            temp_r_phone = com_all_info_array[i].select('td')[2].select('p')[1].select('.m-l')[0].text.strip('电话:')  # contact phone
            temp_g_addr = com_all_info_array[i].select('td')[2].select('p')[2].text.strip().strip('地址:')  # company address
            temp_g_state = com_all_info_array[i].select('td')[3].select('.nstatus.text-success-lt.m-l-xs')[0].text.strip()  # company status

            g_name_list.append(temp_g_name)
            g_tag_list.append(temp_g_tag)
            r_name_list.append(temp_r_name)
            g_money_list.append(temp_g_money)
            g_date_list.append(temp_g_date)
            r_email_list.append(temp_r_email)
            r_phone_list.append(temp_r_phone)
            g_addr_list.append(temp_g_addr)
            g_state_list.append(temp_g_state)

            #  except Exception:
            #  print('Error!')
    except Exception as e:
        print('Looks like access was denied... please try again later...')
        print(str(Exception) + ' ' + str(e) + ' ' + repr(e))
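
craw() appends its parsed fields into global lists (g_name_list, g_tag_list and so on) that are defined elsewhere, and the "do not open the Excel file" message suggests the results are eventually written to a spreadsheet. A sketch of that surrounding glue, assuming pandas is used for the export (the original project may save the workbook differently, and the file name here is hypothetical):

import pandas as pd

# Assumed module-level result lists that craw() appends into.
g_name_list, g_tag_list, r_name_list = [], [], []
g_money_list, g_date_list, r_email_list = [], [], []
r_phone_list, g_addr_list, g_state_list = [], [], []


def save_results(path='qichacha_results.xlsx'):
    # Collect the parallel result lists into one table and write it to Excel.
    df = pd.DataFrame({
        'company': g_name_list,
        'tags': g_tag_list,
        'legal_representative': r_name_list,
        'registered_capital': g_money_list,
        'registration_date': g_date_list,
        'email': r_email_list,
        'phone': r_phone_list,
        'address': g_addr_list,
        'status': g_state_list,
    })
    df.to_excel(path, index=False)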