def zjmyqyw_spdier():
    """Crawl zj123.com (浙江名营企业网): walk every industry category, page
    through its listings and persist each company's contact + profile data.

    Side effects only (HTTP requests via SuperSpider, two data_save() calls
    per company — one per phone field). Returns None.
    """
    # Rolling window of recently saved company names, used to skip duplicates.
    company_deque = deque([], maxlen=35)
    zjmyqyw = SuperSpider()
    zjmyqyw.source_name = '浙江名营企业网'
    zjmyqyw.fax = '-'
    zjmyqyw.get_request('http://www.zj123.com/')
    # Category links look like '.../1.html'; turn them into page templates.
    url_list1 = ['http://www.zj123.com/' + i.replace('1.', '{}.')
                 for i in zjmyqyw.data_search('find', '.indsort dd a', 'href')]
    profession_list = list(zjmyqyw.data_search('find', '.indsort dd a'))
    # Resume point: restart at the category where a previous run died.
    error_index = profession_list.index('特种印刷')
    for profession, url1 in zip(profession_list[error_index:], url_list1[error_index:]):
        for page in range(1, 100):
            print(f'{profession}——第{page}页')
            try:
                zjmyqyw.get_request(url1.format(page))
                # First token of the pager text is the page the site served.
                page_judge = next(zjmyqyw.data_search('find', '.sleft .m.m1 .fred')).split()[0]
            except Exception:
                # Was: bare except + dead `page += 1` (no-op in a for loop).
                print(f'获取第{page}页失败')
                continue
            # A mismatch means we ran past the last real page of the category.
            if int(page_judge) != page:
                break
            url_list2 = ('http://www.zj123.com/member/VIPContact/' + i.split('-')[1] + '/index.htm'
                         for i in zjmyqyw.data_search('find', '.listdetail22 .listdetail dt a', 'href'))
            url_list3 = ('http://www.zj123.com/member/VIPCompany/' + i.split('-')[1] + '/index.htm'
                         for i in zjmyqyw.data_search('find', '.listdetail22 .listdetail dt a', 'href'))
            for url2, url3 in zip(url_list2, url_list3):
                try:
                    zjmyqyw.get_request(url2)
                except Exception:
                    continue
                # 'label:value' rows of the contact table -> dict.
                contact_info_dict = {i.split(':')[0].strip(): i.split(':')[-1].strip().replace('\xa0', '')
                                     for i in zjmyqyw.data_search('find', '.rkbody table tr')}
                # .get(...) or '-' also covers a missing key (the old
                # dict[k]-style lookup crashed the run with KeyError).
                zjmyqyw.company_name = contact_info_dict.get('公司名称') or '-'
                if zjmyqyw.company_name in company_deque:
                    print('信息重复')
                    continue
                zjmyqyw.person_name = contact_info_dict.get('联系人') or '-'
                zjmyqyw.address = contact_info_dict.get('地 址') or '-'
                zjmyqyw.phone_number = contact_info_dict.get('电 话') or '-'
                zjmyqyw.qq = contact_info_dict.get('QQ') or '-'
                zjmyqyw.website = contact_info_dict.get('网 址') or '-'
                try:
                    zjmyqyw.get_request(url3)
                except Exception:
                    continue
                company_info_list = list(zjmyqyw.data_search('find', '.rkbody table tr td'))
                # Profile table alternates label/value cells; clamp the range so a
                # short table cannot raise IndexError (up to 12 pairs as before).
                company_info_dict = {company_info_list[n].strip(': '): company_info_list[n + 1].strip(': ')
                                     for n in range(0, min(24, len(company_info_list) - 1), 2)}
                zjmyqyw.main_product = company_info_dict.get('主营产品或服务') or '-'
                zjmyqyw.business_mode = company_info_dict.get('经营模式') or '-'
                zjmyqyw.company_type = company_info_dict.get('企业类型') or '-'
                zjmyqyw.register_money = company_info_dict.get('注册资本') or '-'
                # BUG FIX: the staff count ('员工人数') used to be written into
                # register_money, clobbering the registered capital saved above.
                zjmyqyw.staff_number = company_info_dict.get('员工人数') or '-'
                zjmyqyw.source_page = url2
                # Save once with the landline, then again with the mobile number.
                zjmyqyw.data_save()
                zjmyqyw.phone_number = contact_info_dict.get('手机') or '-'
                zjmyqyw.data_save()
                company_deque.append(zjmyqyw.company_name)
                print(f'{profession}——第{page}页——{zjmyqyw.company_name}信息导入完成')
    zjmyqyw.spider_end()
def zjmyqyw():
    """Legacy zj123.com (浙江名营企业网) crawler.

    NOTE(review): superseded by zjmyqyw_spdier() in this file — this draft has
    no error handling, no duplicate filtering, and uses the older attribute
    names (source / phone_code / cell_phone). The entry-point call below is
    commented out; kept for reference only.
    """
    # Shadows the function's own name; only the instance is used from here on.
    zjmyqyw = SuperSpider()
    zjmyqyw.source = '浙江名营企业网'
    zjmyqyw.fax = '-'
    zjmyqyw.get_request('http://www.zj123.com/')
    # Category URLs with a '{}' page-number placeholder.
    url_list1 = ('http://www.zj123.com/' + i.replace('1.', '{}.') for i in zjmyqyw.data_search('find', '.indsort dd a', 'href'))
    for url1 in url_list1:
        page = 1
        while True:
            print(f'第{page}页')
            # No try/except: any network failure aborts the whole run.
            zjmyqyw.get_request(url1.format(page))
            # First token of the pager is the page the site actually served;
            # a mismatch signals we ran past the last page of the category.
            page_judge = zjmyqyw.data_search('find', '.sleft .m.m1 .fred').__next__().split()[0]
            if int(page_judge) != page:
                break
            print(page_judge)
            url_list2 = ('http://www.zj123.com/member/VIPContact/' + i.split('-')[1] + '/index.htm' for i in zjmyqyw.data_search('find', '.listdetail22 .listdetail dt a', 'href'))
            url_list3 = ('http://www.zj123.com/member/VIPCompany/' + i.split('-')[1] + '/index.htm' for i in zjmyqyw.data_search('find', '.listdetail22 .listdetail dt a', 'href'))
            #print(url_list2)
            for url2, url3 in zip(url_list2, url_list3):
                zjmyqyw.get_request(url2)
                # 'label:value' rows of the contact table -> dict.
                contact_info_dict = {i.split(':')[0].strip(): i.split(':')[-1].strip().replace('\xa0', '') for i in zjmyqyw.data_search('find', '.rkbody table tr')}
                # NOTE(review): dict[k] raises KeyError when a label is absent;
                # the newer spider uses .get() for this reason.
                zjmyqyw.company_name = contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-'
                zjmyqyw.person_name = contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-'
                zjmyqyw.address = contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-'
                zjmyqyw.phone_code = contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-'
                zjmyqyw.cell_phone = contact_info_dict['手机'] if contact_info_dict['手机'] else '-'
                zjmyqyw.qq = contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-'
                zjmyqyw.website = contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-'
                zjmyqyw.get_request(url3)
                company_info_list = list(zjmyqyw.data_search('find', '.rkbody table tr td'))
                # Profile table alternates label/value cells; assumes >= 25
                # cells (12 pairs) — TODO confirm the page always provides them.
                company_info_dict = {company_info_list[n].strip(': '): company_info_list[n + 1].strip(': ') for n in range(0, 24, 2)}
                #print(company_info_dict)
                zjmyqyw.main_product = company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-'
                zjmyqyw.business_mode = company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-'
                zjmyqyw.company_type = company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-'
                zjmyqyw.register_money = company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-'
                zjmyqyw.data_save()
                print(f'浙江企业网——{zjmyqyw.company_name}信息导入完成')
            page += 1
    zjmyqyw.spider_end()


#zjmyqyw()
# test_obj=SuperSpider()
# js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
# test_obj.use_selenium()
# test_obj.selenium_js('https://www.china.cn/shukongjichuang/3746553522.html',js)
# test_obj.cell_phone=test_obj.selenium_search('css_selector','.inactive_top .number').__next__()
# print('aaaaaaa')
# print(test_obj.cell_phone)
def _wl114_scrape_company(wl114, url3, profession, page):
    """Fetch one company detail page plus its contact page and save two records
    (one with the mobile number, one with the landline).

    Best-effort: any fetch/parse failure returns silently, matching the old
    inline except/continue behavior.
    """
    try:
        wl114.get_request(url3)
        # 'label:value' list items on the right-hand info panel -> dict.
        company_info_dict = {
            i.split(':')[0].strip(): i.split(':')[-1].strip()
            for i in wl114.data_search('find', '.right.w_250 .border.p_8 li')
            if ':' in i
        }
        phone_url = next(wl114.data_search('find', '.right.w_250 .border.p_8 li a', 'href'))
    except Exception:
        return
    wl114.company_type = company_info_dict.get('企业性质', '-')
    wl114.main_product = company_info_dict.get('企业主营', '-')
    wl114.address = company_info_dict.get('企业地址', '-')
    try:
        wl114.get_request(phone_url)
    except Exception:
        return
    phone_info_data = wl114.data_search('find', 'td[valign="top"]:first-child')
    try:
        # Flattened text of the contact cell, one 'label:value' per line.
        phone_info_list = next(phone_info_data).split('\n')
        phone_info_dict = {
            i.split(':')[0].strip(): i.split(':')[-1].strip()
            for i in phone_info_list if ':' in i
        }
    except Exception:
        return
    # The site uses either '公司名称' or '企业名称' as the name label.
    wl114.company_name = phone_info_dict.get('公司名称', '-')
    if wl114.company_name == '-':
        wl114.company_name = phone_info_dict.get('企业名称', '-')
    wl114.person_name = phone_info_dict.get('联系人', '-')
    wl114.fax = phone_info_dict.get('传真', '-')
    wl114.phone_number = phone_info_dict.get('手机', '-')
    wl114.source_page = url3
    # Save once with the mobile number, then again with the landline.
    wl114.data_save()
    wl114.phone_number = phone_info_dict.get('联系电话', '-')
    wl114.data_save()
    print(f'{profession}——第{page}页——{wl114.company_name}信息导入完成')


def _wl114_crawl_category(wl114, url_fmt, profession):
    """Page through one category (url_fmt carries a '{}' page placeholder) and
    scrape every company listed on every page.

    This helper replaces two verbatim copies of the same ~60-line loop that
    previously lived inside wl114_spider().
    """
    try:
        wl114.get_request(url_fmt.format(1))
        # Pager text is '\xa0'-separated; token 1 is the total page count.
        all_page = next(wl114.data_search('find', '.page_p:not(span)')).split('\xa0')[1]
    except Exception:
        return
    for page in range(1, int(all_page) + 1):
        print(f'{profession}——第{page}页')
        try:
            wl114.get_request(url_fmt.format(page))
        except Exception:
            continue
        url_list3 = list(wl114.data_search('find', '.product_list_div_h143 h2 a', 'href'))
        # An empty listing page means the category is exhausted early.
        if not url_list3:
            break
        for url3 in url_list3:
            _wl114_scrape_company(wl114, url3, profession, page)


def wl114_spider():
    """Crawl net114.com (网络114): scrape company contacts from every product
    category — both direct '.html' categories and the '更多>>' hub pages.

    Side effects only (HTTP requests + data_save()); returns None.
    """
    wl114 = SuperSpider()
    wl114.source_name = '网络114'
    wl114.business_mode = '-'
    wl114.register_money = '-'
    wl114.website = '-'
    wl114.qq = '-'
    wl114.get_request('http://www.net114.com/')
    # Direct category links ('.html') become page templates ('-p-{}.html').
    url_list1 = [
        i.replace('.html', '-p-{}.html') for i in wl114.data_search(
            'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a', attr='href')
        if i.endswith('.html')
    ]
    profession_list1 = [
        i for i in wl114.data_search(
            'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a')
        if i != '更多>>'
    ]
    # Resume point: restart at the category where a previous run died.
    error_index = profession_list1.index('维护工具')
    # Hub pages ('更多>>' links) are everything that is NOT a '.html' link.
    url_list2 = (i for i in wl114.data_search(
        'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a', attr='href')
        if not i.endswith('.html'))
    # NOTE(review): the original also built an unused profession generator for
    # the hub links; it has been dropped.
    for url1, profession1 in zip(url_list1[error_index:], profession_list1[error_index:]):
        _wl114_crawl_category(wl114, url1, profession1)
    for url2 in url_list2:
        try:
            wl114.get_request(url2)
        except Exception:
            continue
        url_list4 = (i.replace('.html', '-p-{}.html') for i in wl114.data_search(
            'find', '.product_w369_list a[href]', 'href'))
        profession_list4 = wl114.data_search('find', '.product_w369_list a[href]')
        for profession4, url4 in zip(profession_list4, url_list4):
            _wl114_crawl_category(wl114, url4, profession4)
    wl114.spider_end()
def zggys_spider():
    """Crawl cn.china.cn (中国供应商) through rotating proxies pulled from the
    project's ip_pool table, saving one record per phone number per company.

    NOTE(review): a second `zggys_spider` defined later in this file shadows
    this one at import time — confirm which version is the live one.
    """
    zggys = SuperSpider(host='192.168.0.172', default_field='-')
    zggys.source_name = '中国供应商'
    # Proxy addresses from the project database; each row is a 1-tuple.
    proxies_list = zggys.sql_search('select ip from ip_pool')
    # Category URLs with a '?p={}' page placeholder.
    url_list1 = [
        i + '?p={}' for i in zggys.data_search(
            'https://cn.china.cn/',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/@href'
        )
    ]
    profession_list = zggys.data_search(
        'https://cn.china.cn/',
        '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/text()',
        'GBK')
    # Resume point: restart at the category where a previous run died.
    error_index = profession_list.index('睡袋')
    for url1, profession in zip(url_list1[error_index:], profession_list[error_index:]):
        page = 1
        while True:
            time.sleep(2)
            print(f'{profession}——第{page}页')
            # Listing fetch through up to 20 random proxies.
            # NOTE(review): there is no break on success, so all 20 attempts
            # run and url_list2 keeps the last successful result; if every
            # attempt fails, url_list2 is stale from the previous page (or
            # unbound on the very first one) — TODO confirm intended semantics.
            for i in range(20):
                proxies = random.choice(proxies_list)[0]
                print(f'使用代理-{proxies}')
                key = 'http' if not proxies.startswith('https') else 'https'
                try:
                    url_list2 = zggys.data_search(
                        url1.format(page),
                        '//ul[@class="extension_ul"]//h3[@class="title"]/a/@href',
                        'GBK',
                        proxies={key: proxies},
                        timeout=5)
                except Exception as error:
                    print(error)
                    continue
            # An empty listing page means the category is exhausted.
            if not url_list2:
                print(f'{profession}——第{page}页——没有数据')
                break
            for url2 in url_list2:
                # Same 20-attempt proxy rotation for the detail page; the
                # for/try/else runs the parsing only after a clean fetch, and
                # the trailing `break`s stop retrying once one attempt parsed.
                for i in range(20):
                    try:
                        time.sleep(2)
                        proxies = random.choice(proxies_list)[0]
                        print(f'使用代理-{proxies}')
                        key = 'http' if not proxies.startswith(
                            'https') else 'https'
                        html = zggys.get_html(url2,
                                              charset='GBK',
                                              proxies={key: proxies},
                                              timeout=5)
                        zggys.source_page = url2
                        if zggys.data_search(
                                html=html,
                                xpath='//div[@class="column_xx"]//p//a/text()'
                        ):
                            zggys.company_name = zggys.data_search(
                                html=html,
                                xpath='//div[@class="column_xx"]//p//a/text()'
                            )[0]
                        # Flat text of the business-info list, blanks removed.
                        company_info_list = [
                            i for i in zggys.data_search(
                                html=html,
                                xpath='//ul[@class="business_xx"]//li//text()')
                            if i.strip('\r\n |')
                        ]
                        # print(company_info_list)
                    except Exception as error:
                        print(error)
                        continue
                    else:
                        # Each value immediately follows its label in the flat
                        # list; missing labels keep the previous/default value.
                        try:
                            aim_index = company_info_list.index('经营模式')
                            zggys.business_mode = company_info_list[aim_index + 1]
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('注册资本')
                            zggys.register_money = company_info_list[
                                aim_index + 1].strip()
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('企业类型')
                            zggys.company_type = company_info_list[aim_index + 1]
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('主营产品')
                            zggys.main_product = company_info_list[aim_index + 1]
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('公司地址')
                            zggys.address = company_info_list[aim_index + 1]
                        except:
                            pass
                        try:
                            zggys.person_name = zggys.data_search(
                                html=html,
                                xpath=
                                '//div[@class="personal_top"]//div[@class="t"]//span/text()'
                            )[0]
                        except:
                            pass
                        phone_list = zggys.data_search(
                            html=html,
                            xpath='//div[@class="personal_bottom"]//span/text()'
                        )
                        if not phone_list:
                            # Hidden phone numbers need the selenium fallback
                            # below, currently disabled; the company is skipped.
                            # js=['var btn=document.querySelector(".see_a.inactive_scode");btn.click();']
                            # try:
                            #     zggys.selenium_open(url2)
                            #     zggys.selenium_js(js,sleep_time=2)
                            #     zggys.phone_number=zggys.selenium_search('css_selector','.inactive_top .number').__next__()
                            #     phone_info_dict={i.split('\n')[0]:i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector','.inactive_right .txt p')}
                            # except:
                            #     continue
                            # zggys.fax=phone_info_dict.get('传真','-').strip()
                            # zggys.qq=phone_info_dict.get('Q Q','-').strip()
                            # zggys.data_save()
                            # zggys.phone_number=phone_info_dict.get('电话','-').strip()
                            # zggys.data_save()
                            break
                        # One saved record per visible phone number.
                        for phone in phone_list:
                            zggys.phone_number = phone.strip()
                            zggys.data_save()
                        print(
                            f'{profession}—第{page}页—{zggys.company_name}信息导入完成'
                        )
                        break
            page += 1
    zggys.spider_end()
def xarcw_spider():
    """Crawl goodjobs.cn (新安人才网): log in, then for each keyword and city
    code scan up to 60 result pages, OCR the phone-number image with tesseract
    and save one record per job posting.

    Side effects only (HTTP requests + data_save()); returns None.
    """
    word_list = ['网络']
    xarcw = SuperSpider(host='192.168.0.172', default_field='-')
    xarcw.source_name = '新安人才网'
    # NOTE(review): password is masked in source control — supply the real one.
    data = {'memberName': '13155291086', 'password': '******'}
    xarcw.post_request('https://login.goodjobs.cn/index.php/action/UserLogin',
                       data=data)
    for word in word_list:
        # City codes 1043..1060 cover the site's Anhui city list.
        for city_code in range(1043, 1061):
            for page in range(1, 61):
                print(f'{word}-{city_code}-第{page}页')
                try:
                    url_list = xarcw.data_search(
                        f'https://search.goodjobs.cn/index.php?keyword={word}&boxwp=c{city_code}&page={page}',
                        '//div[@class="dw_table"]//span[@class="e1"]/a/@href')
                except Exception:
                    print(f'{word}-{city_code}-第{page}页获取失败')
                    continue
                # Empty result page => no more pages for this city/keyword.
                if not url_list:
                    print(f'{word}-{city_code}-第{page}页-爬取结束')
                    break
                for url in url_list:
                    xarcw.source_page = url
                    time.sleep(1)
                    # Primary page layout: [name, info line, contacts, phone
                    # image, address].
                    data_list = xarcw.data_search(url, [
                        '//p[@class="cname"]/a/text()',
                        '//p[@class="msg ltype"]/text()',
                        '//div[@class="w706 clearfix"]/text()',
                        '//div[@class="w706 clearfix"]/img/@src',
                        '//div[@class="comadress clearfix"]/text()'
                    ])
                    if not data_list[0]:
                        # BUG FIX: this alternate-layout fallback was dead code
                        # — the old version did `continue` on empty data_list[0]
                        # BEFORE trying the alternate selectors.
                        data_list = xarcw.data_search(url, [
                            '//div[@class="w240 whitespace pb16"]//a[@class="org"]/text()',
                            '//div[@class="w240 whitespace pb16"]//p[@class="grey lh28"]/span[@class="black"]/text()',
                            '//p[@class="duol mt20"]/text()',
                            '//p[@class="duol mt20"]/img/@src',
                            '//div[@class="comadress clearfix"]/text()'
                        ])
                        if not data_list[0] or not data_list[3]:
                            continue
                        xarcw.company_type = data_list[1][0]
                        xarcw.main_product = data_list[1][2]
                    else:
                        if not data_list[3]:
                            continue
                        # Primary layout: 'type | staff | product' in one line.
                        company_info_list = [
                            i.strip('\xa0\xa0\n ')
                            for i in data_list[1][0].split('|')
                        ]
                        xarcw.company_type = company_info_list[0]
                        for j in company_info_list[1:]:
                            # Staff counts are ranges like '50-100'.
                            if '-' in j:
                                xarcw.staff_number = j
                            else:
                                xarcw.main_product = j
                    xarcw.company_name = data_list[0][0]
                    # First non-blank line of the contact block is the person.
                    person_candidates = [i for i in data_list[2] if i.strip()]
                    xarcw.person_name = person_candidates[0] if person_candidates else '-'
                    try:
                        # Phone number is served as an image; OCR it.
                        xarcw.phone_number = xarcw.use_tesseract(
                            url=data_list[3][0], lang=None)
                    except Exception:
                        continue
                    xarcw.address = data_list[4][0].strip('工作地点:\u3000\n ')
                    xarcw.data_save()
                    print(
                        f'{xarcw.company_name}-{xarcw.person_name}-{xarcw.phone_number}-导入完成'
                    )
def zggys_spider():
    """Legacy selenium-assisted cn.china.cn (中国供应商) crawler.

    NOTE(review): duplicates the name of the proxy-rotating zggys_spider()
    earlier in this file — whichever is defined later wins at import time.
    Uses the older attribute names (source / phone_code / cell_phone).
    """
    zggys = SuperSpider(use_selenium=True)
    zggys.source = '中国供应商'
    zggys.website = '-'
    zggys.get_request('https://cn.china.cn/')
    # Category URLs with a '?p={}' page placeholder.
    url_list1 = (i + '?p={}' for i in zggys.data_search('xpath', '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a', 'href'))
    for url1 in url_list1:
        # NOTE(review): starts at page 10, not 1 — looks like a leftover resume
        # point from a previous run; confirm before reusing.
        page = 10
        while True:
            print(f'第{page}页')
            try:
                zggys.get_request(url1.format(page))
            except:
                print(f'获取第{page}页失败')
                page += 1
                continue
            url_list2 = zggys.data_search('find', 'h3.title a', 'href')
            # Empty listing page => category exhausted.
            if not url_list2:
                break
            for url2 in url_list2:
                try:
                    zggys.get_request(url2)
                    zggys.company_name = zggys.data_search('find', '.column_xx p a', 'title').__next__()
                except:
                    continue
                # Business-info block: one 'label|value' per line.
                company_info_list = (i for i in zggys.data_search('find', '.business_xx').__next__().split('\n') if '|' in i)
                company_info_dict = {i.split('|')[0]: i.split('|')[1] for i in company_info_list}
                zggys.business_mode = company_info_dict.get('经营模式', '-')
                zggys.register_money = company_info_dict.get('注册资本', '-')
                zggys.company_type = company_info_dict.get('企业类型', '-')
                zggys.main_product = company_info_dict.get('主营产品', '-')
                zggys.address = company_info_dict.get('公司地址', '-')
                #print(business_mode,register_money,company_type,main_product,address)
                zggys.person_name = zggys.data_search('find', '.personal_top .t span').__next__()
                phone_list = zggys.data_search('find', '.personal_bottom span')
                #print(phone_list)
                cell_phone_list = []
                phone_code_list = []
                for phone in phone_list:
                    if not phone:
                        # Phone hidden behind a "show" button: click it via
                        # selenium and parse the revealed contact panel.
                        js = 'var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
                        zggys.selenium_js(url2, js)
                        zggys.cell_phone = zggys.selenium_search('css_selector', '.inactive_top .number').__next__()
                        phone_info_dict = {i.split('\n')[0]: i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector', '.inactive_right .txt p')}
                        zggys.phone_code = phone_info_dict.get('电话', '-')
                        zggys.fax = phone_info_dict.get('传真', '-')
                        zggys.qq = phone_info_dict.get('Q Q', '-')
                    else:
                        # Chinese mobile numbers start with '1'; everything
                        # else is treated as a landline.
                        if not phone.startswith('1'):
                            phone_code_list.append(phone)
                        else:
                            cell_phone_list.append(phone)
                # Visible numbers override whatever the selenium branch set.
                if cell_phone_list or phone_code_list:
                    zggys.phone_code = '/'.join(phone_code_list) if phone_code_list else '-'
                    zggys.cell_phone = '/'.join(cell_phone_list) if cell_phone_list else '-'
                    zggys.fax = '-'
                    zggys.qq = '-'
                zggys.data_save()
                print(f'中国供应商——{zggys.company_name}信息导入完成')
            page += 1
    zggys.spider_end()
def skb_spider(phone, passwd, word, page_now=1):
    """Log into biz.lixiaoskb.com (搜客宝) with selenium, search `word`, and
    save contact data for every result card.

    Args:
        phone:    login account (mobile number).
        passwd:   login password.
        word:     search keyword.
        page_now: page to resume from (1-based).

    Returns:
        (word, last_page) so a driving loop can persist the resume position.
    """
    skb = SuperSpider(use_selenium=True)
    skb.source_name = '搜客宝'
    skb.fax = '-'
    skb.staff_number = '-'
    skb.selenium_open('https://biz.lixiaoskb.com/login')
    skb.selenium_input('xpath', '//*[@id="app"]/div[1]/div/div/div[1]/div[2]/div[2]/form/div[1]/div/div/div/input', phone)
    skb.selenium_input('xpath', '//*[@id="app"]/div[1]/div/div/div[1]/div[2]/div[2]/form/div[2]/div/div/div/input', passwd, enter=True, sleep_time=3)
    # Switch to the first search tab before typing the keyword.
    js3 = 'document.querySelector("#tab-0").click();'
    skb.selenium_js([js3])
    skb.selenium_input('xpath', '//*[@id="searchDeInput"]/div[1]/div/input', word, sleep_time=5, enter=True)
    # Site caps results at 500 pages.
    all_page = 500
    if int(page_now) == int(all_page):
        print(f'{word}——所有数据爬取结束')
        skb.spider_end()
        return word, int(all_page)
    for page in range(page_now, int(all_page) + 1):
        print(f'{word}——第{page}页')
        try:
            # Jump to the wanted page via the pager's input box.
            skb.selenium_scroll('//div[@id="jumpPage"]//input[@class="el-input__inner"]')
            skb.selenium_input('css_selector', '#jumpPage .el-input input', page, sleep_time=2, enter=True)
        except Exception as e:
            print(e)
            continue
        url_list = skb.selenium_search('xpath', f'//div[@class="card"]//span[@class="name"]//a', attr='href')
        for url in url_list:
            skb.source_page = url
            # Open the company detail page in a new tab and switch to it.
            js1 = f'window.open("{url}")'
            skb.selenium_js([js1], sleep_time=3)
            skb.switch_window()
            try:
                skb.company_name = skb.selenium_search('css_selector', '.top .name').__next__()
            except Exception as e:
                print(e)
                skb.window_close()
                skb.switch_window(sleep_time=2)
                continue
            try:
                # 'label:value' rows of the summary panel -> dict.
                company_info_dict1 = {i.split(':')[0].strip(): i.split(':')[-1].strip() for i in skb.selenium_search('css_selector', '.line .group')}
                skb.company_type = company_info_dict1.get('公司类型', '-')
                skb.address = company_info_dict1.get('通讯地址', '-')
                # NOTE(review): assigned to a local and never written to
                # skb.business_mode — probably a bug; confirm before fixing.
                business_mode = company_info_dict1.get('所属行业', '-')
                skb.website = company_info_dict1.get('官方网站', '-').strip('更多>> ')
            except:
                pass
            try:
                # Registration table: 'label\n...\nvalue' cells -> dict.
                company_info_dict2 = {i.split('\n')[0].strip('/ '): i.split('\n')[-1].strip('/ ') for i in skb.selenium_search('css_selector', '.gongshang-col')}
                skb.person_name = company_info_dict2.get('法人/负责人', '-')
                skb.register_money = company_info_dict2.get('注册资本', '-')
                skb.main_product = company_info_dict2.get('经营范围', '-')
            except:
                pass
            # Click the "reveal contact info" button.
            js2 = 'var open_btn=document.querySelector(".mask-box .action span");open_btn.click();'
            try:
                skb.selenium_js([js2], sleep_time=3)
            except Exception as e:
                print(e)
                # Fallback path when the reveal click fails (e.g. the info is
                # already visible): parse whatever the scroll panel shows.
                # NOTE(review): this block is a near-verbatim copy of the
                # success path below — TODO deduplicate into a helper.
                phone_list = []
                qq_list = []
                try:
                    phone_info = skb.selenium_search('css_selector', '.el-scrollbar__view')
                    phone_info_list = list(phone_info)[1].split('\n')
                except Exception as e:
                    print(e)
                    skb.window_close()
                    skb.switch_window(sleep_time=2)
                    continue
                #print(phone_info_list)
                # The panel text is a flat list where each value neighbors its
                # label ('选 择' follows the phone number; others precede).
                for i, j in enumerate(phone_info_list):
                    if j == '选 择':
                        skb.phone_number = phone_info_list[i - 1]
                    elif j == '联系人':
                        skb.person_name = phone_info_list[i + 1]
                    elif j == 'qq号码':
                        skb.qq = phone_info_list[i + 1].strip(',')
                    elif j == '电子邮箱':
                        skb.mail = phone_info_list[i + 1].strip(',')
                try:
                    skb.data_save()
                except:
                    continue
                print(f'{word}——第{page}页——{skb.company_name}信息导入完成')
                skb.window_close()
                skb.switch_window(sleep_time=2)
                continue
            # Success path: same parse of the revealed contact panel.
            phone_list = []
            qq_list = []
            try:
                phone_info = skb.selenium_search('css_selector', '.el-scrollbar__view')
                phone_info_list = list(phone_info)[1].split('\n')
            except Exception as e:
                print(e)
                skb.window_close()
                skb.switch_window(sleep_time=2)
                continue
            #print(phone_info_list)
            for i, j in enumerate(phone_info_list):
                if j == '选 择':
                    skb.phone_number = phone_info_list[i - 1]
                elif j == '联系人':
                    skb.person_name = phone_info_list[i + 1]
                elif j == 'qq号码':
                    skb.qq = phone_info_list[i + 1].strip(',')
                elif j == '电子邮箱':
                    skb.mail = phone_info_list[i + 1].strip(',')
            try:
                skb.data_save()
            except:
                continue
            print(f'{word}——第{page}页——{skb.company_name}信息导入完成')
            # Daily reveal quota shown in the user widget; stop when exhausted.
            use_number = skb.selenium_search('css_selector', '.inner-user .viewCount:first-child').__next__()
            print(use_number)
            if int(use_number) == 0:
                print(f'{word}——第{page}页——今日次数已用完')
                skb.spider_end()
                return word, page
            skb.window_close()
            skb.switch_window(sleep_time=2)
    skb.spider_end()
    print(f'{word}——所有数据爬取结束')
    return word, int(all_page)
def zgcpw_spider():
    """Scrape pe168.com (中国产品网): for each profession category, walk the
    paginated listings and persist every company's profile and contact data
    (two saved records per company — landline and mobile)."""
    zgcpw = SuperSpider()
    # Rolling window of recently handled company names, used for de-duplication.
    company_list = deque([], maxlen=35)
    zgcpw.source_name = '中国产品网'
    zgcpw.get_request('http://www.pe168.com/')
    url_list1 = zgcpw.data_search('find', 'td div:nth-child(2) a', 'href')
    profession_list = zgcpw.data_search('find', 'td div:nth-child(2) a')

    def pick(cells, label):
        # The value cell directly follows its label cell; '-' when absent.
        return cells[cells.index(label) + 1] if label in cells else '-'

    for profession, url1 in zip(profession_list, url_list1):
        try:
            zgcpw.get_request(url1)
            # The pager cite reads like '.../N页'; extract the page total.
            pager_text = next(zgcpw.data_search('find', '.pages cite'))
            page_all_number = next(zgcpw.re_find(r'/(\d+)页', pager_text)).group(1)
        except:
            continue
        for page in range(1, int(page_all_number) + 1):
            print(f'{profession}——第{page}页')
            # Listing pages follow the '...-<page>.html' naming scheme.
            url2 = url1.replace('.html', f'-{page}.html')
            try:
                zgcpw.get_request(url2)
            except:
                continue
            url_list3 = zgcpw.data_search('find', '.left_box form tr ul li:nth-last-child(1) a', 'href')
            company_list3 = zgcpw.data_search('find', '.left_box form tr ul li:nth-last-child(1) a')
            for company_name, url3 in zip(company_list3, url_list3):
                if company_name in company_list:
                    print('信息重复')
                    continue
                company_list.append(company_name)
                zgcpw.company_name = company_name
                try:
                    zgcpw.get_request(url3)
                except:
                    continue
                zgcpw.source_page = url3
                # Company profile lives behind the '公司介绍' tab.
                try:
                    company_info_url = next(zgcpw.data_search('find', 'a[title="公司介绍"]', 'href'))
                except:
                    company_list.append(company_name)
                    continue
                try:
                    zgcpw.get_request(company_info_url)
                except:
                    continue
                profile_cells = list(zgcpw.data_search('find', '.main_body:nth-last-child(1) td'))
                zgcpw.company_type = pick(profile_cells, '公司类型:')
                zgcpw.staff_number = pick(profile_cells, '公司规模:')
                zgcpw.register_money = pick(profile_cells, '注册资本:')
                zgcpw.business_mode = pick(profile_cells, '经营模式:')
                zgcpw.main_product = pick(profile_cells, '经营范围:')
                # Contact details live behind the '联系方式' tab.
                try:
                    phone_info_url = next(zgcpw.data_search('find', 'a[title="联系方式"]', 'href'))
                except:
                    company_list.append(company_name)
                    continue
                try:
                    zgcpw.get_request(phone_info_url)
                except:
                    continue
                contact_cells = list(zgcpw.data_search('find', '.px13.lh18 td'))
                zgcpw.address = pick(contact_cells, '公司地址:')
                zgcpw.fax = pick(contact_cells, '公司传真:')
                zgcpw.website = pick(contact_cells, '公司网址:')
                zgcpw.person_name = pick(contact_cells, '联 系 人:')
                # Save once with the landline, then again with the mobile.
                zgcpw.phone_number = pick(contact_cells, '公司电话:')
                zgcpw.data_save()
                zgcpw.phone_number = pick(contact_cells, '手机号码:')
                zgcpw.data_save()
                print(f'{profession}——第{page}页——{company_name}导入完成')
    zgcpw.spider_end()