def zjmyqyw_spdier():
	"""Crawl company contact and profile data from zj123.com (浙江名营企业网).

	Walks every industry category on the home page (resuming from the
	'特种印刷' category), pages through each category listing, then visits
	each company's contact page and company page.  Each company is saved
	twice via ``SuperSpider.data_save``: once with the landline ('电 话')
	and once with the cell phone ('手机').
	"""
	# Rolling window of recently seen company names used for de-duplication.
	company_deque=deque([],maxlen=35)
	zjmyqyw=SuperSpider()
	zjmyqyw.source_name='浙江名营企业网'
	zjmyqyw.fax='-'
	zjmyqyw.get_request('http://www.zj123.com/')
	# Category listing URLs with the page number turned into a '{}' slot.
	url_list1=['http://www.zj123.com/'+i.replace('1.','{}.') for i in zjmyqyw.data_search('find','.indsort dd a','href')]
	profession_list=list(zjmyqyw.data_search('find','.indsort dd a'))
	# Resume point left from a previously interrupted run.
	error_index=profession_list.index('特种印刷')
	for profession,url1 in zip(profession_list[error_index:],url_list1[error_index:]):
		for page in range(1,100):
			print(f'{profession}——第{page}页')
			try:
				zjmyqyw.get_request(url1.format(page))
				# First token of the pager text is the page number now shown.
				page_judge=zjmyqyw.data_search('find','.sleft .m.m1 .fred').__next__().split()[0]
			except Exception:
				print(f'获取第{page}页失败')
				continue
			# Past the last page the site reports a different number: stop.
			if int(page_judge) != page:
				break
			# Contact-page / company-page URLs derived from the listing links.
			url_list2=('http://www.zj123.com/member/VIPContact/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			url_list3=('http://www.zj123.com/member/VIPCompany/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			for url2,url3 in zip(url_list2,url_list3):
				try:
					zjmyqyw.get_request(url2)
				except Exception:
					continue
				# Contact rows look like '键:值'; '\xa0' is a stray &nbsp;.
				contact_info_dict={i.split(':')[0].strip():i.split(':')[-1].strip().replace('\xa0','') for i in zjmyqyw.data_search('find','.rkbody table tr')}
				# .get(...) or '-' also covers a missing key, which used to
				# raise KeyError and kill the whole run.
				zjmyqyw.company_name=contact_info_dict.get('公司名称') or '-'
				if zjmyqyw.company_name in company_deque:
					print('信息重复')
					continue
				zjmyqyw.person_name=contact_info_dict.get('联系人') or '-'
				zjmyqyw.address=contact_info_dict.get('地 址') or '-'
				zjmyqyw.phone_number=contact_info_dict.get('电 话') or '-'
				zjmyqyw.qq=contact_info_dict.get('QQ') or '-'
				zjmyqyw.website=contact_info_dict.get('网 址') or '-'
				try:
					zjmyqyw.get_request(url3)
				except Exception:
					continue
				company_info_list=list(zjmyqyw.data_search('find','.rkbody table tr td'))
				# Cells alternate label/value for the first 12 pairs.
				company_info_dict={company_info_list[n].strip(': '):company_info_list[n+1].strip(': ') for n in range(0,24,2)}
				zjmyqyw.main_product=company_info_dict.get('主营产品或服务') or '-'
				zjmyqyw.business_mode=company_info_dict.get('经营模式') or '-'
				zjmyqyw.company_type=company_info_dict.get('企业类型') or '-'
				zjmyqyw.register_money=company_info_dict.get('注册资本') or '-'
				# BUG FIX: '员工人数' (staff count) used to overwrite
				# register_money one line after it was set from '注册资本';
				# store it in staff_number as the other spiders do.
				zjmyqyw.staff_number=company_info_dict.get('员工人数') or '-'
				zjmyqyw.source_page=url2
				# Save once with the landline, then again with the cell phone.
				zjmyqyw.data_save()
				zjmyqyw.phone_number=contact_info_dict.get('手机') or '-'
				zjmyqyw.data_save()
				company_deque.append(zjmyqyw.company_name)
				print(f'{profession}——第{page}页——{zjmyqyw.company_name}信息导入完成')
	zjmyqyw.spider_end()
# Example #2
# 0
def zjmyqyw():
	"""Crawl company contact and profile data from zj123.com (浙江名营企业网).

	Earlier draft of the zj123 spider: walks every industry category from
	the home page, pages through each listing until the pager's reported
	page no longer matches the requested one, and saves one record per
	company via ``SuperSpider.data_save``.

	NOTE(review): there is no error handling here — any failed request or
	missing dict key aborts the entire run.  See the hardened variant of
	this spider elsewhere in the file.
	"""
	zjmyqyw=SuperSpider()
	zjmyqyw.source='浙江名营企业网'
	zjmyqyw.fax='-'
	zjmyqyw.get_request('http://www.zj123.com/')
	# Category listing URLs with the page number as a '{}' placeholder.
	url_list1=('http://www.zj123.com/'+i.replace('1.','{}.') for i in zjmyqyw.data_search('find','.indsort dd a','href'))
	for url1 in url_list1:
		page=1
		while True:
			print(f'第{page}页')
			zjmyqyw.get_request(url1.format(page))
			# First token of the pager text is the page number now shown.
			page_judge=zjmyqyw.data_search('find','.sleft .m.m1 .fred').__next__().split()[0]
			# Past the last page the site reports a different number: stop.
			if int(page_judge) != page:
				break
			print(page_judge)
			# Contact-page and company-page URLs built from the listing links;
			# both are generators consumed lazily by the zip below.
			url_list2=('http://www.zj123.com/member/VIPContact/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			url_list3=('http://www.zj123.com/member/VIPCompany/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			#print(url_list2)
			for url2,url3 in zip(url_list2,url_list3):
				zjmyqyw.get_request(url2)
				# Contact rows look like '键:值'; '\xa0' is a stray &nbsp;.
				contact_info_dict={i.split(':')[0].strip():i.split(':')[-1].strip().replace('\xa0','') for i in zjmyqyw.data_search('find','.rkbody table tr')}
				zjmyqyw.company_name=contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-'
				zjmyqyw.person_name=contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-'
				zjmyqyw.address=contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-'
				zjmyqyw.phone_code=contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-'
				zjmyqyw.cell_phone=contact_info_dict['手机'] if contact_info_dict['手机'] else '-'
				zjmyqyw.qq=contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-'
				zjmyqyw.website=contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-'
				zjmyqyw.get_request(url3)
				company_info_list=list(zjmyqyw.data_search('find','.rkbody table tr td'))
				# Cells alternate label/value for the first 12 pairs.
				company_info_dict={company_info_list[n].strip(': '):company_info_list[n+1].strip(': ') for n in range(0,24,2)}
				#print(company_info_dict)
				zjmyqyw.main_product=company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-'
				zjmyqyw.business_mode=company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-'
				zjmyqyw.company_type=company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-'
				zjmyqyw.register_money=company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-'
				zjmyqyw.data_save()
				print(f'浙江企业网——{zjmyqyw.company_name}信息导入完成')
			page+=1
	zjmyqyw.spider_end()
#zjmyqyw()

# test_obj=SuperSpider()
# js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
# test_obj.use_selenium()
# test_obj.selenium_js('https://www.china.cn/shukongjichuang/3746553522.html',js)
# test_obj.cell_phone=test_obj.selenium_search('css_selector','.inactive_top .number').__next__()
# print('aaaaaaa')
# print(test_obj.cell_phone)
# Example #3
# 0
def _wl114_scrape_company(wl114, url3, profession, page):
    """Scrape one company detail page and save its records.

    Reads the sidebar info block, follows the contact link, then saves two
    records via ``data_save``: one with the cell phone ('手机') and one
    with the landline ('联系电话').  Returns silently (skipping the
    company) on any fetch or parse failure.
    """
    try:
        wl114.get_request(url3)
        # Sidebar lines look like '键:值'.
        company_info_dict = {
            i.split(':')[0].strip(): i.split(':')[-1].strip()
            for i in wl114.data_search('find', '.right.w_250 .border.p_8 li')
            if ':' in i
        }
        phone_url = wl114.data_search(
            'find', '.right.w_250 .border.p_8 li a', 'href').__next__()
    except Exception:
        return
    wl114.company_type = company_info_dict.get('企业性质', '-')
    wl114.main_product = company_info_dict.get('企业主营', '-')
    wl114.address = company_info_dict.get('企业地址', '-')
    try:
        wl114.get_request(phone_url)
    except Exception:
        return
    phone_info_data = wl114.data_search('find', 'td[valign="top"]:first-child')
    try:
        phone_info_list = phone_info_data.__next__().split('\n')
        phone_info_dict = {
            i.split(':')[0].strip(): i.split(':')[-1].strip()
            for i in phone_info_list if ':' in i
        }
    except Exception:
        return
    wl114.company_name = phone_info_dict.get('公司名称', '-')
    if wl114.company_name == '-':
        # Some pages label the field '企业名称' instead of '公司名称'.
        wl114.company_name = phone_info_dict.get('企业名称', '-')
    wl114.person_name = phone_info_dict.get('联系人', '-')
    wl114.fax = phone_info_dict.get('传真', '-')
    wl114.phone_number = phone_info_dict.get('手机', '-')
    wl114.source_page = url3
    wl114.data_save()
    wl114.phone_number = phone_info_dict.get('联系电话', '-')
    wl114.data_save()
    print(f'{profession}——第{page}页——{wl114.company_name}信息导入完成')


def _wl114_scrape_profession(wl114, url_fmt, profession):
    """Page through one profession listing.

    ``url_fmt`` must contain a '{}' slot for the page number.  Stops early
    when a listing page yields no product links; returns silently when the
    page count cannot be determined.
    """
    try:
        wl114.get_request(url_fmt.format(1))
        # Pager text is '\xa0'-separated; field 1 is the total page count.
        all_page = wl114.data_search(
            'find', '.page_p:not(span)').__next__().split('\xa0')[1]
    except Exception:
        return
    for page in range(1, int(all_page) + 1):
        print(f'{profession}——第{page}页')
        try:
            wl114.get_request(url_fmt.format(page))
        except Exception:
            continue
        url_list3 = list(
            wl114.data_search('find', '.product_list_div_h143 h2 a', 'href'))
        if not url_list3:
            # Empty listing page: no more results for this profession.
            break
        for url3 in url_list3:
            _wl114_scrape_company(wl114, url3, profession, page)


def wl114_spider():
    """Crawl company data from net114.com (网络114).

    First pass: every direct profession link from the home page's product
    center (resuming from '维护工具').  Second pass: every '更多>>' category
    page, expanding it into its sub-profession listings.  Both passes share
    the same listing/detail scraping code (the helpers above); the original
    implementation duplicated it verbatim.
    """
    wl114 = SuperSpider()
    wl114.source_name = '网络114'
    # Fields this site never provides.
    wl114.business_mode = '-'
    wl114.register_money = '-'
    wl114.website = '-'
    wl114.qq = '-'
    wl114.get_request('http://www.net114.com/')
    # Direct profession links end in '.html'; page number becomes a '{}' slot.
    url_list1 = [
        i.replace('.html', '-p-{}.html') for i in wl114.data_search(
            'xpath',
            '//*[@id="product_center_content"]/div/ul/li/p/a',
            attr='href') if i.endswith('.html')
    ]
    profession_list1 = [
        i for i in wl114.data_search(
            'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a')
        if i != '更多>>'
    ]
    # Resume point left from a previously interrupted run.
    error_index = profession_list1.index('维护工具')
    # '更多>>' category links (no '.html' suffix).  NOTE(review): this
    # generator is only consumed after the first pass has issued many
    # requests; kept lazy exactly as in the original in case data_search
    # evaluates against the spider's current page.
    url_list2 = (i for i in wl114.data_search(
        'xpath',
        '//*[@id="product_center_content"]/div/ul/li/p/a',
        attr='href') if not i.endswith('.html'))
    for url1, profession1 in zip(url_list1[error_index:],
                                 profession_list1[error_index:]):
        _wl114_scrape_profession(wl114, url1, profession1)
    for url2 in url_list2:
        try:
            wl114.get_request(url2)
        except Exception:
            continue
        url_list4 = (i.replace('.html', '-p-{}.html')
                     for i in wl114.data_search(
                         'find', '.product_w369_list a[href]', 'href'))
        profession_list4 = wl114.data_search('find',
                                             '.product_w369_list a[href]')
        for profession4, url4 in zip(profession_list4, url_list4):
            _wl114_scrape_profession(wl114, url4, profession4)
    wl114.spider_end()
# Example #4
# 0
def zggys_spider():
	"""Crawl supplier data from cn.china.cn (中国供应商).

	Pages through every category found on the home page; for each company
	reads the profile sidebar and the contact block.  When the inline phone
	spans are empty the numbers are hidden behind a reveal button, so a
	Selenium fallback clicks it and scrapes the contact pop-up instead.
	Saves one record per company via ``SuperSpider.data_save``.
	"""
	zggys=SuperSpider(use_selenium=True)
	zggys.source='中国供应商'
	zggys.website='-'
	zggys.get_request('https://cn.china.cn/')
	# Category URLs with a '?p={}' page placeholder.
	url_list1=(i+'?p={}' for i in zggys.data_search('xpath','//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a','href'))
	for url1 in url_list1:
		# NOTE(review): starts at page 10 — looks like a resume point left
		# over from an interrupted run; confirm before reuse.
		page=10
		while True:
			print(f'第{page}页')
			try:
				zggys.get_request(url1.format(page))
			except Exception:
				print(f'获取第{page}页失败')
				page+=1
				continue
			# BUG FIX: data_search returns a generator, which is always
			# truthy, so 'if not url_list2' never fired and this while-loop
			# could never break.  Materialize to a list first (as the other
			# spiders in this file already do).
			url_list2=list(zggys.data_search('find','h3.title a','href'))
			if not url_list2:
				break
			for url2 in url_list2:
				try:
					zggys.get_request(url2)
					zggys.company_name=zggys.data_search('find','.column_xx p a','title').__next__()
				except Exception:
					continue
				# Sidebar lines look like '经营模式|值'.
				company_info_list=(i for i in zggys.data_search('find','.business_xx').__next__().split('\n') if '|' in i)
				company_info_dict={i.split('|')[0]:i.split('|')[1] for i in company_info_list}
				zggys.business_mode=company_info_dict.get('经营模式','-')
				zggys.register_money=company_info_dict.get('注册资本','-')
				zggys.company_type=company_info_dict.get('企业类型','-')
				zggys.main_product=company_info_dict.get('主营产品','-')
				zggys.address=company_info_dict.get('公司地址','-')
				zggys.person_name=zggys.data_search('find','.personal_top .t span').__next__()
				phone_list=zggys.data_search('find','.personal_bottom span')
				cell_phone_list=[]
				phone_code_list=[]
				for phone in phone_list:
					if not phone:
						# Empty span means the number is hidden: click the
						# reveal button with Selenium and scrape the pop-up.
						js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
						zggys.selenium_js(url2,js)
						zggys.cell_phone=zggys.selenium_search('css_selector','.inactive_top .number').__next__()
						# NOTE(review): str.strip('QQ交谈') strips that SET of
						# characters from both ends, not the literal suffix.
						phone_info_dict={i.split('\n')[0]:i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector','.inactive_right .txt p')}
						zggys.phone_code=phone_info_dict.get('电话','-')
						zggys.fax=phone_info_dict.get('传真','-')
						zggys.qq=phone_info_dict.get('Q  Q','-')
					else:
						# Chinese cell numbers start with '1'; everything
						# else is treated as a landline with area code.
						if not phone.startswith('1'):
							phone_code_list.append(phone)
						else:
							cell_phone_list.append(phone)
				# Inline numbers (if any) take precedence over the pop-up.
				if cell_phone_list or phone_code_list:
					zggys.phone_code='/'.join(phone_code_list) if phone_code_list else '-'
					zggys.cell_phone='/'.join(cell_phone_list) if cell_phone_list else '-'
					zggys.fax='-'
					zggys.qq='-'
				zggys.data_save()
				print(f'中国供应商——{zggys.company_name}信息导入完成')
			page+=1
	zggys.spider_end()
# Example #5
# 0
def _skb_save_contacts(skb,word,page):
	"""Parse the opened contact panel of the current tab and save records.

	Returns True when the panel could be read (records saved, success
	message printed) and False when it could not be located — the caller
	should then close the tab and move on.  The panel is a flat '\\n'-split
	token list in which each value follows (or, for the phone, precedes)
	its label.
	"""
	try:
		phone_info=skb.selenium_search('css_selector','.el-scrollbar__view')
		phone_info_list=list(phone_info)[1].split('\n')
	except Exception as e:
		print(e)
		return False
	for i,j in enumerate(phone_info_list):
		if j == '选 择':
			# The phone number precedes its '选 择' (select) button.
			skb.phone_number=phone_info_list[i-1]
		elif j == '联系人':
			skb.person_name=phone_info_list[i+1]
		elif j == 'qq号码':
			skb.qq=phone_info_list[i+1].strip(',')
		elif j == '电子邮箱':
			skb.mail=phone_info_list[i+1].strip(',')
			# NOTE(review): a record is only saved when an email label is
			# present, so companies without one are never persisted — kept
			# exactly as in the original; confirm whether intentional.
			try:
				skb.data_save()
			except Exception:
				continue
	print(f'{word}——第{page}页——{skb.company_name}信息导入完成')
	return True


def skb_spider(phone,passwd,word,page_now=1):
	"""Crawl company data from 搜客宝 (biz.lixiaoskb.com) behind a login.

	Args:
		phone: login account (phone number).
		passwd: login password.
		word: search keyword to crawl.
		page_now: 1-based page to resume from.

	Returns:
		(word, last_page) — the keyword and the page reached, so the caller
		can persist a resume point (returned early when the daily contact
		reveal quota runs out).
	"""
	skb=SuperSpider(use_selenium=True)
	skb.source_name='搜客宝'
	skb.fax='-'
	skb.staff_number='-'
	skb.selenium_open('https://biz.lixiaoskb.com/login')
	skb.selenium_input('xpath','//*[@id="app"]/div[1]/div/div/div[1]/div[2]/div[2]/form/div[1]/div/div/div/input',phone)
	skb.selenium_input('xpath','//*[@id="app"]/div[1]/div/div/div[1]/div[2]/div[2]/form/div[2]/div/div/div/input',passwd,enter=True,sleep_time=3)
	js3='document.querySelector("#tab-0").click();'
	skb.selenium_js([js3])
	skb.selenium_input('xpath','//*[@id="searchDeInput"]/div[1]/div/input',word,sleep_time=5,enter=True)
	# The site caps result paging at 500 pages.
	all_page=500
	if int(page_now) == int(all_page):
		print(f'{word}——所有数据爬取结束')
		skb.spider_end()
		return word,int(all_page)
	for page in range(page_now,int(all_page)+1):
		print(f'{word}——第{page}页')
		try:
			skb.selenium_scroll('//div[@id="jumpPage"]//input[@class="el-input__inner"]')
			skb.selenium_input('css_selector','#jumpPage .el-input input',page,sleep_time=2,enter=True)
		except Exception as e:
			print(e)
			continue
		url_list=skb.selenium_search('xpath',f'//div[@class="card"]//span[@class="name"]//a',attr='href')
		for url in url_list:
			skb.source_page=url
			# Open the company detail in a new tab and switch to it.
			js1=f'window.open("{url}")'
			skb.selenium_js([js1],sleep_time=3)
			skb.switch_window()
			try:
				skb.company_name=skb.selenium_search('css_selector','.top .name').__next__()
			except Exception as e:
				print(e)
				skb.window_close()
				skb.switch_window(sleep_time=2)
				continue
			try:
				company_info_dict1={i.split(':')[0].strip():i.split(':')[-1].strip() for i in skb.selenium_search('css_selector','.line .group')}
				skb.company_type=company_info_dict1.get('公司类型','-')
				skb.address=company_info_dict1.get('通讯地址','-')
				# BUG FIX: the 'skb.' prefix was missing here, so this value
				# went to a dead local and the previous company's
				# business_mode leaked into the saved record.
				skb.business_mode=company_info_dict1.get('所属行业','-')
				skb.website=company_info_dict1.get('官方网站','-').strip('更多>> ')
			except Exception:
				pass
			try:
				company_info_dict2={i.split('\n')[0].strip('/ '):i.split('\n')[-1].strip('/ ') for i in skb.selenium_search('css_selector','.gongshang-col')}
				skb.person_name=company_info_dict2.get('法人/负责人','-')
				skb.register_money=company_info_dict2.get('注册资本','-')
				skb.main_product=company_info_dict2.get('经营范围','-')
			except Exception:
				pass
			# Click the 'reveal contacts' button.  Parsing is attempted even
			# when the click fails, matching the original control flow; the
			# quota check below only runs after a successful click.
			js2='var open_btn=document.querySelector(".mask-box .action span");open_btn.click();'
			try:
				skb.selenium_js([js2],sleep_time=3)
				panel_opened=True
			except Exception as e:
				print(e)
				panel_opened=False
			if not _skb_save_contacts(skb,word,page):
				skb.window_close()
				skb.switch_window(sleep_time=2)
				continue
			if not panel_opened:
				skb.window_close()
				skb.switch_window(sleep_time=2)
				continue
			use_number=skb.selenium_search('css_selector','.inner-user .viewCount:first-child').__next__()
			print(use_number)
			if int(use_number) == 0:
				# Daily reveal quota exhausted: stop and report resume page.
				print(f'{word}——第{page}页——今日次数已用完')
				skb.spider_end()
				return word,page
			skb.window_close()
			skb.switch_window(sleep_time=2)
	skb.spider_end()
	print(f'{word}——所有数据爬取结束')
	return word,int(all_page)
# Example #6
# 0
def _zgcpw_field(info_list, label):
    """Return the cell following *label* in a flat label/value cell list.

    Falls back to '-' when the label is absent.  The pe168.com detail
    tables render as alternating label/value <td> cells.
    """
    return info_list[info_list.index(label) + 1] if label in info_list else '-'


def zgcpw_spider():
    """Crawl company data from pe168.com (中国产品网).

    Walks every category link on the home page, pages through each
    category, then for each company fetches its '公司介绍' (profile) and
    '联系方式' (contact) pages.  Each company is saved twice via
    ``data_save``: once with the landline ('公司电话') and once with the
    cell phone ('手机号码').
    """
    zgcpw = SuperSpider()
    # Rolling window of recently seen company names for de-duplication.
    company_list = deque([], maxlen=35)
    zgcpw.source_name = '中国产品网'
    zgcpw.get_request('http://www.pe168.com/')
    url_list1 = zgcpw.data_search('find', 'td div:nth-child(2) a', 'href')
    profession_list = zgcpw.data_search('find', 'td div:nth-child(2) a')
    for profession, url1 in zip(profession_list, url_list1):
        try:
            zgcpw.get_request(url1)
            # Pager text looks like '.../N页' → extract N.
            page_all = zgcpw.data_search('find', '.pages cite').__next__()
            page_all_number = zgcpw.re_find(r'/(\d+)页',
                                            page_all).__next__().group(1)
        except Exception:
            continue
        for page in range(1, int(page_all_number) + 1):
            print(f'{profession}——第{page}页')
            url2 = url1.replace('.html', f'-{page}.html')
            try:
                zgcpw.get_request(url2)
            except Exception:
                continue
            url_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a', 'href')
            company_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a')
            for company_name, url3 in zip(company_list3, url_list3):
                if company_name in company_list:
                    print('信息重复')
                    continue
                # Record the name once, up front.  The original appended it
                # again on two failure paths below, which only shrank the
                # effective 35-slot de-dup window with duplicates.
                company_list.append(company_name)
                zgcpw.company_name = company_name
                try:
                    zgcpw.get_request(url3)
                except Exception:
                    continue
                zgcpw.source_page = url3
                try:
                    company_info_url = zgcpw.data_search(
                        'find', 'a[title="公司介绍"]', 'href').__next__()
                except Exception:
                    continue
                try:
                    zgcpw.get_request(company_info_url)
                except Exception:
                    continue
                company_info_list = list(
                    zgcpw.data_search('find',
                                      '.main_body:nth-last-child(1) td'))
                zgcpw.company_type = _zgcpw_field(company_info_list, '公司类型:')
                zgcpw.staff_number = _zgcpw_field(company_info_list, '公司规模:')
                zgcpw.register_money = _zgcpw_field(company_info_list, '注册资本:')
                zgcpw.business_mode = _zgcpw_field(company_info_list, '经营模式:')
                zgcpw.main_product = _zgcpw_field(company_info_list, '经营范围:')
                try:
                    phone_info_url = zgcpw.data_search('find',
                                                       'a[title="联系方式"]',
                                                       'href').__next__()
                except Exception:
                    continue
                try:
                    zgcpw.get_request(phone_info_url)
                except Exception:
                    continue
                phone_info_list = list(
                    zgcpw.data_search('find', '.px13.lh18 td'))
                zgcpw.address = _zgcpw_field(phone_info_list, '公司地址:')
                zgcpw.fax = _zgcpw_field(phone_info_list, '公司传真:')
                zgcpw.website = _zgcpw_field(phone_info_list, '公司网址:')
                zgcpw.person_name = _zgcpw_field(phone_info_list, '联 系 人:')
                # Save once with the landline, again with the cell phone.
                zgcpw.phone_number = _zgcpw_field(phone_info_list, '公司电话:')
                zgcpw.data_save()
                zgcpw.phone_number = _zgcpw_field(phone_info_list, '手机号码:')
                zgcpw.data_save()
                print(f'{profession}——第{page}页——{company_name}导入完成')
    zgcpw.spider_end()