Ejemplo n.º 1
0
def zjmyqyw_spdier():
    """Crawl company records from www.zj123.com and save them via SuperSpider.

    The crawl starts from the home page, collects one paged listing URL per
    industry category, and resumes from the hard-coded category at
    ``error_index``.  For every listed company it reads the contact page and
    the company-profile page, then calls ``data_save`` twice: once with the
    landline in ``phone_number`` and once with the mobile number.

    Raises:
        ValueError: if the resume category is not found on the home page.
    """
    # Names of the most recently saved companies, used to skip repeats.
    company_deque = deque([], maxlen=35)
    zjmyqyw = SuperSpider()
    zjmyqyw.source_name = '浙江名营企业网'
    zjmyqyw.fax = '-'
    zjmyqyw.get_request('http://www.zj123.com/')
    # Turn each category link into a template whose '{}' is the page number.
    url_list1 = [
        'http://www.zj123.com/' + i.replace('1.', '{}.')
        for i in zjmyqyw.data_search('find', '.indsort dd a', 'href')
    ]
    profession_list = list(zjmyqyw.data_search('find', '.indsort dd a'))
    # Hard-coded resume point: categories before this one are skipped.
    error_index = profession_list.index('特种印刷')
    for profession, url1 in zip(profession_list[error_index:],
                                url_list1[error_index:]):
        for page in range(1, 100):
            print(f'{profession}——第{page}页')
            try:
                zjmyqyw.get_request(url1.format(page))
                page_judge = int(
                    next(zjmyqyw.data_search(
                        'find', '.sleft .m.m1 .fred')).split()[0])
            except Exception:
                print(f'获取第{page}页失败')
                continue
            # The page reports which page it is showing; a mismatch means the
            # requested page is past the end of this category.
            if page_judge != page:
                break
            # Query the listing once and derive both detail URLs from the id
            # embedded in each link.
            company_ids = [
                i.split('-')[1]
                for i in zjmyqyw.data_search(
                    'find', '.listdetail22 .listdetail dt a', 'href')
                if '-' in i
            ]
            for company_id in company_ids:
                url2 = ('http://www.zj123.com/member/VIPContact/'
                        + company_id + '/index.htm')
                url3 = ('http://www.zj123.com/member/VIPCompany/'
                        + company_id + '/index.htm')
                try:
                    zjmyqyw.get_request(url2)
                except Exception:
                    continue
                # NOTE(review): the value is the text after the LAST ':', so a
                # value that itself contains ':' (e.g. an 'http://' address)
                # is truncated -- confirm against the live page markup.
                contact_info_dict = {
                    i.split(':')[0].strip():
                    i.split(':')[-1].strip().replace('\xa0', '')
                    for i in zjmyqyw.data_search('find', '.rkbody table tr')
                }
                zjmyqyw.company_name = contact_info_dict.get('公司名称') or '-'
                if zjmyqyw.company_name in company_deque:
                    print('信息重复')
                    continue
                zjmyqyw.person_name = contact_info_dict.get('联系人') or '-'
                zjmyqyw.address = contact_info_dict.get('地 址') or '-'
                zjmyqyw.phone_number = contact_info_dict.get('电 话') or '-'
                zjmyqyw.qq = contact_info_dict.get('QQ') or '-'
                zjmyqyw.website = contact_info_dict.get('网 址') or '-'
                try:
                    zjmyqyw.get_request(url3)
                except Exception:
                    continue
                company_info_list = list(
                    zjmyqyw.data_search('find', '.rkbody table tr td'))
                # Cells alternate label, value.  At most the first 12 pairs
                # are read; a shorter table no longer raises IndexError.
                pair_limit = min(24, len(company_info_list) - 1)
                company_info_dict = {
                    company_info_list[n].strip(': '):
                    company_info_list[n + 1].strip(': ')
                    for n in range(0, pair_limit, 2)
                }
                zjmyqyw.main_product = (
                    company_info_dict.get('主营产品或服务') or '-')
                zjmyqyw.business_mode = company_info_dict.get('经营模式') or '-'
                zjmyqyw.company_type = company_info_dict.get('企业类型') or '-'
                zjmyqyw.register_money = company_info_dict.get('注册资本') or '-'
                # The employee count goes in its own field; it previously
                # overwrote register_money.
                zjmyqyw.staff_number = company_info_dict.get('员工人数') or '-'
                zjmyqyw.source_page = url2
                zjmyqyw.data_save()
                # Second record for the same company, carrying the mobile
                # number in phone_number.
                zjmyqyw.phone_number = contact_info_dict.get('手机') or '-'
                zjmyqyw.data_save()
                company_deque.append(zjmyqyw.company_name)
                print(f'{profession}——第{page}页——{zjmyqyw.company_name}信息导入完成')
    zjmyqyw.spider_end()
Ejemplo n.º 2
0
def zjmyqyw():
    """Crawl every category of www.zj123.com and save each listed company.

    For each category link on the home page the listing is paged through
    until the page number reported by the site differs from the one
    requested.  Each company's contact page and profile page are read and
    the collected fields are stored with ``data_save``.  No request or
    parsing error is handled here; any exception ends the crawl.
    """
    spider = SuperSpider()
    spider.source = '浙江名营企业网'
    spider.fax = '-'
    spider.get_request('http://www.zj123.com/')
    # Lazily built page-URL templates; '{}' receives the page number.
    category_templates = (
        'http://www.zj123.com/' + href.replace('1.', '{}.')
        for href in spider.data_search('find', '.indsort dd a', 'href')
    )
    link_selector = '.listdetail22 .listdetail dt a'
    for template in category_templates:
        page = 1
        while True:
            print(f'第{page}页')
            spider.get_request(template.format(page))
            page_marker = next(
                spider.data_search('find', '.sleft .m.m1 .fred'))
            page_judge = page_marker.split()[0]
            # A mismatch means the requested page does not exist.
            if int(page_judge) != page:
                break
            print(page_judge)
            contact_hrefs = spider.data_search('find', link_selector, 'href')
            profile_hrefs = spider.data_search('find', link_selector, 'href')
            for contact_href, profile_href in zip(contact_hrefs,
                                                  profile_hrefs):
                url2 = ('http://www.zj123.com/member/VIPContact/'
                        + contact_href.split('-')[1] + '/index.htm')
                url3 = ('http://www.zj123.com/member/VIPCompany/'
                        + profile_href.split('-')[1] + '/index.htm')

                # Contact page: one "label:value" row per table line.
                spider.get_request(url2)
                contact_info = {}
                for row in spider.data_search('find', '.rkbody table tr'):
                    pieces = row.split(':')
                    contact_info[pieces[0].strip()] = (
                        pieces[-1].strip().replace('\xa0', ''))
                spider.company_name = contact_info['公司名称'] or '-'
                spider.person_name = contact_info['联系人'] or '-'
                spider.address = contact_info['地 址'] or '-'
                spider.phone_code = contact_info['电 话'] or '-'
                spider.cell_phone = contact_info['手机'] or '-'
                spider.qq = contact_info['QQ'] or '-'
                spider.website = contact_info['网 址'] or '-'

                # Profile page: cells alternate label, value (12 pairs).
                spider.get_request(url3)
                cells = list(spider.data_search('find', '.rkbody table tr td'))
                company_info = {}
                for n in range(0, 24, 2):
                    company_info[cells[n].strip(': ')] = (
                        cells[n + 1].strip(': '))
                spider.main_product = company_info['主营产品或服务'] or '-'
                spider.business_mode = company_info['经营模式'] or '-'
                spider.company_type = company_info['企业类型'] or '-'
                spider.register_money = company_info['注册资本'] or '-'
                spider.data_save()
                print(f'浙江企业网——{spider.company_name}信息导入完成')
            page += 1
    spider.spider_end()
#zjmyqyw()

# test_obj=SuperSpider()
# js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
# test_obj.use_selenium()
# test_obj.selenium_js('https://www.china.cn/shukongjichuang/3746553522.html',js)
# test_obj.cell_phone=test_obj.selenium_search('css_selector','.inactive_top .number').__next__()
# print('aaaaaaa')
# print(test_obj.cell_phone)
Ejemplo n.º 3
0
def _wl114_label_dict(lines):
    """Build a dict from 'label:value' strings, skipping lines without ':'.

    The key is the text before the first ':' and the value the text after
    the last ':', both stripped of surrounding whitespace.
    """
    return {
        i.split(':')[0].strip(): i.split(':')[-1].strip()
        for i in lines if ':' in i
    }


def _wl114_save_company(wl114, url3):
    """Scrape one company page and its contact page, then save it twice.

    Returns the company name on success, or None when a request or a
    required element is missing and the company is skipped.
    """
    try:
        wl114.get_request(url3)
        company_info_dict = _wl114_label_dict(
            wl114.data_search('find', '.right.w_250 .border.p_8 li'))
        phone_url = next(
            wl114.data_search('find', '.right.w_250 .border.p_8 li a',
                              'href'))
    except Exception:
        return None
    wl114.company_type = company_info_dict.get('企业性质', '-')
    wl114.main_product = company_info_dict.get('企业主营', '-')
    wl114.address = company_info_dict.get('企业地址', '-')
    try:
        wl114.get_request(phone_url)
    except Exception:
        return None
    phone_info_data = wl114.data_search('find',
                                        'td[valign="top"]:first-child')
    try:
        phone_info_dict = _wl114_label_dict(
            next(phone_info_data).split('\n'))
    except Exception:
        return None
    wl114.company_name = phone_info_dict.get('公司名称', '-')
    if wl114.company_name == '-':
        # The contact page uses either of two labels for the name.
        wl114.company_name = phone_info_dict.get('企业名称', '-')
    wl114.person_name = phone_info_dict.get('联系人', '-')
    wl114.fax = phone_info_dict.get('传真', '-')
    # First record carries the mobile number, second the landline.
    wl114.phone_number = phone_info_dict.get('手机', '-')
    wl114.source_page = url3
    wl114.data_save()
    wl114.phone_number = phone_info_dict.get('联系电话', '-')
    wl114.data_save()
    return wl114.company_name


def _wl114_crawl_category(wl114, url_template, profession):
    """Page through one category listing and save every company on it.

    ``url_template`` contains a '{}' placeholder for the page number.  The
    category is skipped when its first page or page count cannot be read.
    """
    try:
        wl114.get_request(url_template.format(1))
        all_page = int(
            next(wl114.data_search(
                'find', '.page_p:not(span)')).split('\xa0')[1])
    except Exception:
        return
    for page in range(1, all_page + 1):
        print(f'{profession}——第{page}页')
        try:
            wl114.get_request(url_template.format(page))
        except Exception:
            continue
        url_list3 = list(
            wl114.data_search('find', '.product_list_div_h143 h2 a', 'href'))
        if not url_list3:
            break
        for url3 in url_list3:
            company_name = _wl114_save_company(wl114, url3)
            if company_name is not None:
                print(f'{profession}——第{page}页——{company_name}信息导入完成')


def wl114_spider():
    """Crawl company records from www.net114.com and save them.

    Home-page category links ending in '.html' are crawled directly,
    resuming from the hard-coded category at ``error_index``.  The remaining
    links are opened as index pages whose own category links are crawled the
    same way.

    Raises:
        ValueError: if the resume category is not found on the home page.
    """
    wl114 = SuperSpider()
    wl114.source_name = '网络114'
    wl114.business_mode = '-'
    wl114.register_money = '-'
    wl114.website = '-'
    wl114.qq = '-'
    wl114.get_request('http://www.net114.com/')
    link_xpath = '//*[@id="product_center_content"]/div/ul/li/p/a'
    # Read the home page once, while it is still the current page.
    home_hrefs = list(wl114.data_search('xpath', link_xpath, attr='href'))
    home_labels = list(wl114.data_search('xpath', link_xpath))
    url_list1 = [
        i.replace('.html', '-p-{}.html') for i in home_hrefs
        if i.endswith('.html')
    ]
    profession_list1 = [i for i in home_labels if i != '更多>>']
    url_list2 = [i for i in home_hrefs if not i.endswith('.html')]
    # Hard-coded resume point: categories before this one are skipped.
    error_index = profession_list1.index('维护工具')
    for url1, profession1 in zip(url_list1[error_index:],
                                 profession_list1[error_index:]):
        _wl114_crawl_category(wl114, url1, profession1)
    for url2 in url_list2:
        try:
            wl114.get_request(url2)
        except Exception:
            continue
        # Materialise both lists before crawling, because the crawl below
        # navigates away from this index page.
        url_list4 = [
            i.replace('.html', '-p-{}.html') for i in wl114.data_search(
                'find', '.product_w369_list a[href]', 'href')
        ]
        profession_list4 = list(
            wl114.data_search('find', '.product_w369_list a[href]'))
        for profession4, url4 in zip(profession_list4, url_list4):
            _wl114_crawl_category(wl114, url4, profession4)
    wl114.spider_end()
Ejemplo n.º 4
0
def _zggys_random_proxy(proxies_list):
    """Pick a random proxy row; return (address, proxies mapping).

    The mapping key is 'https' when the address starts with 'https',
    otherwise 'http'.
    """
    proxies = random.choice(proxies_list)[0]
    key = 'http' if not proxies.startswith('https') else 'https'
    return proxies, {key: proxies}


def _zggys_field_after(items, label):
    """Return the entry following ``label`` in ``items``, or None if absent."""
    try:
        return items[items.index(label) + 1]
    except (ValueError, IndexError):
        return None


def _zggys_scrape_company(zggys, url2, proxies_list, profession, page):
    """Fetch one company page through a random proxy and save its phones.

    Up to 20 fetch attempts are made, each with a newly chosen proxy.  One
    record is saved per phone number found; nothing is saved when the page
    exposes no phone number.
    """
    for _ in range(20):
        try:
            time.sleep(2)
            proxies, proxy_map = _zggys_random_proxy(proxies_list)
            print(f'使用代理-{proxies}')
            html = zggys.get_html(url2,
                                  charset='GBK',
                                  proxies=proxy_map,
                                  timeout=5)
            zggys.source_page = url2
            name_list = zggys.data_search(
                html=html, xpath='//div[@class="column_xx"]//p//a/text()')
            if name_list:
                zggys.company_name = name_list[0]
            company_info_list = [
                i for i in zggys.data_search(
                    html=html,
                    xpath='//ul[@class="business_xx"]//li//text()')
                if i.strip('\r\n |')
            ]
        except Exception as error:
            print(error)
            continue
        # Each label is followed in the list by its value; a field is only
        # assigned when its label is present.
        for label, attr, strip_value in (
                ('经营模式', 'business_mode', False),
                ('注册资本', 'register_money', True),
                ('企业类型', 'company_type', False),
                ('主营产品', 'main_product', False),
                ('公司地址', 'address', False),
        ):
            value = _zggys_field_after(company_info_list, label)
            if value is not None:
                setattr(zggys, attr, value.strip() if strip_value else value)
        try:
            zggys.person_name = zggys.data_search(
                html=html,
                xpath=
                '//div[@class="personal_top"]//div[@class="t"]//span/text()'
            )[0]
        except Exception:
            # Best effort: the contact name is optional.
            pass
        phone_list = zggys.data_search(
            html=html, xpath='//div[@class="personal_bottom"]//span/text()')
        if not phone_list:
            # No phone number in the static HTML; the company is skipped.
            # (A selenium click-to-reveal fallback was sketched here earlier
            # but never enabled.)
            return
        for phone in phone_list:
            zggys.phone_number = phone.strip()
            zggys.data_save()
        print(f'{profession}—第{page}页—{zggys.company_name}信息导入完成')
        return


def zggys_spider():
    """Crawl company records from cn.china.cn through rotating proxies.

    Proxy addresses are read from the ``ip_pool`` table.  Categories are
    taken from the home page, resuming from the hard-coded category at
    ``error_index``, and each category is paged through until a page yields
    no company links.

    Raises:
        ValueError: if the resume category is not found on the home page.
    """
    zggys = SuperSpider(host='192.168.0.172', default_field='-')
    zggys.source_name = '中国供应商'
    proxies_list = zggys.sql_search('select ip from ip_pool')
    url_list1 = [
        i + '?p={}' for i in zggys.data_search(
            'https://cn.china.cn/',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/@href'
        )
    ]
    profession_list = zggys.data_search(
        'https://cn.china.cn/',
        '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/text()',
        'GBK')
    # Hard-coded resume point: categories before this one are skipped.
    error_index = profession_list.index('睡袋')
    for url1, profession in zip(url_list1[error_index:],
                                profession_list[error_index:]):
        page = 1
        while True:
            time.sleep(2)
            print(f'{profession}——第{page}页')
            # Start empty so that 20 failed attempts end this category
            # instead of raising NameError or reusing the previous page.
            url_list2 = []
            for _ in range(20):
                proxies, proxy_map = _zggys_random_proxy(proxies_list)
                print(f'使用代理-{proxies}')
                try:
                    url_list2 = zggys.data_search(
                        url1.format(page),
                        '//ul[@class="extension_ul"]//h3[@class="title"]/a/@href',
                        'GBK',
                        proxies=proxy_map,
                        timeout=5)
                except Exception as error:
                    print(error)
                    continue
                # Stop retrying after the first request that succeeds.
                break
            if not url_list2:
                print(f'{profession}——第{page}页——没有数据')
                break
            for url2 in url_list2:
                _zggys_scrape_company(zggys, url2, proxies_list, profession,
                                      page)
            page += 1
    zggys.spider_end()
Ejemplo n.º 5
0
def zggys_spider():
    """Crawl company records from cn.china.cn, using selenium for hidden phones.

    Each category linked from the home page is paged through until a page
    yields no company links.  For each company the profile fields, contact
    name and phone numbers are collected and one record is saved.
    """
    zggys = SuperSpider(use_selenium=True)
    zggys.source = '中国供应商'
    zggys.website = '-'
    zggys.get_request('https://cn.china.cn/')
    # Materialise the category templates while the home page is current.
    url_list1 = [
        i + '?p={}' for i in zggys.data_search(
            'xpath',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a',
            'href')
    ]
    for url1 in url_list1:
        # NOTE(review): every category starts at page 10, not page 1 --
        # presumably a leftover resume point; confirm before reuse.
        page = 10
        while True:
            print(f'第{page}页')
            try:
                zggys.get_request(url1.format(page))
            except Exception:
                print(f'获取第{page}页失败')
                page += 1
                continue
            # Must be a list: the raw search result is an iterator, which is
            # always truthy, so the emptiness test below could never fire.
            url_list2 = list(zggys.data_search('find', 'h3.title a', 'href'))
            if not url_list2:
                break
            for url2 in url_list2:
                try:
                    zggys.get_request(url2)
                    zggys.company_name = next(
                        zggys.data_search('find', '.column_xx p a', 'title'))
                except Exception:
                    continue
                # Profile block: one 'label|value' pair per line.  A missing
                # block yields an empty dict rather than StopIteration.
                business_text = next(
                    zggys.data_search('find', '.business_xx'), '')
                company_info_dict = {
                    i.split('|')[0]: i.split('|')[1]
                    for i in business_text.split('\n') if '|' in i
                }
                zggys.business_mode = company_info_dict.get('经营模式', '-')
                zggys.register_money = company_info_dict.get('注册资本', '-')
                zggys.company_type = company_info_dict.get('企业类型', '-')
                zggys.main_product = company_info_dict.get('主营产品', '-')
                zggys.address = company_info_dict.get('公司地址', '-')
                zggys.person_name = next(
                    zggys.data_search('find', '.personal_top .t span'), '-')
                # Reset contact fields so values from the previous company
                # are not saved when this page yields no numbers.
                zggys.phone_code = '-'
                zggys.cell_phone = '-'
                zggys.fax = '-'
                zggys.qq = '-'
                phone_list = list(
                    zggys.data_search('find', '.personal_bottom span'))
                cell_phone_list = []
                phone_code_list = []
                for phone in phone_list:
                    if not phone:
                        # Empty entry: click the reveal button through
                        # selenium and read the numbers from the popup.
                        js = 'var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
                        zggys.selenium_js(url2, js)
                        zggys.cell_phone = next(
                            zggys.selenium_search('css_selector',
                                                  '.inactive_top .number'))
                        phone_info_dict = {
                            i.split('\n')[0]: i.split('\n')[1].strip('QQ交谈')
                            for i in zggys.selenium_search(
                                'css_selector', '.inactive_right .txt p')
                        }
                        zggys.phone_code = phone_info_dict.get('电话', '-')
                        zggys.fax = phone_info_dict.get('传真', '-')
                        zggys.qq = phone_info_dict.get('Q  Q', '-')
                    elif phone.startswith('1'):
                        # Numbers starting with '1' are stored as mobiles.
                        cell_phone_list.append(phone)
                    else:
                        phone_code_list.append(phone)
                if cell_phone_list or phone_code_list:
                    # Numbers found in the page take precedence over any
                    # values read through selenium.
                    zggys.phone_code = (
                        '/'.join(phone_code_list) if phone_code_list else '-')
                    zggys.cell_phone = (
                        '/'.join(cell_phone_list) if cell_phone_list else '-')
                    zggys.fax = '-'
                    zggys.qq = '-'
                zggys.data_save()
                print(f'中国供应商——{zggys.company_name}信息导入完成')
            page += 1
    zggys.spider_end()
Ejemplo n.º 6
0
def _zgcpw_value_after(items, label):
    """Return the entry following ``label`` in ``items``, or '-' if unavailable.

    '-' is returned both when the label is absent and when it is the last
    entry and therefore has no following value.
    """
    try:
        return items[items.index(label) + 1]
    except (ValueError, IndexError):
        return '-'


def zgcpw_spider():
    """Crawl company records from www.pe168.com and save them.

    Each category linked from the home page is paged through using the page
    count shown on its first page.  For every listed company the "company
    introduction" and "contact" sub-pages are read, and the company is saved
    twice: once with the office phone and once with the mobile number in
    ``phone_number``.  Companies seen among the last 35 names are skipped.
    """
    zgcpw = SuperSpider()
    # Recently seen company names, used to skip repeated listings.
    company_list = deque([], maxlen=35)
    zgcpw.source_name = '中国产品网'
    zgcpw.get_request('http://www.pe168.com/')
    # Materialise both lists while the home page is the current page.
    url_list1 = list(zgcpw.data_search('find', 'td div:nth-child(2) a',
                                       'href'))
    profession_list = list(zgcpw.data_search('find', 'td div:nth-child(2) a'))
    for profession, url1 in zip(profession_list, url_list1):
        try:
            zgcpw.get_request(url1)
            page_all = next(zgcpw.data_search('find', '.pages cite'))
            page_all_number = int(
                next(zgcpw.re_find(r'/(\d+)页', page_all)).group(1))
        except Exception:
            continue
        for page in range(1, page_all_number + 1):
            print(f'{profession}——第{page}页')
            url2 = url1.replace('.html', f'-{page}.html')
            try:
                zgcpw.get_request(url2)
            except Exception:
                continue
            # Materialise before the inner loop navigates to other pages.
            url_list3 = list(
                zgcpw.data_search(
                    'find', '.left_box form tr ul li:nth-last-child(1) a',
                    'href'))
            company_list3 = list(
                zgcpw.data_search(
                    'find', '.left_box form tr ul li:nth-last-child(1) a'))
            for company_name, url3 in zip(company_list3, url_list3):
                if company_name in company_list:
                    print('信息重复')
                    continue
                # Recorded once, before fetching, so a company that fails to
                # load is still skipped if it is listed again soon.
                company_list.append(company_name)
                zgcpw.company_name = company_name
                try:
                    zgcpw.get_request(url3)
                except Exception:
                    continue
                zgcpw.source_page = url3
                try:
                    company_info_url = next(
                        zgcpw.data_search('find', 'a[title="公司介绍"]',
                                          'href'))
                    zgcpw.get_request(company_info_url)
                except Exception:
                    continue
                # Table cells alternate label, value.
                company_info_list = list(
                    zgcpw.data_search('find',
                                      '.main_body:nth-last-child(1) td'))
                zgcpw.company_type = _zgcpw_value_after(
                    company_info_list, '公司类型:')
                zgcpw.staff_number = _zgcpw_value_after(
                    company_info_list, '公司规模:')
                zgcpw.register_money = _zgcpw_value_after(
                    company_info_list, '注册资本:')
                zgcpw.business_mode = _zgcpw_value_after(
                    company_info_list, '经营模式:')
                zgcpw.main_product = _zgcpw_value_after(
                    company_info_list, '经营范围:')
                try:
                    phone_info_url = next(
                        zgcpw.data_search('find', 'a[title="联系方式"]',
                                          'href'))
                    zgcpw.get_request(phone_info_url)
                except Exception:
                    continue
                phone_info_list = list(
                    zgcpw.data_search('find', '.px13.lh18 td'))
                zgcpw.address = _zgcpw_value_after(phone_info_list, '公司地址:')
                zgcpw.fax = _zgcpw_value_after(phone_info_list, '公司传真:')
                zgcpw.website = _zgcpw_value_after(phone_info_list, '公司网址:')
                zgcpw.person_name = _zgcpw_value_after(
                    phone_info_list, '联 系 人:')
                zgcpw.phone_number = _zgcpw_value_after(
                    phone_info_list, '公司电话:')
                zgcpw.data_save()
                # Second record for the same company with the mobile number.
                zgcpw.phone_number = _zgcpw_value_after(
                    phone_info_list, '手机号码:')
                zgcpw.data_save()
                print(f'{profession}——第{page}页——{company_name}导入完成')
    zgcpw.spider_end()