Exemple #1
0
def ip_spider2():
    """Crawl the free proxy list on 89ip.cn and store each proxy in ``ip_pool``.

    Pages ``index_1.html``, ``index_2.html``, ... are fetched in turn until a
    page yields no table cells.  Cells are consumed five at a time: within a
    group, cell 0 is used as the IP, cell 1 as the port and cell 2 as the
    address.  At most 15 rows (75 cells) are read per page.
    """
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '89免费代理'
    page = 1
    while True:
        page_url = f'http://www.89ip.cn/index_{page}.html'
        ip.source_page = page_url
        data_list = ip.data_search(page_url,
                                   '//table[@class="layui-table"]//td/text()')
        if not data_list:
            break
        print(f'第{page}页')
        for i in range(0, 75, 5):
            try:
                ip_value = data_list[i].strip(' \n\t')
                ip_port = data_list[i + 1].strip(' \n\t')
                ip.ip = f"http://{ip_value}:{ip_port}"
                ip.address = data_list[i + 2].strip(' \n\t')
            except IndexError:
                # The page holds fewer than 15 complete rows: done with it.
                # Only IndexError is expected here; anything else is a real
                # failure and is allowed to propagate.
                break
            ip.data_save()
            print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
        page += 1
        # Pause between pages.
        time.sleep(2)
    ip.spider_end()
Exemple #2
0
def ip_spider7():
    """Crawl the proxy table on 66ip.cn and store each proxy in ``ip_pool``.

    The number of pages is read from the pager on ``index.html``.  For each
    page, the first five text nodes of the table are dropped and the rest are
    consumed five at a time: cell 0 is used as the IP, cell 1 as the port and
    cell 2 as the address.

    Errors raised while saving a row propagate to the caller instead of being
    silently treated as "end of page".
    """
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '66代理'
    page_all = ip.data_search('http://www.66ip.cn/index.html',
                              '//div[@id="PageList"]//a[last()-1]/text()')[0]
    for page in range(1, int(page_all) + 1):
        page_url = f'http://www.66ip.cn/{page}.html'
        data_list = ip.data_search(
            page_url,
            '//div[@class="containerbox boxindex"]//table//tr//text()',
            'gbk')[5:]
        ip.source_page = page_url
        # Stop at the last group that still has an address cell (i + 2)
        # rather than running until an IndexError is raised.
        for i in range(0, len(data_list) - 2, 5):
            ip.ip = f'http://{data_list[i]}:{data_list[i+1]}'
            ip.address = data_list[i + 2]
            ip.data_save()
            print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
    ip.spider_end()
Exemple #3
0
def ip_spider5():
    """Crawl the proxy list on ip.kxdaili.com and store it in ``ip_pool``.

    Pages are fetched in turn until one yields no table cells.  Cells are
    consumed seven at a time: cell 0 is used as the IP, cell 1 as the port,
    cell 3 as a comma-separated list of schemes and cell 5 as the address.
    One record is saved per scheme.  At most 10 rows (70 cells) are read per
    page.
    """
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '开心代理'
    page = 1
    while True:
        page_url = f'http://ip.kxdaili.com/ipList/{page}.html#ip'
        ip.source_page = page_url
        data_list = ip.data_search(
            page_url, '//table[@class="ui table segment"]//td/text()')
        if not data_list:
            break
        # A short page (fewer than 10 rows) used to raise IndexError; only
        # visit rows whose highest index used below (i + 5) exists.
        for i in range(0, min(70, len(data_list) - 5), 7):
            ip.address = data_list[i + 5]
            for h in data_list[i + 3].split(','):
                ip.ip = f'{h.lower()}://{data_list[i]}:{data_list[i+1]}'
                ip.data_save()
                print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
        page += 1
    ip.spider_end()
def zjmyqyw_spdier():
    """Crawl company contact and profile data from zj123.com and save it.

    For every category linked from the home page (starting at the hard-coded
    resume category), listing pages 1..99 are visited until the page number
    shown on the page no longer matches the requested one.  For each company
    on a listing page the contact page and the company page are fetched, and
    two records are saved: one with the landline number and one with the
    mobile number.  The last 35 company names are remembered so that
    immediate repeats are skipped.
    """
    company_deque = deque([], maxlen=35)
    zjmyqyw = SuperSpider()
    zjmyqyw.source_name = '浙江名营企业网'
    zjmyqyw.fax = '-'
    zjmyqyw.get_request('http://www.zj123.com/')
    url_list1 = [
        'http://www.zj123.com/' + i.replace('1.', '{}.')
        for i in zjmyqyw.data_search('find', '.indsort dd a', 'href')
    ]
    profession_list = list(zjmyqyw.data_search('find', '.indsort dd a'))
    # Hard-coded resume point: categories before this one are skipped.
    # Raises ValueError if the category is not present on the home page.
    error_index = profession_list.index('特种印刷')
    for profession, url1 in zip(profession_list[error_index:],
                                url_list1[error_index:]):
        for page in range(1, 100):
            print(f'{profession}——第{page}页')
            try:
                zjmyqyw.get_request(url1.format(page))
                page_judge = next(
                    zjmyqyw.data_search('find',
                                        '.sleft .m.m1 .fred')).split()[0]
            except Exception:
                # The for loop advances ``page`` itself; just try the next one.
                print(f'获取第{page}页失败')
                continue
            if int(page_judge) != page:
                # The site shows a different page number: past the last page.
                break
            # One search feeds both the contact URL and the company URL.
            member_ids = [
                i.split('-')[1] for i in zjmyqyw.data_search(
                    'find', '.listdetail22 .listdetail dt a', 'href')
            ]
            for member_id in member_ids:
                url2 = ('http://www.zj123.com/member/VIPContact/' + member_id +
                        '/index.htm')
                url3 = ('http://www.zj123.com/member/VIPCompany/' + member_id +
                        '/index.htm')
                try:
                    zjmyqyw.get_request(url2)
                except Exception:
                    continue
                contact_info_dict = {
                    i.split(':')[0].strip():
                    i.split(':')[-1].strip().replace('\xa0', '')
                    for i in zjmyqyw.data_search('find', '.rkbody table tr')
                }
                zjmyqyw.company_name = contact_info_dict['公司名称'] or '-'
                if zjmyqyw.company_name in company_deque:
                    print('信息重复')
                    continue
                zjmyqyw.person_name = contact_info_dict['联系人'] or '-'
                zjmyqyw.address = contact_info_dict['地 址'] or '-'
                zjmyqyw.phone_number = contact_info_dict['电 话'] or '-'
                zjmyqyw.qq = contact_info_dict['QQ'] or '-'
                zjmyqyw.website = contact_info_dict['网 址'] or '-'
                try:
                    zjmyqyw.get_request(url3)
                except Exception:
                    continue
                company_info_list = list(
                    zjmyqyw.data_search('find', '.rkbody table tr td'))
                # Cells alternate label / value; the first 12 pairs are used.
                company_info_dict = {
                    company_info_list[n].strip(': '):
                    company_info_list[n + 1].strip(': ')
                    for n in range(0, 24, 2)
                }
                zjmyqyw.main_product = company_info_dict['主营产品或服务'] or '-'
                zjmyqyw.business_mode = company_info_dict['经营模式'] or '-'
                zjmyqyw.company_type = company_info_dict['企业类型'] or '-'
                zjmyqyw.register_money = company_info_dict['注册资本'] or '-'
                # Previously the staff count overwrote ``register_money``.
                zjmyqyw.staff_number = company_info_dict['员工人数'] or '-'
                zjmyqyw.source_page = url2
                # First record carries the landline number ...
                zjmyqyw.data_save()
                # ... second record carries the mobile number.
                zjmyqyw.phone_number = contact_info_dict['手机'] or '-'
                zjmyqyw.data_save()
                company_deque.append(zjmyqyw.company_name)
                print(f'{profession}——第{page}页——{zjmyqyw.company_name}信息导入完成')
    zjmyqyw.spider_end()
def zjmyqyw():
    """Crawl every category of zj123.com and save one record per company.

    Each category's listing pages are walked from page 1 until the page
    number shown on the page differs from the one requested.  For every
    company on a listing page the contact page and then the company page are
    fetched, their label/value tables are turned into dicts, and the fields
    are copied onto the spider before ``data_save()`` is called.
    """
    spider = SuperSpider()
    spider.source = '浙江名营企业网'
    spider.fax = '-'
    spider.get_request('http://www.zj123.com/')
    listing_templates = (
        'http://www.zj123.com/' + href.replace('1.', '{}.')
        for href in spider.data_search('find', '.indsort dd a', 'href'))
    for listing_template in listing_templates:
        page = 1
        while True:
            print(f'第{page}页')
            spider.get_request(listing_template.format(page))
            marker = next(spider.data_search('find', '.sleft .m.m1 .fred'))
            page_judge = marker.split()[0]
            if int(page_judge) != page:
                break
            print(page_judge)
            contact_urls = (
                'http://www.zj123.com/member/VIPContact/' +
                href.split('-')[1] + '/index.htm'
                for href in spider.data_search(
                    'find', '.listdetail22 .listdetail dt a', 'href'))
            company_urls = (
                'http://www.zj123.com/member/VIPCompany/' +
                href.split('-')[1] + '/index.htm'
                for href in spider.data_search(
                    'find', '.listdetail22 .listdetail dt a', 'href'))
            for contact_url, company_url in zip(contact_urls, company_urls):
                # Contact page: rows look like "label:value".
                spider.get_request(contact_url)
                contact = {}
                for row in spider.data_search('find', '.rkbody table tr'):
                    pieces = row.split(':')
                    contact[pieces[0].strip()] = (
                        pieces[-1].strip().replace('\xa0', ''))
                spider.company_name = contact['公司名称'] or '-'
                spider.person_name = contact['联系人'] or '-'
                spider.address = contact['地 址'] or '-'
                spider.phone_code = contact['电 话'] or '-'
                spider.cell_phone = contact['手机'] or '-'
                spider.qq = contact['QQ'] or '-'
                spider.website = contact['网 址'] or '-'
                # Company page: cells alternate label / value (12 pairs).
                spider.get_request(company_url)
                cells = list(spider.data_search('find', '.rkbody table tr td'))
                company = {}
                for n in range(0, 24, 2):
                    company[cells[n].strip(': ')] = cells[n + 1].strip(': ')
                spider.main_product = company['主营产品或服务'] or '-'
                spider.business_mode = company['经营模式'] or '-'
                spider.company_type = company['企业类型'] or '-'
                spider.register_money = company['注册资本'] or '-'
                spider.data_save()
                print(f'浙江企业网——{spider.company_name}信息导入完成')
            page += 1
    spider.spider_end()
#zjmyqyw()

# test_obj=SuperSpider()
# js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
# test_obj.use_selenium()
# test_obj.selenium_js('https://www.china.cn/shukongjichuang/3746553522.html',js)
# test_obj.cell_phone=test_obj.selenium_search('css_selector','.inactive_top .number').__next__()
# print('aaaaaaa')
# print(test_obj.cell_phone)
Exemple #6
0
def ip_spider4():
    """Store the proxies listed on ip.seofangfa.com in ``ip_pool``.

    A single page is fetched.  Its table cells are consumed five at a time:
    cell 0 is used as the IP, cell 1 as the port and cell 3 as the address.
    At most 50 rows (250 cells) are read.
    """
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '方法SEO'
    ip.source_page = 'https://ip.seofangfa.com/'
    data_list = ip.data_search('https://ip.seofangfa.com/',
                               '//table[@class="table"]//td/text()')
    cell_count = len(data_list) if data_list else 0
    # A table with fewer than 50 rows used to raise IndexError; only visit
    # rows whose highest index used below (i + 3) exists.
    for i in range(0, min(250, cell_count - 3), 5):
        ip.ip = f'http://{data_list[i]}:{data_list[i+1]}'
        ip.address = data_list[i + 3]
        ip.data_save()
        print(f'{ip.source_name}-{ip.ip}-导入完成')
    ip.spider_end()
Exemple #7
0
def ip_spider1():
    """Crawl pages 1-99 of kuaidaili.com's free list into ``ip_pool``.

    The table cells of each page are consumed seven at a time: cell 0 is used
    as the IP, cell 1 as the port and cell 4 as the address.  At most 15 rows
    (105 cells) are read per page, with a 10 second pause between pages.
    """
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '快代理'
    for page in range(1, 100):
        print(f'第{page}页')
        page_url = f'https://www.kuaidaili.com/free/inha/{page}/'
        ip.source_page = page_url
        data_list = ip.data_search(
            page_url,
            '//table[@class="table table-bordered table-striped"]//td/text()')
        for i in range(0, 105, 7):
            try:
                ip.ip = f'http://{data_list[i]}:{data_list[i+1]}'
                ip.address = data_list[i + 4]
            except (IndexError, TypeError):
                # IndexError: fewer than 15 rows on this page.
                # TypeError: the search returned nothing subscriptable.
                # Either way there is nothing more to read from this page.
                break
            ip.data_save()
            print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
        time.sleep(10)
    ip.spider_end()
Exemple #8
0
def _wl114_save_company(wl114, url3, profession, page):
    """Scrape one product page plus its contact page and save two records."""
    try:
        wl114.get_request(url3)
        company_info_dict = {
            i.split(':')[0].strip(): i.split(':')[-1].strip()
            for i in wl114.data_search('find', '.right.w_250 .border.p_8 li')
            if ':' in i
        }
        phone_url = next(
            wl114.data_search('find', '.right.w_250 .border.p_8 li a', 'href'))
    except Exception:
        # Best effort: skip products whose page cannot be fetched or parsed.
        return
    wl114.company_type = company_info_dict.get('企业性质', '-')
    wl114.main_product = company_info_dict.get('企业主营', '-')
    wl114.address = company_info_dict.get('企业地址', '-')
    try:
        wl114.get_request(phone_url)
    except Exception:
        return
    phone_info_data = wl114.data_search('find', 'td[valign="top"]:first-child')
    try:
        phone_info_list = next(phone_info_data).split('\n')
        phone_info_dict = {
            i.split(':')[0].strip(): i.split(':')[-1].strip()
            for i in phone_info_list if ':' in i
        }
    except Exception:
        return
    wl114.company_name = phone_info_dict.get('公司名称', '-')
    if wl114.company_name == '-':
        wl114.company_name = phone_info_dict.get('企业名称', '-')
    wl114.person_name = phone_info_dict.get('联系人', '-')
    wl114.fax = phone_info_dict.get('传真', '-')
    # First record carries the mobile number ...
    wl114.phone_number = phone_info_dict.get('手机', '-')
    wl114.source_page = url3
    wl114.data_save()
    # ... second record carries the contact (landline) number.
    wl114.phone_number = phone_info_dict.get('联系电话', '-')
    wl114.data_save()
    print(f'{profession}——第{page}页——{wl114.company_name}信息导入完成')


def _wl114_crawl_category(wl114, url_template, profession):
    """Walk every listing page of one category and save its companies."""
    try:
        wl114.get_request(url_template.format(1))
        all_page = next(wl114.data_search(
            'find', '.page_p:not(span)')).split('\xa0')[1]
    except Exception:
        # Page count unavailable: skip the whole category.
        return
    for page in range(1, int(all_page) + 1):
        print(f'{profession}——第{page}页')
        try:
            wl114.get_request(url_template.format(page))
        except Exception:
            continue
        url_list3 = list(
            wl114.data_search('find', '.product_list_div_h143 h2 a', 'href'))
        if not url_list3:
            break
        for url3 in url_list3:
            _wl114_save_company(wl114, url3, profession, page)


def wl114_spider():
    """Crawl company contact data from net114.com and save it.

    Home-page category links ending in ``.html`` are crawled directly
    (starting at the hard-coded resume category).  The remaining links lead to
    index pages whose sub-category links are then crawled the same way.
    """
    wl114 = SuperSpider()
    wl114.source_name = '网络114'
    wl114.business_mode = '-'
    wl114.register_money = '-'
    wl114.website = '-'
    wl114.qq = '-'
    wl114.get_request('http://www.net114.com/')
    category_xpath = '//*[@id="product_center_content"]/div/ul/li/p/a'
    # Materialise everything read from the home page now: the spider loads
    # other pages below, and a lazily evaluated search could otherwise run
    # against whichever page happens to be loaded at that time.
    home_hrefs = list(wl114.data_search('xpath', category_xpath, attr='href'))
    url_list1 = [
        i.replace('.html', '-p-{}.html') for i in home_hrefs
        if i.endswith('.html')
    ]
    profession_list1 = [
        i for i in wl114.data_search('xpath', category_xpath) if i != '更多>>'
    ]
    # Hard-coded resume point: categories before this one are skipped.
    # Raises ValueError if the category is not present on the home page.
    error_index = profession_list1.index('维护工具')
    url_list2 = [i for i in home_hrefs if not i.endswith('.html')]
    for url1, profession1 in zip(url_list1[error_index:],
                                 profession_list1[error_index:]):
        _wl114_crawl_category(wl114, url1, profession1)
    for url2 in url_list2:
        try:
            wl114.get_request(url2)
        except Exception:
            continue
        url_list4 = [
            i.replace('.html', '-p-{}.html') for i in wl114.data_search(
                'find', '.product_w369_list a[href]', 'href')
        ]
        profession_list4 = list(
            wl114.data_search('find', '.product_w369_list a[href]'))
        for profession4, url4 in zip(profession_list4, url_list4):
            _wl114_crawl_category(wl114, url4, profession4)
    wl114.spider_end()
def _zggys_random_proxy(proxies_list):
    """Pick a random row from *proxies_list* and return a ``proxies`` dict."""
    proxies = random.choice(proxies_list)[0]
    print(f'使用代理-{proxies}')
    key = 'http' if not proxies.startswith('https') else 'https'
    return {key: proxies}


def _zggys_value_after(items, label):
    """Return the element that follows *label* in *items*, or ``None``."""
    try:
        return items[items.index(label) + 1]
    except (ValueError, IndexError):
        return None


def _zggys_save_company(zggys, url2, proxies_list, profession, page):
    """Fetch one company page (up to 20 proxy attempts) and save its phones."""
    labelled_fields = (
        ('经营模式', 'business_mode'),
        ('注册资本', 'register_money'),
        ('企业类型', 'company_type'),
        ('主营产品', 'main_product'),
        ('公司地址', 'address'),
    )
    for _ in range(20):
        try:
            time.sleep(2)
            html = zggys.get_html(url2,
                                  charset='GBK',
                                  proxies=_zggys_random_proxy(proxies_list),
                                  timeout=5)
            zggys.source_page = url2
            names = zggys.data_search(
                html=html, xpath='//div[@class="column_xx"]//p//a/text()')
            if names:
                zggys.company_name = names[0]
            company_info_list = [
                i for i in zggys.data_search(
                    html=html, xpath='//ul[@class="business_xx"]//li//text()')
                if i.strip('\r\n |')
            ]
        except Exception as error:
            # Fetch or parse failed: retry with another proxy.
            print(error)
            continue
        # A field is only assigned when its label is present in the list.
        for label, attr in labelled_fields:
            value = _zggys_value_after(company_info_list, label)
            if value is None:
                continue
            if attr == 'register_money':
                value = value.strip()
            setattr(zggys, attr, value)
        try:
            zggys.person_name = zggys.data_search(
                html=html,
                xpath=
                '//div[@class="personal_top"]//div[@class="t"]//span/text()'
            )[0]
        except Exception:
            # Best effort: the contact name is optional.
            pass
        phone_list = zggys.data_search(
            html=html, xpath='//div[@class="personal_bottom"]//span/text()')
        if not phone_list:
            # No phone number in the static HTML: nothing to save.
            return
        for phone in phone_list:
            zggys.phone_number = phone.strip()
            zggys.data_save()
        print(f'{profession}—第{page}页—{zggys.company_name}信息导入完成')
        return


def zggys_spider():
    """Crawl supplier contact data from cn.china.cn through random proxies.

    Proxies are read from the ``ip_pool`` table.  For every category on the
    home page (starting at the hard-coded resume category) listing pages are
    walked until one returns no company links.  Each company page is fetched
    with up to 20 proxy attempts, and one record is saved per phone number.
    """
    zggys = SuperSpider(host='192.168.0.172', default_field='-')
    zggys.source_name = '中国供应商'
    proxies_list = zggys.sql_search('select ip from ip_pool')
    url_list1 = [
        i + '?p={}' for i in zggys.data_search(
            'https://cn.china.cn/',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/@href'
        )
    ]
    profession_list = zggys.data_search(
        'https://cn.china.cn/',
        '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/text()',
        'GBK')
    # Hard-coded resume point: categories before this one are skipped.
    # Raises ValueError if the category is not present on the home page.
    error_index = profession_list.index('睡袋')
    for url1, profession in zip(url_list1[error_index:],
                                profession_list[error_index:]):
        page = 1
        while True:
            time.sleep(2)
            print(f'{profession}——第{page}页')
            # Reset on every page so that a page whose 20 attempts all fail
            # ends the category instead of reusing the previous page's links
            # (or hitting an unbound variable on the very first page).
            url_list2 = []
            for _ in range(20):
                proxy_kwargs = _zggys_random_proxy(proxies_list)
                try:
                    url_list2 = zggys.data_search(
                        url1.format(page),
                        '//ul[@class="extension_ul"]//h3[@class="title"]/a/@href',
                        'GBK',
                        proxies=proxy_kwargs,
                        timeout=5)
                except Exception as error:
                    print(error)
                    continue
                # Stop retrying once a request went through; previously all
                # 20 requests were always issued.
                break
            if not url_list2:
                print(f'{profession}——第{page}页——没有数据')
                break
            for url2 in url_list2:
                _zggys_save_company(zggys, url2, proxies_list, profession,
                                    page)
            page += 1
    zggys.spider_end()
Exemple #10
0
def xarcw_spider():
    """Crawl employer contact data from goodjobs.cn job postings and save it.

    After logging in, search result pages 1..60 are walked for every keyword
    in ``word_list`` and every city code in 1043..1060.  Each posting is
    fetched and one record is saved per posting that has both a company name
    and a phone image; the phone number is obtained by passing the image URL
    to ``use_tesseract``.
    """
    word_list = ['网络']
    xarcw = SuperSpider(host='192.168.0.172', default_field='-')
    xarcw.source_name = '新安人才网'
    # NOTE(review): the account credentials are hard-coded; consider loading
    # them from configuration instead of keeping them in source.
    data = {'memberName': '13155291086', 'password': '******'}
    xarcw.post_request('https://login.goodjobs.cn/index.php/action/UserLogin',
                       data=data)
    for word in word_list:
        for city_code in range(1043, 1061):
            for page in range(1, 61):
                print(f'{word}-{city_code}-第{page}页')
                try:
                    url_list = xarcw.data_search(
                        f'https://search.goodjobs.cn/index.php?keyword={word}&boxwp=c{city_code}&page={page}',
                        '//div[@class="dw_table"]//span[@class="e1"]/a/@href')
                except Exception:
                    print(f'{word}-{city_code}-第{page}页获取失败')
                    continue
                if not url_list:
                    print(f'{word}-{city_code}-第{page}页-爬取结束')
                    break
                for url in url_list:
                    xarcw.source_page = url
                    time.sleep(1)
                    data_list = xarcw.data_search(url, [
                        '//p[@class="cname"]/a/text()',
                        '//p[@class="msg ltype"]/text()',
                        '//div[@class="w706 clearfix"]/text()',
                        '//div[@class="w706 clearfix"]/img/@src',
                        '//div[@class="comadress clearfix"]/text()'
                    ])
                    # Postings without a company name or a phone image are
                    # skipped.  (An alternative-layout fallback used to follow
                    # this check but could never run because of it, so it was
                    # removed.)
                    if not data_list[0] or not data_list[3]:
                        continue
                    if data_list[1]:
                        company_info_list = [
                            i.strip('\xa0\xa0\n ')
                            for i in data_list[1][0].split('|')
                        ]
                        xarcw.company_type = company_info_list[0]
                        for j in company_info_list[1:]:
                            # Entries containing '-' are stored as the staff
                            # number, all others as the main product.
                            if '-' in j:
                                xarcw.staff_number = j
                            else:
                                xarcw.main_product = j
                    else:
                        xarcw.company_type = '-'
                    xarcw.company_name = data_list[0][0]
                    # First non-blank text node, or '-' when there is none.
                    xarcw.person_name = next(
                        (i for i in data_list[2] if i.strip()), '-')
                    try:
                        xarcw.phone_number = xarcw.use_tesseract(
                            url=data_list[3][0], lang=None)
                    except Exception:
                        # Phone image could not be read: skip this posting.
                        continue
                    if data_list[4]:
                        xarcw.address = data_list[4][0].strip(
                            '工作地点:\u3000\n ')
                    else:
                        xarcw.address = '-'
                    xarcw.data_save()
                    print(
                        f'{xarcw.company_name}-{xarcw.person_name}-{xarcw.phone_number}-导入完成'
                    )
    # Finish the run the same way every other spider in this file does.
    xarcw.spider_end()
def zggys_spider():
    """Crawl supplier contact data from cn.china.cn (selenium-assisted).

    For every category linked from the home page, listing pages are walked
    until one contains no company links.  For each company the profile fields
    are read from the static page.  Phone numbers found there are split into
    mobile numbers (starting with ``1``) and other numbers; when a phone
    entry is empty, the contact panel is opened through selenium instead.
    One record is saved per company.
    """
    zggys = SuperSpider(use_selenium=True)
    zggys.source = '中国供应商'
    zggys.website = '-'
    zggys.get_request('https://cn.china.cn/')
    # Materialised up front: the spider loads other pages below, and a lazily
    # evaluated search could otherwise run against a different page.
    url_list1 = [
        i + '?p={}' for i in zggys.data_search(
            'xpath',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a',
            'href')
    ]
    for url1 in url_list1:
        # NOTE(review): every category starts at page 10, not 1 — looks like
        # a leftover resume point; confirm before relying on full coverage.
        page = 10
        while True:
            print(f'第{page}页')
            try:
                zggys.get_request(url1.format(page))
            except Exception:
                print(f'获取第{page}页失败')
                page += 1
                continue
            # Must be a list: an un-consumed iterator is always truthy, so
            # the "no more companies" check below could never end the loop.
            url_list2 = list(zggys.data_search('find', 'h3.title a', 'href'))
            if not url_list2:
                break
            for url2 in url_list2:
                try:
                    zggys.get_request(url2)
                    zggys.company_name = next(
                        zggys.data_search('find', '.column_xx p a', 'title'))
                except Exception:
                    continue
                # Profile lines look like "label|value".
                company_info_dict = {
                    i.split('|')[0]: i.split('|')[1]
                    for i in next(zggys.data_search(
                        'find', '.business_xx')).split('\n') if '|' in i
                }
                zggys.business_mode = company_info_dict.get('经营模式', '-')
                zggys.register_money = company_info_dict.get('注册资本', '-')
                zggys.company_type = company_info_dict.get('企业类型', '-')
                zggys.main_product = company_info_dict.get('主营产品', '-')
                zggys.address = company_info_dict.get('公司地址', '-')
                zggys.person_name = next(
                    zggys.data_search('find', '.personal_top .t span'))
                phone_list = zggys.data_search('find', '.personal_bottom span')
                cell_phone_list = []
                phone_code_list = []
                for phone in phone_list:
                    if not phone:
                        # Empty entry: click the reveal button through
                        # selenium and read the contact panel instead.
                        js = 'var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
                        zggys.selenium_js(url2, js)
                        zggys.cell_phone = next(
                            zggys.selenium_search('css_selector',
                                                  '.inactive_top .number'))
                        phone_info_dict = {
                            i.split('\n')[0]: i.split('\n')[1].strip('QQ交谈')
                            for i in zggys.selenium_search(
                                'css_selector', '.inactive_right .txt p')
                        }
                        zggys.phone_code = phone_info_dict.get('电话', '-')
                        zggys.fax = phone_info_dict.get('传真', '-')
                        zggys.qq = phone_info_dict.get('Q  Q', '-')
                    elif not phone.startswith('1'):
                        phone_code_list.append(phone)
                    else:
                        cell_phone_list.append(phone)
                if cell_phone_list or phone_code_list:
                    zggys.phone_code = ('/'.join(phone_code_list)
                                        if phone_code_list else '-')
                    zggys.cell_phone = ('/'.join(cell_phone_list)
                                        if cell_phone_list else '-')
                    zggys.fax = '-'
                    zggys.qq = '-'
                zggys.data_save()
                print(f'中国供应商——{zggys.company_name}信息导入完成')
            page += 1
    zggys.spider_end()
Exemple #12
0
def _skb_reset_company_fields(skb):
    """Reset every per-company field to '-' so values never leak between companies."""
    for field in ('company_type', 'address', 'business_mode', 'website',
                  'person_name', 'register_money', 'main_product',
                  'phone_number', 'qq', 'mail'):
        setattr(skb, field, '-')


def _skb_close_detail_window(skb):
    """Close the company detail window and switch back to the remaining one."""
    skb.window_close()
    skb.switch_window(sleep_time=2)


def _skb_save_contacts(skb, contact_lines):
    """Copy contact details from the contact panel lines onto ``skb`` and save.

    ``contact_lines`` is the panel text split on newlines. A label line is
    followed by its value, except '选 择', which follows the phone number.
    A record is saved each time an '电子邮箱' label is reached.
    """
    def value_after(idx):
        # Guard against a label that is the very last line of the panel.
        return contact_lines[idx + 1] if idx + 1 < len(contact_lines) else '-'

    for idx, label in enumerate(contact_lines):
        if label == '选 择':
            # The phone number sits on the line just before the button text.
            if idx > 0:
                skb.phone_number = contact_lines[idx - 1]
        elif label == '联系人':
            skb.person_name = value_after(idx)
        elif label == 'qq号码':
            skb.qq = value_after(idx).strip(',')
        elif label == '电子邮箱':
            skb.mail = value_after(idx).strip(',')
            try:
                skb.data_save()
            except Exception as e:
                # Best effort: one failed insert must not abort the crawl.
                print(e)


def skb_spider(phone, passwd, word, page_now=1):
    """Crawl company contact data from lixiaoskb.com for one search keyword.

    Logs in through Selenium, searches for ``word``, then walks the result
    pages from ``page_now`` up to the hard-coded limit of 500. Each company
    is opened in a new window, its details are copied onto the spider and
    saved through ``data_save()``.

    Args:
        phone: Value typed into the first input of the login form.
        passwd: Value typed into the second input of the login form.
        word: Search keyword.
        page_now: Result page to start from (int or numeric string).

    Returns:
        ``(word, page)``: ``page`` is the page being processed when the
        remaining view count reached 0, or 500 once every page is done.
    """
    skb = SuperSpider(use_selenium=True)
    skb.source_name = '搜客宝'
    skb.fax = '-'
    skb.staff_number = '-'
    skb.selenium_open('https://biz.lixiaoskb.com/login')
    login_form = '//*[@id="app"]/div[1]/div/div/div[1]/div[2]/div[2]/form'
    skb.selenium_input('xpath', f'{login_form}/div[1]/div/div/div/input', phone)
    skb.selenium_input('xpath', f'{login_form}/div[2]/div/div/div/input',
                       passwd, enter=True, sleep_time=3)
    skb.selenium_js(['document.querySelector("#tab-0").click();'])
    skb.selenium_input('xpath', '//*[@id="searchDeInput"]/div[1]/div/input',
                       word, sleep_time=5, enter=True)

    all_page = 500
    # Normalise once so both the comparison and range() accept a numeric string.
    page_now = int(page_now)
    if page_now == all_page:
        print(f'{word}——所有数据爬取结束')
        skb.spider_end()
        return word, all_page

    reveal_js = ('var open_btn=document.querySelector(".mask-box .action span");'
                 'open_btn.click();')
    for page in range(page_now, all_page + 1):
        print(f'{word}——第{page}页')
        try:
            skb.selenium_scroll('//div[@id="jumpPage"]//input[@class="el-input__inner"]')
            skb.selenium_input('css_selector', '#jumpPage .el-input input',
                               page, sleep_time=2, enter=True)
        except Exception as e:
            print(e)
            continue
        url_list = skb.selenium_search(
            'xpath', '//div[@class="card"]//span[@class="name"]//a', attr='href')
        for url in url_list:
            skb.source_page = url
            _skb_reset_company_fields(skb)
            skb.selenium_js([f'window.open("{url}")'], sleep_time=3)
            skb.switch_window()
            try:
                skb.company_name = next(
                    skb.selenium_search('css_selector', '.top .name'))
            except Exception as e:
                print(e)
                _skb_close_detail_window(skb)
                continue

            try:
                basic_info = {}
                for text in skb.selenium_search('css_selector', '.line .group'):
                    # Split on the first colon only so URLs keep their scheme.
                    key, sep, value = text.partition(':')
                    if sep:
                        basic_info[key.strip()] = value.strip()
                skb.company_type = basic_info.get('公司类型', '-')
                skb.address = basic_info.get('通讯地址', '-')
                skb.business_mode = basic_info.get('所属行业', '-')
                skb.website = basic_info.get('官方网站', '-').strip('更多>> ')
            except Exception as e:
                # Basic info is optional; keep the '-' defaults.
                print(e)

            try:
                register_info = {
                    text.split('\n')[0].strip('/ '): text.split('\n')[-1].strip('/ ')
                    for text in skb.selenium_search('css_selector', '.gongshang-col')
                }
                skb.person_name = register_info.get('法人/负责人', '-')
                skb.register_money = register_info.get('注册资本', '-')
                skb.main_product = register_info.get('经营范围', '-')
            except Exception as e:
                # Registration info is optional; keep the '-' defaults.
                print(e)

            # Click the button that reveals the contact panel. If the click
            # fails, the panel is still parsed but the quota check is skipped.
            revealed = True
            try:
                skb.selenium_js([reveal_js], sleep_time=3)
            except Exception as e:
                print(e)
                revealed = False

            try:
                panels = skb.selenium_search('css_selector', '.el-scrollbar__view')
                contact_lines = list(panels)[1].split('\n')
            except Exception as e:
                print(e)
                _skb_close_detail_window(skb)
                continue

            _skb_save_contacts(skb, contact_lines)
            print(f'{word}——第{page}页——{skb.company_name}信息导入完成')

            if revealed:
                use_number = next(skb.selenium_search(
                    'css_selector', '.inner-user .viewCount:first-child'))
                print(use_number)
                if int(use_number) == 0:
                    print(f'{word}——第{page}页——今日次数已用完')
                    skb.spider_end()
                    return word, page
            _skb_close_detail_window(skb)

    skb.spider_end()
    print(f'{word}——所有数据爬取结束')
    return word, all_page
def _zgcpw_field(cells, label):
    """Return the cell that follows ``label`` in ``cells``, or '-' if unavailable."""
    try:
        return cells[cells.index(label) + 1]
    except (ValueError, IndexError):
        # Label missing, or it is the last cell and has no value after it.
        return '-'


def zgcpw_spider():
    """Crawl company profiles and contact details from www.pe168.com.

    Walks every category linked from the home page, then every listing page
    of that category, then every company on the page. For each company the
    "公司介绍" and "联系方式" pages are read and the fields are saved through
    ``data_save()``. A company whose name is among the last 35 names seen is
    skipped as a duplicate.
    """
    zgcpw = SuperSpider()
    # Bounded window of recently seen names, used to skip repeated listings.
    company_list = deque([], maxlen=35)
    zgcpw.source_name = '中国产品网'
    zgcpw.get_request('http://www.pe168.com/')
    url_list1 = zgcpw.data_search('find', 'td div:nth-child(2) a', 'href')
    profession_list = zgcpw.data_search('find', 'td div:nth-child(2) a')
    for profession, url1 in zip(profession_list, url_list1):
        try:
            zgcpw.get_request(url1)
            page_all = next(zgcpw.data_search('find', '.pages cite'))
            page_all_number = next(
                zgcpw.re_find(r'/(\d+)页', page_all)).group(1)
        except Exception as e:
            print(e)
            continue
        for page in range(1, int(page_all_number) + 1):
            print(f'{profession}——第{page}页')
            url2 = url1.replace('.html', f'-{page}.html')
            try:
                zgcpw.get_request(url2)
            except Exception as e:
                print(e)
                continue
            url_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a', 'href')
            company_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a')
            for company_name, url3 in zip(company_list3, url_list3):
                if company_name in company_list:
                    print('信息重复')
                    continue
                # Recorded once here; appending again on later failures would
                # only evict older names from the bounded window.
                company_list.append(company_name)
                zgcpw.company_name = company_name
                try:
                    zgcpw.get_request(url3)
                except Exception as e:
                    print(e)
                    continue
                zgcpw.source_page = url3

                try:
                    company_info_url = next(zgcpw.data_search(
                        'find', 'a[title="公司介绍"]', 'href'))
                    zgcpw.get_request(company_info_url)
                except Exception as e:
                    print(e)
                    continue
                company_info_list = list(
                    zgcpw.data_search('find',
                                      '.main_body:nth-last-child(1) td'))
                zgcpw.company_type = _zgcpw_field(company_info_list, '公司类型:')
                zgcpw.staff_number = _zgcpw_field(company_info_list, '公司规模:')
                zgcpw.register_money = _zgcpw_field(company_info_list, '注册资本:')
                zgcpw.business_mode = _zgcpw_field(company_info_list, '经营模式:')
                zgcpw.main_product = _zgcpw_field(company_info_list, '经营范围:')

                try:
                    phone_info_url = next(zgcpw.data_search(
                        'find', 'a[title="联系方式"]', 'href'))
                    zgcpw.get_request(phone_info_url)
                except Exception as e:
                    print(e)
                    continue
                phone_info_list = list(
                    zgcpw.data_search('find', '.px13.lh18 td'))
                zgcpw.address = _zgcpw_field(phone_info_list, '公司地址:')
                zgcpw.fax = _zgcpw_field(phone_info_list, '公司传真:')
                zgcpw.website = _zgcpw_field(phone_info_list, '公司网址:')
                zgcpw.person_name = _zgcpw_field(phone_info_list, '联 系 人:')

                # Two records per company: one with the landline, one with
                # the mobile number.
                # NOTE(review): the second record is saved even when no
                # mobile number is listed ('-'); confirm this is intended.
                zgcpw.phone_number = _zgcpw_field(phone_info_list, '公司电话:')
                zgcpw.data_save()
                zgcpw.phone_number = _zgcpw_field(phone_info_list, '手机号码:')
                zgcpw.data_save()
                print(f'{profession}——第{page}页——{company_name}导入完成')
    zgcpw.spider_end()