Exemple #1
0
def business_detail_spider():
    """Crawl per-stock brokerage rows from data.eastmoney.com and save them.

    Walks the paged ``ActiveStatistics`` listing for the spider's
    ``spider_date``. For every stock name not seen before in this run, the
    stock's ``lhb,<date>,<code>.html`` page is fetched and one row
    (``department_name`` and ``amount``) is saved per table entry via
    ``data_save()``.

    Listing pages that fail to download or parse are skipped. The crawl
    stops at the first empty page or once the page counter reaches the cap.
    """
    # NOTE(review): the database host and password are hard-coded in source;
    # consider loading them from configuration or the environment.
    business_detail = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='business_detail',
        field_list=('spider_date', 'up_date', 'code', 'name', 'department_name', 'amount'),
    )
    business_detail.up_date = business_detail.spider_date

    max_page = 500  # listing pages 1 .. max_page - 1 are visited
    seen_stocks = set()  # stock names already processed in this run
    page = 1
    # The cap lives in the loop condition so that a run of failing pages
    # cannot step past it and loop forever.
    while page < max_page:
        try:
            json_data = business_detail.use_requests_to_html(f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={business_detail.spider_date},endDate={business_detail.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25861061', 'GB2312')
            data_list = business_detail.json_to_py(json_data, deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list:
            break
        print(f'第{page}页')
        for data in data_list:
            if not data['SName']:
                continue
            for stock_data in business_detail.json_to_py(data['SName']):
                stock_name = stock_data['CodeName']
                if stock_name in seen_stocks:
                    continue
                seen_stocks.add(stock_name)
                business_detail.name = stock_name
                business_detail.code = stock_data['SCode']
                try:
                    # The detail URL uses only the first run of digits of the code.
                    url_code = next(business_detail.re_find(r'\d+', business_detail.code)).group()
                except Exception:
                    continue
                print(url_code)
                url = f'http://data.eastmoney.com/stock/lhb,{business_detail.spider_date},{url_code}.html'
                try:
                    business_detail.get_request(url)
                except Exception:
                    continue
                cells = list(business_detail.data_search('find', 'table tbody td'))
                # Cells are read in strides of 7: offset 1 is used as the
                # department name and offset 6 as the amount, for at most
                # ten entries. NOTE(review): the column meaning is assumed
                # from the attribute names; confirm against the page layout.
                for name_idx, amount_idx in zip(range(1, 71, 7), range(6, 71, 7)):
                    try:
                        department_name = cells[name_idx].split('\n')[0]
                        amount = cells[amount_idx]
                    except (IndexError, AttributeError):
                        # Table ended (or a cell is not text): no more rows.
                        break
                    business_detail.department_name = department_name
                    business_detail.amount = amount
                    business_detail.data_save()
                    print(f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成')
        page += 1
    business_detail.spider_end()
def zggys_spider():
    """Crawl company profiles from cn.china.cn and save one row per company.

    Category links are read from the home page. Each category is paged
    through with a ``?p=<page>`` query. Every company link on a listing page
    is opened and its profile fields and contact details are stored on the
    spider, then ``data_save()`` is called.

    A category ends when a listing page has no company links, or after
    several consecutive listing pages fail to load.
    """
    zggys = SuperSpider(use_selenium=True)
    zggys.source = '中国供应商'
    zggys.website = '-'
    zggys.get_request('https://cn.china.cn/')
    # Materialise the links now: later get_request() calls load other pages.
    category_urls = [
        href + '?p={}'
        for href in zggys.data_search('xpath', '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a', 'href')
    ]
    reveal_js = 'var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
    max_consecutive_failures = 5  # give up on a category after this many failed pages in a row

    for url1 in category_urls:
        # NOTE(review): paging starts at 10, not 1, so pages 1-9 of every
        # category are never visited. Kept as in the original; confirm intent.
        page = 10
        consecutive_failures = 0
        while True:
            print(f'第{page}页')
            try:
                zggys.get_request(url1.format(page))
            except Exception:
                print(f'获取第{page}页失败')
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    # Without this cap an unreachable site loops forever.
                    break
                page += 1
                continue
            consecutive_failures = 0
            # data_search results are iterators elsewhere in this function,
            # which are always truthy; a list makes the emptiness test work.
            company_urls = list(zggys.data_search('find', 'h3.title a', 'href'))
            if not company_urls:
                break
            for url2 in company_urls:
                try:
                    zggys.get_request(url2)
                    zggys.company_name = next(zggys.data_search('find', '.column_xx p a', 'title'))
                except Exception:
                    continue
                # Profile lines look like "<label>|<value>"; a missing block
                # simply yields no fields.
                business_text = next(zggys.data_search('find', '.business_xx'), '')
                company_info_dict = {}
                for line in business_text.split('\n'):
                    if '|' in line:
                        parts = line.split('|')
                        company_info_dict[parts[0]] = parts[1]
                zggys.business_mode = company_info_dict.get('经营模式', '-')
                zggys.register_money = company_info_dict.get('注册资本', '-')
                zggys.company_type = company_info_dict.get('企业类型', '-')
                zggys.main_product = company_info_dict.get('主营产品', '-')
                zggys.address = company_info_dict.get('公司地址', '-')
                zggys.person_name = next(zggys.data_search('find', '.personal_top .t span'), '-')

                # Reset contact fields so values from the previous company
                # cannot leak into this row when nothing is found.
                zggys.phone_code = '-'
                zggys.cell_phone = '-'
                zggys.fax = '-'
                zggys.qq = '-'
                phones = list(zggys.data_search('find', '.personal_bottom span'))
                cell_phone_list = []
                phone_code_list = []
                for phone in phones:
                    if phone:
                        # Numbers starting with '1' are treated as mobile numbers.
                        if phone.startswith('1'):
                            cell_phone_list.append(phone)
                        else:
                            phone_code_list.append(phone)
                        continue
                    # An empty entry means the number is hidden: click the
                    # reveal button in the browser and read the panel.
                    try:
                        zggys.selenium_js(url2, reveal_js)
                        zggys.cell_phone = next(zggys.selenium_search('css_selector', '.inactive_top .number'))
                        phone_info_dict = {}
                        for entry in zggys.selenium_search('css_selector', '.inactive_right .txt p'):
                            entry_lines = entry.split('\n')
                            # NOTE(review): str.strip removes any of these
                            # characters from both ends, not the literal
                            # suffix; kept as in the original.
                            phone_info_dict[entry_lines[0]] = entry_lines[1].strip('QQ交谈')
                    except Exception:
                        print(f'中国供应商——{zggys.company_name}联系方式获取失败')
                        continue
                    zggys.phone_code = phone_info_dict.get('电话', '-')
                    zggys.fax = phone_info_dict.get('传真', '-')
                    zggys.qq = phone_info_dict.get('Q  Q', '-')
                if cell_phone_list or phone_code_list:
                    # Numbers visible on the page take precedence over the
                    # values read through selenium.
                    zggys.phone_code = '/'.join(phone_code_list) if phone_code_list else '-'
                    zggys.cell_phone = '/'.join(cell_phone_list) if cell_phone_list else '-'
                    zggys.fax = '-'
                    zggys.qq = '-'
                zggys.data_save()
                print(f'中国供应商——{zggys.company_name}信息导入完成')
            page += 1
    zggys.spider_end()
def _cell_after(cells, label, default='-'):
    """Return the cell following *label* in *cells*, or *default* if absent."""
    try:
        return cells[cells.index(label) + 1]
    except (ValueError, IndexError):
        # Label missing, or it is the last cell and has no value after it.
        return default


def zgcpw_spider():
    """Crawl company profiles from www.pe168.com and save them.

    For every category link on the home page, each listing page
    (``-<page>.html``) is visited. Each company not among the most recently
    seen names has its "公司介绍" and "联系方式" pages read into spider
    attributes. Two rows are saved per company: the first with the value of
    '公司电话:' as ``phone_number``, the second with the value of '手机号码:'.

    Pages that fail to load, and companies lacking either sub-page link,
    are skipped.
    """
    zgcpw = SuperSpider()
    # Sliding window of recent company names, used to skip near duplicates.
    company_list = deque([], maxlen=35)
    zgcpw.source_name = '中国产品网'
    zgcpw.get_request('http://www.pe168.com/')
    # Materialise both lists before any further request replaces the page.
    url_list1 = list(zgcpw.data_search('find', 'td div:nth-child(2) a', 'href'))
    profession_list = list(zgcpw.data_search('find', 'td div:nth-child(2) a'))
    for profession, url1 in zip(profession_list, url_list1):
        try:
            zgcpw.get_request(url1)
            page_all = next(zgcpw.data_search('find', '.pages cite'))
            page_count = int(
                next(zgcpw.re_find(r'/(\d+)页', page_all)).group(1))
        except Exception:
            continue
        for page in range(1, page_count + 1):
            print(f'{profession}——第{page}页')
            url2 = url1.replace('.html', f'-{page}.html')
            try:
                zgcpw.get_request(url2)
            except Exception:
                continue
            url_list3 = list(zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a', 'href'))
            company_list3 = list(zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a'))
            for company_name, url3 in zip(company_list3, url_list3):
                if company_name in company_list:
                    print('信息重复')
                    continue
                # Recorded once only: repeated appends would push older
                # names out of the bounded window.
                company_list.append(company_name)
                zgcpw.company_name = company_name
                try:
                    zgcpw.get_request(url3)
                except Exception:
                    continue
                zgcpw.source_page = url3
                try:
                    company_info_url = next(zgcpw.data_search(
                        'find', 'a[title="公司介绍"]', 'href'))
                    # The contact link is read from the company home page,
                    # before navigating to the introduction page.
                    zgcpw.get_request(company_info_url)
                except Exception:
                    continue
                company_info_list = list(
                    zgcpw.data_search('find',
                                      '.main_body:nth-last-child(1) td'))
                zgcpw.company_type = _cell_after(company_info_list, '公司类型:')
                zgcpw.staff_number = _cell_after(company_info_list, '公司规模:')
                zgcpw.register_money = _cell_after(company_info_list, '注册资本:')
                zgcpw.business_mode = _cell_after(company_info_list, '经营模式:')
                zgcpw.main_product = _cell_after(company_info_list, '经营范围:')
                try:
                    phone_info_url = next(zgcpw.data_search(
                        'find', 'a[title="联系方式"]', 'href'))
                    zgcpw.get_request(phone_info_url)
                except Exception:
                    continue
                phone_info_list = list(
                    zgcpw.data_search('find', '.px13.lh18 td'))
                zgcpw.address = _cell_after(phone_info_list, '公司地址:')
                zgcpw.fax = _cell_after(phone_info_list, '公司传真:')
                zgcpw.website = _cell_after(phone_info_list, '公司网址:')
                zgcpw.person_name = _cell_after(phone_info_list, '联 系 人:')
                # First row: '公司电话:' value as phone_number.
                zgcpw.phone_number = _cell_after(phone_info_list, '公司电话:')
                zgcpw.data_save()
                # Second row: same company with the '手机号码:' value.
                zgcpw.phone_number = _cell_after(phone_info_list, '手机号码:')
                zgcpw.data_save()
                print(f'{profession}——第{page}页——{company_name}导入完成')
    zgcpw.spider_end()