Esempio n. 1
0
def ip_spider7():
    """Scrape proxy IPs from www.66ip.cn and store them in the ip_pool table."""
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    # The last-but-one pagination link carries the total page count.
    page_all = ip.data_search('http://www.66ip.cn/index.html',
                              '//div[@id="PageList"]//a[last()-1]/text()')[0]
    for page in range(1, int(page_all) + 1):
        # Rows come back as a flat text list; the first 5 items are the
        # table header, so skip them.
        data_list = ip.data_search(
            f'http://www.66ip.cn/{page}.html',
            '//div[@class="containerbox boxindex"]//table//tr//text()',
            'gbk')[5:]
        # Each record occupies 5 consecutive items: ip, port, address, ...
        for i in range(0, 10000, 5):
            try:
                ip.ip = f'http://{data_list[i]}:{data_list[i+1]}'
                ip.address = data_list[i + 2]
                ip.source_name = '66代理'
                ip.source_page = f'http://www.66ip.cn/{page}.html'
                ip.data_save()
                print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
            except IndexError:
                # Walking past the end of data_list marks the end of the page;
                # narrowed from a bare except so real save errors surface.
                break
    ip.spider_end()
Esempio n. 2
0
def ht_spider(start_time='2019-01-02 00:00:00', end_time='2019-04-02 00:00:00'):
    """Scrape call records from the EC2 telephony console via selenium.

    start_time / end_time bound the query window (format
    '%Y-%m-%d %H:%M:%S'); the results are stored in the ht_data table.
    """
    ht = SuperSpider(
        use_selenium=True,
        default_field='null',
        field_list=('start_time', 'call_duration', 'connect_duration',
                    'talk_duration', 'ring_duration', 'call_direction',
                    'connect_status', 'sound_file', 'customer_id', 'caller',
                    'called', 'caller_department', 'caller_number',
                    'caller_user_name', 'project_name', 'call_type'),
        table_name='ht_data')
    # Log in and drive the query form.
    ht.selenium_get(r'http://210.13.87.106:8088/ec2')
    ht.selenium_click('//td[@tabindex="-1"]//div[@class="v-captiontext"]', 3)
    ht.selenium_input('//input[@class="v-textfield"]', 'mgrdefault8', index=3)
    ht.selenium_input('//input[@class="v-textfield"]', 'fuyan2018', index=-1)
    ht.selenium_click('//span[@class="v-button-caption"]', 3, index=1)
    ht.selenium_click('//span[@class="v-nativebutton-caption"]', 3, index=2)
    ht.selenium_input('//input[@class="v-textfield v-datefield-textfield"]',
                      start_time, index=0)
    ht.selenium_input('//input[@class="v-textfield v-datefield-textfield"]',
                      end_time, index=0)
    ht.selenium_click('//div[@class="v-filterselect-button"]', index=2)
    ht.selenium_click('//td[@class="gwt-MenuItem"]/span', index=0)
    ht.selenium_click(
        '//div[@class="v-button v-button-default default"]//span[@class="v-button-caption"]',
        3, index=0)
    # The pager widget text contains the total page count ('…/N页').
    page_all = ht.selenium_search(
        '//*[@id="ec2-100180"]/div/div[2]/div/div[2]/div/div/div/div[1]/div/div/div/div[1]/div/div[2]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[7]/div/div'
    )[0]
    page_number = next(ht.re_find(r'/(\d+)页', page_all)).group(1)
    for page in range(1, int(page_number)):
        html = ht.page_source()
        data_list = ht.data_search(
            html=html, xpath='//td[@class="v-table-cell-content"]//text()')
        # 18 cells per table row; cells 8 and 9 are dropped so the remaining
        # 16 values line up with the 16 entries of field_list.
        for i, index1, index2 in zip(range(1, 1000), range(0, 1000, 18),
                                     range(18, 1000, 18)):
            split_list = data_list[index1:index2]
            if split_list:
                split_list.pop(8)
                split_list.pop(8)
                # setattr replaces the original exec()-built assignment:
                # identical effect, without dynamic code execution.
                for field, value in zip(ht.field_list, split_list):
                    setattr(ht, field, value)
                ht.data_save()
                print(f'第{page}页——第{i}条数据——导入完成')
            else:
                break
        # Advance the table to the next page.
        ht.selenium_click(
            '//*[@id="ec2-100180"]/div/div[2]/div/div[2]/div/div/div/div[1]/div/div/div/div[1]/div/div[2]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[3]/div/div/span/span',
            3)
    ht.spider_end()
Esempio n. 3
0
def ip_spider2():
    """Scrape proxy IPs from www.89ip.cn and store them in the ip_pool table."""
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '89免费代理'
    page = 1
    while True:
        ip.source_page = f'http://www.89ip.cn/index_{page}.html'
        data_list = ip.data_search(f'http://www.89ip.cn/index_{page}.html',
                                   '//table[@class="layui-table"]//td/text()')
        if not data_list:
            # An empty table means we walked past the last page.
            break
        print(f'第{page}页')
        # Each record occupies 5 consecutive cells: ip, port, address, ...
        for i in range(0, 75, 5):
            try:
                ip_value = data_list[i].strip(' \n\t')
                ip_port = data_list[i + 1].strip(' \n\t')
                ip.ip = f"http://{ip_value}:{ip_port}"
                ip.address = data_list[i + 2].strip(' \n\t')
            except IndexError:
                # Short final page: no more rows. Narrowed from a bare
                # except so unrelated errors are not silently hidden.
                break
            ip.data_save()
            print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
        page += 1
        time.sleep(2)
    ip.spider_end()
Esempio n. 4
0
def ip_spider5():
    """Scrape proxy IPs from ip.kxdaili.com and store them in the ip_pool table."""
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '开心代理'
    page = 1
    while True:
        ip.source_page = f'http://ip.kxdaili.com/ipList/{page}.html#ip'
        data_list = ip.data_search(
            f'http://ip.kxdaili.com/ipList/{page}.html#ip',
            '//table[@class="ui table segment"]//td/text()')
        if not data_list:
            break
        # Each record occupies 7 consecutive cells; cell i+3 lists the
        # supported protocols, cell i+5 the address.
        for i in range(0, 70, 7):
            try:
                ip.address = data_list[i + 5]
                # One row can list several protocols ('HTTP,HTTPS').
                h_list = data_list[i + 3].split(',')
            except IndexError:
                # Short final page: fewer rows than the fixed range assumes.
                # Guard added for consistency with the other ip_spider*
                # functions, which also break on IndexError.
                break
            for h in h_list:
                ip.ip = f'{h.lower()}://{data_list[i]}:{data_list[i+1]}'
                ip.data_save()
                print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
        page += 1
    ip.spider_end()
Esempio n. 5
0
def profession_report_spider():
    """Scrape industry research reports (行业研报) from eastmoney.

    Only reports newer than the latest up_date already present in the
    profession_report table are saved; scraping stops at the first stale
    record.
    """
    profession_report = SuperSpider(
        table_name='profession_report',
        field_list=('name', 'spider_date', 'up_date', 'up_down', 'report',
                    'grade', 'grade_change', 'institution'))
    sql1 = 'select MAX(up_date) from profession_report'
    latest_time = profession_report.sql_search(sql1)[0][0]
    if not latest_time:
        # Empty table: only look back one day.
        latest_datetime = datetime.now() - timedelta(days=1)
    else:
        latest_datetime = datetime(latest_time.year, latest_time.month,
                                   latest_time.day)
    is_end = False
    for page in range(1, 1337):
        url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HYSR&mkt=0&stat=0&cmd=4&code=&sc=&ps=50&p=' + str(
            page
        ) + '&js=var%20vMcgaFDg={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&rt=51553086'
        try:
            json_data = profession_report.use_requests_to_html(url, 'utf8')
            data_list = profession_report.json_to_py(json_data,
                                                     deal=True)['data']
        except Exception:
            # Narrowed from a bare except; the original also incremented the
            # for-loop variable here, which had no effect and was removed.
            print(f'第{page}页获取失败')
            continue
        for data in data_list:
            data = data.split(',')
            # data[1] looks like 'YYYY/MM/DD HH:MM:SS'; keep the date part.
            time1 = data[1].split(' ')[0].replace('/', '-')
            datetime1 = datetime.strptime(time1, '%Y-%m-%d')
            if datetime1 <= latest_datetime:
                print('暂无数据更新')
                is_end = True
                break
            infocode = data[2]
            time2 = time1.replace('-', '')
            try:
                profession_report.get_request(
                    f'http://data.eastmoney.com/report/{time2}/{infocode}.html'
                )
            except Exception:
                continue
            # Concatenate every paragraph of the report body.
            report = ''.join(
                profession_report.data_search('find', '.newsContent p'))
            profession_report.name = data[10]
            profession_report.up_date = time1
            profession_report.up_down = profession_report.to_null(data[11])
            profession_report.report = report
            profession_report.grade = data[7]
            profession_report.grade_change = data[0]
            profession_report.institution = data[4]
            profession_report.data_save()
            print(
                f'行业研报:{profession_report.up_date}-{profession_report.name}-{profession_report.institution}-导入完成'
            )
        if is_end:
            break
    profession_report.spider_end()
    print('end:行业研报')
Esempio n. 6
0
def department_count_spider():
    """Scrape the 30-day securities sales-department ranking into department_count.

    Rows already stored for the same name and spider_date are deleted first
    so the run is idempotent; names already seen during this run are skipped.
    """
    department_count_list = []
    department_count = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='department_count',
        field_list=('spider_date', 'up_date', 'name', 'list_time', 'buy_time',
                    'buy_sum', 'sell_time', 'sell_sum'))
    month_ago = department_count.date_ago(30)
    page = 1
    while True:
        try:
            json_data = department_count.get_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/TraderStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={department_count.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754789',
                'GB2312')
            data_list = department_count.json_to_py(json_data,
                                                    deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        # page == 500 acts as a hard safety cap on the pagination loop.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            department_count.up_date = department_count.spider_date
            department_count.name = data['SalesName']
            if department_count.name in department_count_list:
                print(f'{department_count.name}-数据重复')
                continue
            department_count_list.append(department_count.name)
            # NOTE(review): the name comes from the remote feed and is
            # interpolated straight into SQL — parameterized queries would
            # be safer if SuperSpider supports them.
            sql = f'select name from department_count where name="{department_count.name}" and spider_date="{department_count.spider_date}"'
            same_data = department_count.sql_search(sql)
            if same_data:
                department_count.sql_search(
                    f'delete from department_count where name="{department_count.name}" and spider_date="{department_count.spider_date}"'
                )
                print(
                    f'重新爬取-{department_count.spider_date}-{department_count.name}'
                )
            department_count.list_time = department_count.to_null(
                data['UpCount'])
            department_count.buy_time = department_count.to_null(
                data['BCount'])
            department_count.buy_sum = department_count.to_null(
                data['SumActBMoney'])
            department_count.sell_time = department_count.to_null(
                data['SCount'])
            department_count.sell_sum = department_count.to_null(
                data['SumActSMoney'])
            department_count.data_save()
            print(
                f'证券营业部上榜统计:{department_count.up_date}-{department_count.name}-导入完成'
            )
        page += 1
    department_count.spider_end()
    print('end:证券营业部上榜统计')
Esempio n. 7
0
def zjmyqyw_spdier():
    """Scrape company contact and profile data from www.zj123.com.

    A bounded deque of recently-seen company names skips duplicate listings.
    Scraping starts from the '特种印刷' profession onwards (earlier
    categories presumably handled in a previous run — TODO confirm).
    """
    company_deque = deque([], maxlen=35)
    zjmyqyw = SuperSpider()
    zjmyqyw.source_name = '浙江名营企业网'
    zjmyqyw.fax = '-'
    zjmyqyw.get_request('http://www.zj123.com/')
    # Category links end in '1.htm'; turn them into page-number templates.
    url_list1 = [
        'http://www.zj123.com/' + i.replace('1.', '{}.')
        for i in zjmyqyw.data_search('find', '.indsort dd a', 'href')
    ]
    profession_list = list(zjmyqyw.data_search('find', '.indsort dd a'))
    error_index = profession_list.index('特种印刷')
    for profession, url1 in zip(profession_list[error_index:],
                                url_list1[error_index:]):
        for page in range(1, 100):
            print(f'{profession}——第{page}页')
            try:
                zjmyqyw.get_request(url1.format(page))
                # The page header echoes the current page number.
                page_judge = next(
                    zjmyqyw.data_search('find',
                                        '.sleft .m.m1 .fred')).split()[0]
            except Exception:
                print(f'获取第{page}页失败')
                continue
            if int(page_judge) != page:
                # Reported page no longer matches the requested one: we are
                # past the last page of this category.
                break
            url_list2 = ('http://www.zj123.com/member/VIPContact/' +
                         i.split('-')[1] + '/index.htm'
                         for i in zjmyqyw.data_search(
                             'find', '.listdetail22 .listdetail dt a', 'href'))
            url_list3 = ('http://www.zj123.com/member/VIPCompany/' +
                         i.split('-')[1] + '/index.htm'
                         for i in zjmyqyw.data_search(
                             'find', '.listdetail22 .listdetail dt a', 'href'))
            for url2, url3 in zip(url_list2, url_list3):
                try:
                    zjmyqyw.get_request(url2)
                except Exception:
                    continue
                # Contact table rows are 'label:value' strings.
                contact_info_dict = {
                    i.split(':')[0].strip():
                    i.split(':')[-1].strip().replace('\xa0', '')
                    for i in zjmyqyw.data_search('find', '.rkbody table tr')
                }
                zjmyqyw.company_name = contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-'
                if zjmyqyw.company_name in company_deque:
                    print('信息重复')
                    continue
                zjmyqyw.person_name = contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-'
                zjmyqyw.address = contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-'
                zjmyqyw.phone_number = contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-'
                zjmyqyw.qq = contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-'
                zjmyqyw.website = contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-'
                try:
                    zjmyqyw.get_request(url3)
                except Exception:
                    continue
                company_info_list = list(
                    zjmyqyw.data_search('find', '.rkbody table tr td'))
                # Profile table alternates label / value cells.
                company_info_dict = {
                    company_info_list[n].strip(': '):
                    company_info_list[n + 1].strip(': ')
                    for n in range(0, 24, 2)
                }
                zjmyqyw.main_product = company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-'
                zjmyqyw.business_mode = company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-'
                zjmyqyw.company_type = company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-'
                # BUG FIX: the original immediately overwrote register_money
                # with the '员工人数' (staff count) value; keep the registered
                # capital, which is what the field name says it holds.
                zjmyqyw.register_money = company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-'
                zjmyqyw.source_page = url2
                zjmyqyw.data_save()
                # Save a second record carrying the mobile number.
                zjmyqyw.phone_number = contact_info_dict['手机'] if contact_info_dict['手机'] else '-'
                zjmyqyw.data_save()
                company_deque.append(zjmyqyw.company_name)
                print(f'{profession}——第{page}页——{zjmyqyw.company_name}信息导入完成')
    zjmyqyw.spider_end()
Esempio n. 8
0
def stock_count_spider():
    """Scrape the 30-day per-stock dragon-tiger list statistics into stock_count.

    Rows already stored for the same code, spider_date and up_date are
    deleted first so the run is idempotent; (up_date, code) pairs already
    seen during this run are skipped.
    """
    stock_count_list = []
    stock_count = SuperSpider(host='47.102.40.81',
                              passwd='Abc12345',
                              db='bryframe',
                              table_name='stock_count',
                              field_list=('spider_date', 'up_date', 'code',
                                          'name', 'list_time', 'buy_sum',
                                          'sell_sum', 'buy_amount'))
    month_ago = stock_count.date_ago(30)
    page = 1
    while True:
        try:
            json_data = stock_count.get_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/StockStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={stock_count.spider_date},gpfw=0,js=var%20data_tab_3.html?rt=25754758',
                'GB2312')
            data_list = stock_count.json_to_py(json_data, deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        # page == 500 acts as a hard safety cap on the pagination loop.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            stock_count.up_date = data['Tdate']
            stock_count.code = data['SCode']
            stock_count.name = data['SName']
            if (stock_count.up_date, stock_count.code) in stock_count_list:
                print(
                    f'{stock_count.up_date}-{stock_count.code}-{stock_count.name}-数据重复'
                )
                continue
            stock_count_list.append((stock_count.up_date, stock_count.code))
            sql = f'select code from stock_count where code="{stock_count.code}" and spider_date="{stock_count.spider_date}" and up_date="{stock_count.up_date}"'
            same_data = stock_count.sql_search(sql)
            if same_data:
                stock_count.sql_search(
                    f'delete from stock_count where code="{stock_count.code}" and spider_date="{stock_count.spider_date}" and up_date="{stock_count.up_date}"'
                )
                print(
                    f'重新爬取-{stock_count.spider_date}-{stock_count.code}-{stock_count.name}'
                )
            stock_count.list_time = stock_count.to_null(data['SumCount'])
            stock_count.buy_sum = stock_count.to_null(data['Bmoney'])
            stock_count.sell_sum = stock_count.to_null(data['Smoney'])
            stock_count.buy_amount = stock_count.to_null(data['JmMoney'])
            stock_count.data_save()
            print(
                f'个股龙虎榜统计:{stock_count.up_date}-{stock_count.code}-{stock_count.name}-导入完成'
            )
        page += 1
    stock_count.spider_end()
    print('end:个股龙虎榜统计')
Esempio n. 9
0
def lhb_rank_spider():
    """Scrape the current day's dragon-tiger list ranking into lhb_rank.

    Rows already stored for the same code and spider_date are deleted first
    so the run is idempotent; codes already seen during this run are skipped.
    """
    lhb_rank_list = []
    lhb_rank = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='lhb_rank',
        field_list=('spider_date', 'up_date', 'code', 'name', 'close_price',
                    'up_down', 'buy_amount', 'change_rate', 'currency_market'))
    page = 1
    while True:
        try:
            json_data = lhb_rank.get_html(
                url=
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/TradeDetail/pagesize=200,page={page},sortRule=-1,sortType=,startDate={lhb_rank.spider_date},endDate={lhb_rank.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754497',
                charset='GB2312')
            data_list = lhb_rank.json_to_py(json_data, deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        # page == 500 acts as a hard safety cap on the pagination loop.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            lhb_rank.up_date = lhb_rank.spider_date
            lhb_rank.code = data['SCode']
            lhb_rank.name = data['SName']
            if lhb_rank.code in lhb_rank_list:
                print(f'{lhb_rank.code}-{lhb_rank.name}-数据重复')
                continue
            lhb_rank_list.append(lhb_rank.code)
            sql = f'select code from lhb_rank where code="{lhb_rank.code}" and spider_date="{lhb_rank.spider_date}"'
            same_data = lhb_rank.sql_search(sql)
            if same_data:
                lhb_rank.sql_search(
                    f'delete from lhb_rank where code="{lhb_rank.code}" and spider_date="{lhb_rank.spider_date}"'
                )
                print(
                    f'重新爬取-{lhb_rank.spider_date}-{lhb_rank.code}-{lhb_rank.name}'
                )
            lhb_rank.close_price = lhb_rank.to_null(data['ClosePrice'])
            lhb_rank.up_down = lhb_rank.to_null(data['Chgradio'])
            lhb_rank.buy_amount = lhb_rank.to_null(data['JmMoney'])
            lhb_rank.change_rate = lhb_rank.to_null(data['Dchratio'])
            lhb_rank.currency_market = lhb_rank.to_null(data['Ltsz'])
            lhb_rank.data_save()
            print(
                f'当日龙虎榜涨跌幅排名:{lhb_rank.up_date}-{lhb_rank.code}-{lhb_rank.name}-导入完成'
            )
        page += 1
    lhb_rank.spider_end()
    print('end:龙虎榜当日跌幅排名')
Esempio n. 10
0
def active_department_spider():
    """Scrape the daily active sales-department list into active_department.

    Each department row may reference several stocks (the 'SName' field is
    itself JSON); one database row is saved per referenced stock, or a
    single 'null' row when no stock is attached.
    """
    active_department = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='active_department',
        field_list=('spider_date', 'up_date', 'name', 'buy_number',
                    'sell_number', 'buy_sum', 'sell_sum', 'business_amount',
                    'code', 'stock_name'))
    page = 1
    while True:
        try:
            json_data = active_department.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={active_department.spider_date},endDate={active_department.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754772',
                'GB2312')
            data_list = active_department.json_to_py(json_data,
                                                     deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        # page == 500 acts as a hard safety cap on the pagination loop.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            active_department.up_date = active_department.spider_date
            active_department.name = data['YybName']
            active_department.buy_number = active_department.to_null(
                data['YybBCount'])
            active_department.sell_number = active_department.to_null(
                data['YybSCount'])
            active_department.buy_sum = active_department.to_null(
                data['Bmoney'])
            active_department.sell_sum = active_department.to_null(
                data['Smoney'])
            active_department.business_amount = active_department.to_null(
                data['JmMoney'])
            if not data['SName']:
                active_department.code = 'null'
                active_department.stock_name = 'null'
                active_department.data_save()
            else:
                for data_s in active_department.json_to_py(data['SName']):
                    active_department.code = data_s['SCode']
                    active_department.stock_name = data_s['CodeName']
                    active_department.data_save()
                    print(
                        f'每日活跃营业部:{active_department.up_date}-{active_department.name}-导入完成'
                    )
        page += 1
    active_department.spider_end()
    print('end:每日活跃营业部')
Esempio n. 11
0
def stock_info_spider():
    """Scrape daily stock quotes (行情中心) from eastmoney into stock_info.

    Rows already stored for the same code and spider_date are deleted first
    so the run is idempotent; codes already seen during this run are skipped.
    """
    stock_info_list = []
    stock_info = SuperSpider(host='47.102.40.81',
                             passwd='Abc12345',
                             db='bryframe',
                             table_name='stock_info',
                             field_list=('code', 'name', 'spider_date',
                                         'up_date', 'highest', 'lowest',
                                         'today', 'yesterday'))
    for page in range(1, 181):
        try:
            json_data = stock_info.get_html(
                f'http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?cb=jQuery11240974473783255319_1545290975192&type=CT&token=4f1862fc3b5e77c150a2b985b12db0fd&sty=FCOIATC&js=(%7Bdata%3A%5B(x)%5D%2CrecordsFiltered%3A(tot)%7D)&cmd=C._A&st=(ChangePercent)&sr=-1&p={page}&ps=20&_=1545290975206'
            )
            data_list = stock_info.json_to_py(json_data, deal=True)['data']
        except Exception:
            # Narrowed from a bare except; the original also incremented the
            # for-loop variable here, which had no effect and was removed.
            print(f'第{page}页获取失败')
            continue
        print(f'第{page}页')
        for data_str in data_list:
            # '-' presumably marks missing values in the feed; it is mapped
            # to the literal 'null' before splitting — TODO confirm.
            data = data_str.replace('-', 'null').split(',')
            stock_info.code = data[1]
            stock_info.name = data[2]
            if stock_info.code in stock_info_list:
                print(f'{stock_info.code}-{stock_info.name}-数据重复')
                continue
            stock_info_list.append(stock_info.code)
            sql = f'select code from stock_info where code="{stock_info.code}" and spider_date="{stock_info.spider_date}"'
            same_data = stock_info.sql_search(sql)
            if same_data:
                stock_info.sql_search(
                    f'delete from stock_info where code="{stock_info.code}" and spider_date="{stock_info.spider_date}"'
                )
                print(
                    f'重新爬取-{stock_info.spider_date}-{stock_info.code}-{stock_info.name}'
                )
            # (A redundant self-assignment of spider_date was removed here.)
            stock_info.up_date = stock_info.spider_date
            stock_info.highest = stock_info.to_null(data[9])
            stock_info.lowest = stock_info.to_null(data[10])
            stock_info.today = stock_info.to_null(data[11])
            stock_info.yesterday = stock_info.to_null(data[12])
            stock_info.data_save()
            print(
                f'行情中心:{stock_info.up_date}-{stock_info.code}-{stock_info.name}-导入完成'
            )
    stock_info.spider_end()
    print('end:行情中心')
Esempio n. 12
0
def zjmyqyw():
    """Scrape company contact and profile data from www.zj123.com.

    Walks every category from the homepage, pages through each listing and
    saves one record per company.
    """
    zjmyqyw = SuperSpider()
    zjmyqyw.source = '浙江名营企业网'
    zjmyqyw.fax = '-'
    zjmyqyw.get_request('http://www.zj123.com/')
    # Category links end in '1.htm'; turn them into page-number templates.
    url_list1 = ('http://www.zj123.com/' + i.replace('1.', '{}.')
                 for i in zjmyqyw.data_search('find', '.indsort dd a', 'href'))
    for url1 in url_list1:
        page = 1
        while True:
            print(f'第{page}页')
            zjmyqyw.get_request(url1.format(page))
            # The page header echoes the current page number; stop when it
            # no longer matches the requested page.
            page_judge = next(
                zjmyqyw.data_search('find', '.sleft .m.m1 .fred')).split()[0]
            if int(page_judge) != page:
                break
            print(page_judge)
            url_list2 = ('http://www.zj123.com/member/VIPContact/' +
                         i.split('-')[1] + '/index.htm'
                         for i in zjmyqyw.data_search(
                             'find', '.listdetail22 .listdetail dt a', 'href'))
            url_list3 = ('http://www.zj123.com/member/VIPCompany/' +
                         i.split('-')[1] + '/index.htm'
                         for i in zjmyqyw.data_search(
                             'find', '.listdetail22 .listdetail dt a', 'href'))
            for url2, url3 in zip(url_list2, url_list3):
                zjmyqyw.get_request(url2)
                # Contact table rows are 'label:value' strings.
                contact_info_dict = {
                    i.split(':')[0].strip():
                    i.split(':')[-1].strip().replace('\xa0', '')
                    for i in zjmyqyw.data_search('find', '.rkbody table tr')
                }
                zjmyqyw.company_name = contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-'
                zjmyqyw.person_name = contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-'
                zjmyqyw.address = contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-'
                zjmyqyw.phone_code = contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-'
                zjmyqyw.cell_phone = contact_info_dict['手机'] if contact_info_dict['手机'] else '-'
                zjmyqyw.qq = contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-'
                zjmyqyw.website = contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-'
                zjmyqyw.get_request(url3)
                company_info_list = list(
                    zjmyqyw.data_search('find', '.rkbody table tr td'))
                # Profile table alternates label / value cells.
                company_info_dict = {
                    company_info_list[n].strip(': '):
                    company_info_list[n + 1].strip(': ')
                    for n in range(0, 24, 2)
                }
                zjmyqyw.main_product = company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-'
                zjmyqyw.business_mode = company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-'
                zjmyqyw.company_type = company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-'
                zjmyqyw.register_money = company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-'
                zjmyqyw.data_save()
                print(f'浙江企业网——{zjmyqyw.company_name}信息导入完成')
            page += 1
    zjmyqyw.spider_end()
#zjmyqyw()

# test_obj=SuperSpider()
# js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
# test_obj.use_selenium()
# test_obj.selenium_js('https://www.china.cn/shukongjichuang/3746553522.html',js)
# test_obj.cell_phone=test_obj.selenium_search('css_selector','.inactive_top .number').__next__()
# print('aaaaaaa')
# print(test_obj.cell_phone)
Esempio n. 13
0
def business_detail_spider():
    """Scrape per-stock dragon-tiger trade details from eastmoney into business_detail."""
    stock_list = []
    business_detail = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='business_detail',
        field_list=('spider_date', 'up_date', 'code', 'name',
                    'department_name', 'amount'))
    business_detail.up_date = business_detail.spider_date
    page = 1
    while True:
        try:
            json_data = business_detail.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={business_detail.spider_date},endDate={business_detail.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25861061',
                'GB2312')
            data_list = business_detail.json_to_py(json_data,
                                                   deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        # page == 500 acts as a hard safety cap on the pagination loop.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            # 'SName' is itself a JSON list of stocks touched by this row.
            if not data['SName']:
                continue
            stock_data_list = business_detail.json_to_py(data['SName'])
            for stock_data in stock_data_list:
                if stock_data['CodeName'] in stock_list:
                    continue
                stock_list.append(stock_data['CodeName'])
                business_detail.name = stock_data['CodeName']
                business_detail.code = stock_data['SCode']
                try:
                    # Extract the numeric part of the stock code for the URL.
                    url_code = next(
                        business_detail.re_find(
                            r'\d+', business_detail.code)).group()
                except Exception:
                    continue
                print(url_code)
                url = f'http://data.eastmoney.com/stock/lhb,{business_detail.spider_date},{url_code}.html'
                try:
                    business_detail.get_request(url)
                except Exception:
                    continue
                detail_data_list = list(
                    business_detail.data_search('find', 'table tbody td'))
                # Each detail row spans 7 cells: cell 1 holds the department
                # name, cell 6 the amount.
                for i, j in zip(range(1, 71, 7), range(6, 71, 7)):
                    try:
                        business_detail.department_name = detail_data_list[
                            i].split('\n')[0]
                    except IndexError:
                        # Fewer than 10 rows on the page: stop.
                        break
                    business_detail.amount = detail_data_list[j]
                    business_detail.data_save()
                    print(f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成')
        page += 1
    business_detail.spider_end()
Esempio n. 14
0
def department_track_spider():
    """Scrape the 30-day institutional-seat buy/sell tracking into department_track."""
    department_track = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='department_track',
        field_list=('spider_date', 'up_date', 'code', 'name', 'list_time',
                    'buy_sum', 'buy_time', 'sell_time', 'buy_amount',
                    'up_down'))
    month_ago = department_track.date_ago(30)
    page = 1
    while True:
        try:
            json_data = department_track.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/JgStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={department_track.spider_date},gpfw=0,js=var%20data_tab_3.html?rt=25754592',
                'GB2312')
            data_list = department_track.json_to_py(json_data,
                                                    deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        # page == 500 acts as a hard safety cap on the pagination loop.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            department_track.up_date = department_track.spider_date
            department_track.code = data['SCode']
            department_track.name = data['SName']
            department_track.list_time = department_track.to_null(
                data['UPCount'])
            department_track.buy_sum = department_track.to_null(
                data['JGBMoney'])
            department_track.buy_time = department_track.to_null(
                data['JGBCount'])
            department_track.sell_time = department_track.to_null(
                data['JGSCount'])
            department_track.buy_amount = department_track.to_null(
                data['JGPBuy'])
            department_track.up_down = department_track.to_null(
                data['RChange1M'])
            department_track.data_save()
            print(
                f'机构席位买卖追踪:{department_track.up_date}-{department_track.code}-{department_track.name}-导入完成'
            )
        page += 1
    department_track.spider_end()
    print('end:机构席位买卖追踪')
Esempio n. 15
0
def stock_data_spider():
    """Scrape rights-issue stock data from eastmoney.com into `stock_data`.

    Pages through the JS data interface, stopping when the endpoint starts
    repeating the last page, returns no data, or page 500 is reached.
    Rows already present for the same (code, spider_date, up_date) key are
    deleted and re-imported.

    Side effects: network requests plus inserts/deletes in the remote
    `stock_data` table; progress is printed to stdout.
    """
    data_end = None  # first 3 records of the previous page, to detect repeats
    stock_data = SuperSpider(host='139.224.115.44',
                             passwd='A9Vg+Dr*nP^fR=1V',
                             db='bryframe3',
                             table_name='stock_data',
                             field_list=('spider_date', 'up_date', 'code',
                                         'name', 'stock_rate', 'stock_price'))
    page = 1
    while True:
        print(f'第{page}页')
        url = 'http://datainterface.eastmoney.com/EM_DataCenter/JS.aspx?type=NS&sty=NSA&st=6&sr=-1&p=' + str(
            page) + '&ps=50&js=var%20inHqdtrZ={pages:(pc),data:[(x)]}&rt=5174'
        try:
            json_data = stock_data.get_html(url)
            data_list = stock_data.json_to_py(json_data, deal=True)['data']
            # The endpoint keeps serving the final page forever: stop once the
            # leading records match the previous page's.
            if data_list[:3] == data_end:
                break
            else:
                data_end = data_list[:3]
        except Exception:  # narrowed from bare except so Ctrl-C still works
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:  # 500 is a hard safety cap
            break
        for data in data_list:
            # Each record is a comma-separated string; field positions below
            # follow the endpoint's fixed column order.
            field_list = data.split(',')
            stock_data.code = field_list[2]
            stock_data.name = field_list[3]
            stock_data.stock_rate = '10配' + field_list[6]
            stock_data.stock_price = stock_data.to_null(field_list[7])
            stock_data.up_date = field_list[14] if field_list[14] else 'null'
            # NOTE(review): SQL is built by f-string interpolation from remote
            # data; safe only while the source stays numeric codes/dates.
            sql = f'select code from stock_data where code="{stock_data.code}" and spider_date="{stock_data.spider_date}" and up_date="{stock_data.up_date}"'
            same_data = stock_data.sql_search(sql)
            if same_data:
                # Replace the existing row so a re-run refreshes the data.
                stock_data.sql_search(
                    f'delete from stock_data where code="{stock_data.code}" and spider_date="{stock_data.spider_date}" and up_date="{stock_data.up_date}"'
                )
                print(
                    f'重新爬取-{stock_data.spider_date}-{stock_data.code}-{stock_data.name}'
                )
            stock_data.data_save()
            print(
                f'{stock_data.up_date}-{stock_data.code}-{stock_data.name}-导入完成'
            )
        page += 1
    stock_data.spider_end()
Esempio n. 16
0
def ip_spider4():
    """Scrape free proxies from ip.seofangfa.com into the `ip_pool` table.

    Side effects: one HTTP request plus DB inserts; progress printed.
    """
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '方法SEO'
    ip.source_page = 'https://ip.seofangfa.com/'
    data_list = ip.data_search('https://ip.seofangfa.com/',
                               '//table[@class="table"]//td/text()')
    # The table flattens to 5 cells per row; cell i is the IP, i+1 the port
    # and i+3 the address.  Keep the original cap of 250 cells (50 rows) but
    # never index past the data actually returned — the fixed range used to
    # raise IndexError on short pages.
    for i in range(0, min(250, len(data_list)), 5):
        if i + 3 >= len(data_list):
            break  # incomplete trailing row
        ip.ip = f'http://{data_list[i]}:{data_list[i+1]}'
        ip.address = data_list[i + 3]
        ip.data_save()
        print(f'{ip.source_name}-{ip.ip}-导入完成')
    ip.spider_end()
Esempio n. 17
0
def institution_business_spider():
    """Scrape today's institutional buy/sell statistics from eastmoney.com.

    Pages through the data-center endpoint for the current `spider_date`
    until it returns no data or the 500-page safety cap is hit, saving one
    `institution_business` row per stock.

    Side effects: network requests plus DB inserts; progress printed.
    """
    institution_business = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='institution_business',
        field_list=('spider_date', 'up_date', 'code', 'name', 'buy_number',
                    'sell_number', 'buy_sum', 'sell_sum', 'buy_amount'))
    page = 1
    while True:
        try:
            json_data = institution_business.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/DailyStockListStatistics/pagesize=50,page={page},sortRule=-1,sortType=PBuy,startDate={institution_business.spider_date},endDate={institution_business.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754580',
                'GB2312')
            data_list = institution_business.json_to_py(json_data,
                                                        deal=True)['data']
        except Exception:  # narrowed from bare except so Ctrl-C still works
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:  # 500 is a hard safety cap
            break
        print(f'第{page}页')
        for data in data_list:
            institution_business.up_date = institution_business.spider_date
            institution_business.code = data['SCode']
            institution_business.name = data['SName']
            # to_null() normalises missing values for the DB layer.
            institution_business.buy_number = institution_business.to_null(
                data['BSL'])
            institution_business.sell_number = institution_business.to_null(
                data['SSL'])
            institution_business.buy_sum = institution_business.to_null(
                data['BMoney'])
            institution_business.sell_sum = institution_business.to_null(
                data['SMoney'])
            institution_business.buy_amount = institution_business.to_null(
                data['PBuy'])
            institution_business.data_save()
            print(
                f'机构买卖情况:{institution_business.up_date}-{institution_business.code}-{institution_business.name}-导入完成'
            )
        page += 1
    institution_business.spider_end()
    print('end:机构买卖情况')
Esempio n. 18
0
def xarcw_spider():
    """Populate the `post_tag` join table with 2-5 random tags per post.

    Reads every id from `post` and `tag`, then links each post to a random
    sample of tags.  Despite the name, this function no longer crawls
    goodjobs.cn — the login code was dead and has been removed, along with
    the unused Faker instance and word list.

    Side effects: DB reads/inserts; progress printed.
    """
    xarcw = SuperSpider(db='supery',
                        table_name='post_tag',
                        default_field='null',
                        field_list=('post_id', 'tag_id'))
    post_list = xarcw.sql_search('select id from post')
    tag_list = xarcw.sql_search('select id from tag')
    number_list = (2, 3, 4, 5)
    for post in post_list:
        number = random.choice(number_list)
        # sample() keeps the tags distinct within one post.
        aim_tag = random.sample(tag_list, number)
        for tag in aim_tag:
            xarcw.post_id = post[0]
            xarcw.tag_id = tag[0]
            xarcw.data_save()
            print(f'{xarcw.post_id}-{xarcw.tag_id}-导入完成')
    # Close the spider like every sibling does; the original leaked the
    # DB connection by omitting this.
    xarcw.spider_end()
Esempio n. 19
0
def ip_spider1():
    """Scrape free proxies from kuaidaili.com (pages 1-99) into `ip_pool`.

    Side effects: one HTTP request per page plus DB inserts; sleeps 10s
    between pages; progress printed.
    """
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    ip.source_name = '快代理'
    for page in range(1, 100):
        print(f'第{page}页')
        ip.source_page = f'https://www.kuaidaili.com/free/inha/{page}/'
        data_list = ip.data_search(
            f'https://www.kuaidaili.com/free/inha/{page}/',
            '//table[@class="table table-bordered table-striped"]//td/text()')
        # Each table row flattens to 7 cells: cell i is the IP, i+1 the port,
        # i+4 the location.  At most 15 rows (105 cells) per page.
        for i in range(0, 105, 7):
            try:
                ip.ip = f'http://{data_list[i]}:{data_list[i+1]}'
                ip.address = data_list[i + 4]
            except IndexError:  # page had fewer than 15 rows — done with it
                break
            ip.data_save()
            print(f'{ip.source_name}-第{page}页-{ip.ip}-导入完成')
        time.sleep(10)  # throttle between pages
    ip.spider_end()
Esempio n. 20
0
def wl114_spider():
    """Scrape company contact details from net114.com (网络114).

    Two passes over the homepage product-centre links:
      1. plain '.html' category links (url_list1/profession_list1), resumed
         from the hard-coded '维护工具' entry;
      2. '更多>>' hub links (url_list2) whose sub-category pages are crawled
         the same way.
    For each company detail page the contact-phone page is fetched and one
    row is saved per phone field found ('手机', then '联系电话').

    Side effects: many HTTP requests plus DB inserts; progress printed.
    """
    wl114 = SuperSpider()
    wl114.source_name = '网络114'
    # Fields this site never provides are pre-filled with '-'.
    wl114.business_mode = '-'
    wl114.register_money = '-'
    wl114.website = '-'
    wl114.qq = '-'
    wl114.get_request('http://www.net114.com/')
    # Direct category links; '.html' -> '-p-{}.html' template for paging.
    url_list1 = [
        i.replace('.html', '-p-{}.html') for i in wl114.data_search(
            'xpath',
            '//*[@id="product_center_content"]/div/ul/li/p/a',
            attr='href') if i.endswith('.html')
    ]
    profession_list1 = [
        i for i in wl114.data_search(
            'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a')
        if i != '更多>>'
    ]
    # Manual resume point: skip categories before '维护工具'.
    error_index = profession_list1.index('维护工具')
    url_list2 = (i for i in wl114.data_search(
        'xpath',
        '//*[@id="product_center_content"]/div/ul/li/p/a',
        attr='href') if not i.endswith('.html'))
    # NOTE(review): profession_list2 is built but never consumed.
    profession_list2 = (i for i in wl114.data_search(
        'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a')
                        if i == '更多>>')
    for url1, profession1 in zip(url_list1[error_index:],
                                 profession_list1[error_index:]):
        try:
            wl114.get_request(url1.format(1))
            # Total page count parsed from the pager text of page 1.
            all_page = wl114.data_search(
                'find', '.page_p:not(span)').__next__().split('\xa0')[1]
        except:
            continue
        for page in range(1, int(all_page) + 1):
            print(f'{profession1}——第{page}页')
            try:
                wl114.get_request(url1.format(page))
            except:
                continue
            # Company detail-page links on this listing page.
            url_list3 = list(
                wl114.data_search('find', '.product_list_div_h143 h2 a',
                                  'href'))
            if not url_list3:
                break
            for url3 in url_list3:
                try:
                    wl114.get_request(url3)
                    # Sidebar 'label:value' lines -> dict.
                    company_info_dict = {
                        i.split(':')[0].strip(): i.split(':')[-1].strip()
                        for i in wl114.data_search(
                            'find', '.right.w_250 .border.p_8 li') if ':' in i
                    }
                    # Link to the separate contact-phone page.
                    phone_url = wl114.data_search(
                        'find', '.right.w_250 .border.p_8 li a',
                        'href').__next__()
                except:
                    continue
                wl114.company_type = company_info_dict.get('企业性质', '-')
                wl114.main_product = company_info_dict.get('企业主营', '-')
                wl114.address = company_info_dict.get('企业地址', '-')
                try:
                    wl114.get_request(phone_url)
                except:
                    continue
                phone_info_data = wl114.data_search(
                    'find', 'td[valign="top"]:first-child')
                try:
                    phone_info_list = phone_info_data.__next__().split('\n')
                    phone_info_dict = {
                        i.split(':')[0].strip(): i.split(':')[-1].strip()
                        for i in phone_info_list if ':' in i
                    }
                except:
                    continue
                wl114.company_name = phone_info_dict.get('公司名称', '-')
                if wl114.company_name == '-':
                    wl114.company_name = phone_info_dict.get('企业名称', '-')
                wl114.person_name = phone_info_dict.get('联系人', '-')
                wl114.fax = phone_info_dict.get('传真', '-')
                # Save one row per phone field: mobile first, then landline.
                wl114.phone_number = phone_info_dict.get('手机', '-')
                wl114.source_page = url3
                wl114.data_save()
                wl114.phone_number = phone_info_dict.get('联系电话', '-')
                wl114.data_save()
                print(f'{profession1}——第{page}页——{wl114.company_name}信息导入完成')
            page += 1  # NOTE(review): no-op — `page` is the for-loop variable
    # Second pass: '更多>>' hub pages listing further sub-categories.
    for url2 in url_list2:
        try:
            wl114.get_request(url2)
        except:
            continue
        url_list4 = (i.replace('.html', '-p-{}.html')
                     for i in wl114.data_search(
                         'find', '.product_w369_list a[href]', 'href'))
        profession_list4 = wl114.data_search('find',
                                             '.product_w369_list a[href]')
        for profession4, url4 in zip(profession_list4, url_list4):
            try:
                wl114.get_request(url4.format(1))
                all_page = wl114.data_search(
                    'find', '.page_p:not(span)').__next__().split('\xa0')[1]
            except:
                continue
            for page in range(1, int(all_page) + 1):
                print(f'{profession4}——第{page}页')
                try:
                    wl114.get_request(url4.format(page))
                except:
                    continue
                url_list3 = list(
                    wl114.data_search('find', '.product_list_div_h143 h2 a',
                                      'href'))
                if not url_list3:
                    break
                # Same per-company extraction as the first pass above.
                for url3 in url_list3:
                    try:
                        wl114.get_request(url3)
                        company_info_dict = {
                            i.split(':')[0].strip(): i.split(':')[-1].strip()
                            for i in wl114.data_search(
                                'find', '.right.w_250 .border.p_8 li')
                            if ':' in i
                        }
                        phone_url = wl114.data_search(
                            'find', '.right.w_250 .border.p_8 li a',
                            'href').__next__()
                    except:
                        continue
                    wl114.company_type = company_info_dict.get('企业性质', '-')
                    wl114.main_product = company_info_dict.get('企业主营', '-')
                    wl114.address = company_info_dict.get('企业地址', '-')
                    try:
                        wl114.get_request(phone_url)
                    except:
                        continue
                    phone_info_data = wl114.data_search(
                        'find', 'td[valign="top"]:first-child')
                    try:
                        phone_info_list = phone_info_data.__next__().split(
                            '\n')
                        phone_info_dict = {
                            i.split(':')[0].strip(): i.split(':')[-1].strip()
                            for i in phone_info_list if ':' in i
                        }
                    except:
                        continue
                    wl114.company_name = phone_info_dict.get('公司名称', '-')
                    if wl114.company_name == '-':
                        wl114.company_name = phone_info_dict.get('企业名称', '-')
                    wl114.person_name = phone_info_dict.get('联系人', '-')
                    wl114.fax = phone_info_dict.get('传真', '-')
                    wl114.phone_number = phone_info_dict.get('手机', '-')
                    wl114.source_page = url3
                    wl114.data_save()
                    wl114.phone_number = phone_info_dict.get('联系电话', '-')
                    wl114.data_save()
                    print(
                        f'{profession4}——第{page}页——{wl114.company_name}信息导入完成')
                page += 1  # NOTE(review): no-op — `page` is the for-loop variable
    wl114.spider_end()
Esempio n. 21
0
def zggys_spider():
    """Scrape company/contact data from cn.china.cn via rotating proxies.

    Walks every profession category from the homepage (resumed at the
    hard-coded '睡袋' entry), pages through each category's listings, and
    saves one row per phone number found on a company detail page.
    Proxies come from the local `ip_pool` table; every fetch is retried
    with up to 20 random proxies.

    Side effects: many HTTP requests plus DB inserts; progress printed.
    """
    zggys = SuperSpider(host='192.168.0.172', default_field='-')
    zggys.source_name = '中国供应商'
    proxies_list = zggys.sql_search('select ip from ip_pool')
    url_list1 = [
        i + '?p={}' for i in zggys.data_search(
            'https://cn.china.cn/',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/@href'
        )
    ]
    profession_list = zggys.data_search(
        'https://cn.china.cn/',
        '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/text()',
        'GBK')
    # Manual resume point: skip categories that were already scraped.
    error_index = profession_list.index('睡袋')
    for url1, profession in zip(url_list1[error_index:],
                                profession_list[error_index:]):
        page = 1
        while True:
            time.sleep(2)
            print(f'{profession}——第{page}页')
            # Fetch the listing page, rotating through up to 20 random
            # proxies.  Fixes two defects of the original loop: it now
            # breaks on the first success instead of refetching 20 times,
            # and url_list2 can no longer be unbound when every attempt
            # fails (which raised NameError below).
            url_list2 = None
            for _ in range(20):
                proxies = random.choice(proxies_list)[0]
                print(f'使用代理-{proxies}')
                key = 'http' if not proxies.startswith('https') else 'https'
                try:
                    url_list2 = zggys.data_search(
                        url1.format(page),
                        '//ul[@class="extension_ul"]//h3[@class="title"]/a/@href',
                        'GBK',
                        proxies={key: proxies},
                        timeout=5)
                except Exception as error:
                    print(error)
                    continue
                break  # success — stop rotating proxies
            if not url_list2:
                print(f'{profession}——第{page}页——没有数据')
                break
            for url2 in url_list2:
                # Same proxy-rotation pattern for each company detail page.
                for _ in range(20):
                    try:
                        time.sleep(2)
                        proxies = random.choice(proxies_list)[0]
                        print(f'使用代理-{proxies}')
                        key = 'http' if not proxies.startswith(
                            'https') else 'https'
                        html = zggys.get_html(url2,
                                              charset='GBK',
                                              proxies={key: proxies},
                                              timeout=5)
                        zggys.source_page = url2
                        if zggys.data_search(
                                html=html,
                                xpath='//div[@class="column_xx"]//p//a/text()'
                        ):
                            zggys.company_name = zggys.data_search(
                                html=html,
                                xpath='//div[@class="column_xx"]//p//a/text()'
                            )[0]
                        # Flattened 'label, value, label, value…' text list.
                        company_info_list = [
                            i for i in zggys.data_search(
                                html=html,
                                xpath='//ul[@class="business_xx"]//li//text()')
                            if i.strip('\r\n |')
                        ]
                    except Exception as error:
                        print(error)
                        continue
                    else:
                        # Each label is followed by its value in the list;
                        # a missing label simply keeps the '-' default.
                        try:
                            aim_index = company_info_list.index('经营模式')
                            zggys.business_mode = company_info_list[aim_index +
                                                                    1]
                        except (ValueError, IndexError):
                            pass
                        try:
                            aim_index = company_info_list.index('注册资本')
                            zggys.register_money = company_info_list[
                                aim_index + 1].strip()
                        except (ValueError, IndexError):
                            pass
                        try:
                            aim_index = company_info_list.index('企业类型')
                            zggys.company_type = company_info_list[aim_index +
                                                                   1]
                        except (ValueError, IndexError):
                            pass
                        try:
                            aim_index = company_info_list.index('主营产品')
                            zggys.main_product = company_info_list[aim_index +
                                                                   1]
                        except (ValueError, IndexError):
                            pass
                        try:
                            aim_index = company_info_list.index('公司地址')
                            zggys.address = company_info_list[aim_index + 1]
                        except (ValueError, IndexError):
                            pass
                        try:
                            zggys.person_name = zggys.data_search(
                                html=html,
                                xpath=
                                '//div[@class="personal_top"]//div[@class="t"]//span/text()'
                            )[0]
                        except Exception:
                            pass
                        phone_list = zggys.data_search(
                            html=html,
                            xpath='//div[@class="personal_bottom"]//span/text()'
                        )
                        if not phone_list:
                            # Pages that hide the number behind a JS click
                            # are skipped (a selenium fallback was abandoned).
                            break
                        for phone in phone_list:
                            zggys.phone_number = phone.strip()
                            zggys.data_save()
                        print(
                            f'{profession}—第{page}页—{zggys.company_name}信息导入完成'
                        )
                    break
            page += 1
    zggys.spider_end()
Esempio n. 22
0
def skb_spider(phone,passwd,word,page_now=1):
	"""Scrape company contact data for keyword `word` from biz.lixiaoskb.com.

	Logs in with `phone`/`passwd` via selenium, searches `word`, then walks
	result pages from `page_now` up to a fixed cap of 500.  Each company is
	opened in a new browser tab, parsed, saved, and the tab closed.

	Returns a (word, last_page) tuple: the cap page on normal completion, or
	the current page when the account's daily view quota hits zero.
	"""
	skb=SuperSpider(use_selenium=True)
	skb.source_name='搜客宝'
	skb.fax='-'
	skb.staff_number='-'
	skb.selenium_open('https://biz.lixiaoskb.com/login')
	# Login form: phone, then password (Enter submits).
	skb.selenium_input('xpath','//*[@id="app"]/div[1]/div/div/div[1]/div[2]/div[2]/form/div[1]/div/div/div/input',phone)
	skb.selenium_input('xpath','//*[@id="app"]/div[1]/div/div/div[1]/div[2]/div[2]/form/div[2]/div/div/div/input',passwd,enter=True,sleep_time=3)
	js3='document.querySelector("#tab-0").click();'
	skb.selenium_js([js3])
	skb.selenium_input('xpath','//*[@id="searchDeInput"]/div[1]/div/input',word,sleep_time=5,enter=True)
	all_page=500  # hard cap; the real result count is never read
	if int(page_now) == int(all_page):
		print(f'{word}——所有数据爬取结束')
		skb.spider_end()
		return word,int(all_page)
	for page in range(page_now,int(all_page)+1):
		print(f'{word}——第{page}页')
		try:
			# Jump directly to `page` via the pager's input box.
			skb.selenium_scroll('//div[@id="jumpPage"]//input[@class="el-input__inner"]')
			skb.selenium_input('css_selector','#jumpPage .el-input input',page,sleep_time=2,enter=True)
		except Exception as e:
			print(e)
			continue
		url_list=skb.selenium_search('xpath',f'//div[@class="card"]//span[@class="name"]//a',attr='href')
		for url in url_list:
			skb.source_page=url
			# Open the company page in a new tab and switch to it.
			js1=f'window.open("{url}")'
			skb.selenium_js([js1],sleep_time=3)
			skb.switch_window()
			try:
				skb.company_name=skb.selenium_search('css_selector','.top .name').__next__()
			except Exception as e:
				print(e)
				skb.window_close()
				skb.switch_window(sleep_time=2)
				continue
			try:
				# 'label:value' summary lines at the top of the page.
				company_info_dict1={i.split(':')[0].strip():i.split(':')[-1].strip() for i in skb.selenium_search('css_selector','.line .group')}
				skb.company_type=company_info_dict1.get('公司类型','-')
				skb.address=company_info_dict1.get('通讯地址','-')
				business_mode=company_info_dict1.get('所属行业','-')  # NOTE(review): assigned but never used/saved
				skb.website=company_info_dict1.get('官方网站','-').strip('更多>> ')
			except:
				pass
			try:
				# Business-registration table: first line is the label.
				company_info_dict2={i.split('\n')[0].strip('/ '):i.split('\n')[-1].strip('/ ') for i in skb.selenium_search('css_selector','.gongshang-col')}
				skb.person_name=company_info_dict2.get('法人/负责人','-')
				skb.register_money=company_info_dict2.get('注册资本','-')
				skb.main_product=company_info_dict2.get('经营范围','-')
			except:
				pass
			# Click the 'reveal contacts' overlay button.
			js2='var open_btn=document.querySelector(".mask-box .action span");open_btn.click();'
			try:
				skb.selenium_js([js2],sleep_time=3)
			except Exception as e:
				# Overlay missing/already open: parse contacts anyway.
				# NOTE(review): this branch duplicates the success path below.
				print(e)
				phone_list=[]  # NOTE(review): unused
				qq_list=[]  # NOTE(review): unused
				try:
					phone_info=skb.selenium_search('css_selector','.el-scrollbar__view')
					phone_info_list=list(phone_info)[1].split('\n')
				except Exception as e:
					print(e)
					skb.window_close()
					skb.switch_window(sleep_time=2)
					continue
				#print(phone_info_list)
				# Contact list is a flat label/value token stream.
				for i,j in enumerate(phone_info_list):
					if j == '选 择':
						skb.phone_number=phone_info_list[i-1]
					elif j == '联系人':
						skb.person_name=phone_info_list[i+1]
					elif j == 'qq号码':
						skb.qq=phone_info_list[i+1].strip(',')
					elif j == '电子邮箱':
						skb.mail=phone_info_list[i+1].strip(',')
						# NOTE(review): data_save() only fires when an
						# '电子邮箱' token is present — verify intended.
						try:
							skb.data_save()
						except:
							continue
				print(f'{word}——第{page}页——{skb.company_name}信息导入完成')
				skb.window_close()
				skb.switch_window(sleep_time=2)
				continue
			phone_list=[]  # NOTE(review): unused
			qq_list=[]  # NOTE(review): unused
			try:
				phone_info=skb.selenium_search('css_selector','.el-scrollbar__view')
				phone_info_list=list(phone_info)[1].split('\n')
			except Exception as e:
				print(e)
				skb.window_close()
				skb.switch_window(sleep_time=2)
				continue
			#print(phone_info_list)
			for i,j in enumerate(phone_info_list):
				if j == '选 择':
					skb.phone_number=phone_info_list[i-1]
				elif j == '联系人':
					skb.person_name=phone_info_list[i+1]
				elif j == 'qq号码':
					skb.qq=phone_info_list[i+1].strip(',')
				elif j == '电子邮箱':
					skb.mail=phone_info_list[i+1].strip(',')
					try:
						skb.data_save()
					except:
						continue
			print(f'{word}——第{page}页——{skb.company_name}信息导入完成')
			# Remaining daily view quota; 0 means stop and resume tomorrow.
			use_number=skb.selenium_search('css_selector','.inner-user .viewCount:first-child').__next__()
			print(use_number)
			if int(use_number) == 0:
				print(f'{word}——第{page}页——今日次数已用完')
				skb.spider_end()
				return word,page
			skb.window_close()
			skb.switch_window(sleep_time=2)
	skb.spider_end()
	print(f'{word}——所有数据爬取结束')
	return word,int(all_page)
Esempio n. 23
0
def stock_report_spider():
    """Scrape new individual-stock research reports from eastmoney.com.

    The feed is newest-first, so the crawl stops as soon as a report dated
    on or before the latest `up_date` already stored in `stock_report` is
    seen; on an empty table only reports newer than yesterday are taken.

    Side effects: network requests plus DB inserts; progress printed.
    """
    stock_report = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='stock_report',
        field_list=('code', 'name', 'spider_date', 'up_date', 'report',
                    'grade', 'grade_change', 'institution', 'income_2018',
                    'rate_2018', 'income_2019', 'rate_2019'))
    sql1 = 'select MAX(up_date) from stock_report'
    latest_time = stock_report.sql_search(sql1)[0][0]
    if not latest_time:
        latest_datetime = datetime.now() - timedelta(days=1)
    else:
        latest_datetime = datetime(latest_time.year, latest_time.month,
                                   latest_time.day)
    is_end = False
    for page in range(1, 254):
        url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20MILbIdwm={"data":[(x)],"pages":"(pc)","update":"(ud)","count":"(count)"}&ps=50&p=' + str(
            page) + '&mkt=0&stat=0&cmd=2&code=&rt=51552935'
        try:
            json_data = stock_report.use_requests_to_html(url, 'utf8')
            data_list = stock_report.json_to_py(json_data, deal=True)['data']
        except Exception:  # narrowed from bare except so Ctrl-C still works
            print(f'第{page}页获取失败')
            continue
        for data in data_list:
            time1 = data['datetime'][:10]
            datetime1 = datetime.strptime(time1, '%Y-%m-%d')
            if datetime1 <= latest_datetime:
                # Everything from here on is already in the DB.
                print('暂无数据更新')
                is_end = True
                break
            infocode = data['infoCode']
            time2 = time1.replace('-', '')
            try:
                stock_report.get_request(
                    f'http://data.eastmoney.com/report/{time2}/{infocode}.html'
                )
            except Exception:
                continue
            # Full report text: all paragraphs of the article body joined
            # (was a quadratic `report = report + par` loop).
            report = ''.join(
                stock_report.data_search('find', '#ContentBody .newsContent p'))
            stock_report.code = data['secuFullCode']
            stock_report.name = data['secuName']
            stock_report.up_date = stock_report.spider_date
            stock_report.report = report
            stock_report.grade = data['rate']
            stock_report.grade_change = data['change']
            stock_report.institution = data['insName']
            # sys/syls hold [2018, 2019] earnings and P/E forecasts.
            stock_report.income_2018 = stock_report.to_null(data['sys'][0])
            stock_report.rate_2018 = stock_report.to_null(data['syls'][0])
            stock_report.income_2019 = stock_report.to_null(data['sys'][1])
            stock_report.rate_2019 = stock_report.to_null(data['syls'][1])
            stock_report.data_save()
            print(
                f'个股研报:{stock_report.spider_date}-{stock_report.code}-{stock_report.name}-导入完成'
            )
        if is_end:
            break
    stock_report.spider_end()
    print('end:个股研报')
Esempio n. 24
0
def bonus_data_spider():
    """Scrape dividend/bonus plans of the last year from eastmoney.com.

    Collects every reporting period within the last 365 days, pages through
    each period's plan list, and saves one `bonus_data` row per stock.
    Rows already present for the same (code, spider_date,
    latest_announce_date) key are deleted and re-imported.

    Side effects: network requests plus DB inserts/deletes; progress printed.
    """
    bonus_data = SuperSpider(
        host='139.224.115.44',
        passwd='A9Vg+Dr*nP^fR=1V',
        db='bryframe3',
        table_name='bonus_data',
        field_list=('spider_date', 'bonus_report_date', 'code', 'name',
                    'cash_bonus_rate', 'transfer_rate', 'plan_announce_date',
                    'stock_register_date', 'remove_date', 'plan_scheduler',
                    'latest_announce_date'))
    # Reporting periods offered by the site, newest first.
    date_list = bonus_data.data_search(
        'http://data.eastmoney.com/yjfp/201812.html',
        '//select[@id="sel_bgq"]/option/text()', 'gb2312')
    year_ago_datetime = bonus_data.to_datetime(bonus_data.date_ago(365))
    date_list2 = []
    for aim_date in date_list:
        if year_ago_datetime <= bonus_data.to_datetime(str(aim_date)):
            date_list2.append(aim_date)
        else:
            break  # list is sorted newest-first; everything after is older
    for use_date in date_list2:
        bonus_data.bonus_report_date = use_date
        page = 1
        while True:
            print(f'第{page}页')
            try:
                json_data = bonus_data.get_html(
                    f'http://data.eastmoney.com/DataCenter_V3/yjfp/getlist.ashx?js=var%20aTnZIWfZ&pagesize=50&page={page}&sr=-1&sortType=YAGGR&mtk=%C8%AB%B2%BF%B9%C9%C6%B1&filter=(ReportingPeriod=^{use_date}^)&rt=51742239',
                    'GB2312')
                data_list = bonus_data.json_to_py(json_data, deal=True)['data']
            except Exception:  # narrowed from bare except so Ctrl-C works
                print(f'第{page}页获取失败')
                page += 1
                continue
            if not data_list or page == 500:  # 500 is a hard safety cap
                break
            for data in data_list:
                bonus_data.code = data['Code']
                bonus_data.name = data['Name']
                bonus_data.latest_announce_date = bonus_data.to_null(
                    data['NoticeDate'][:10])
                sql = f'select code from bonus_data where code="{bonus_data.code}" and spider_date="{bonus_data.spider_date}" and latest_announce_date="{bonus_data.latest_announce_date}"'
                same_data = bonus_data.sql_search(sql)
                if same_data:
                    # Replace the existing row so a re-run refreshes the data.
                    bonus_data.sql_search(
                        f'delete from bonus_data where code="{bonus_data.code}" and spider_date="{bonus_data.spider_date}" and latest_announce_date="{bonus_data.latest_announce_date}"'
                    )
                    print(
                        f'重新爬取-{bonus_data.spider_date}-{bonus_data.code}-{bonus_data.name}'
                    )
                bonus_data.plan_announce_date = bonus_data.to_null(
                    data['ResultsbyDate'][:10])
                bonus_data.stock_register_date = bonus_data.to_null(
                    data['GQDJR'][:10])
                bonus_data.remove_date = bonus_data.to_null(data['CQCXR'][:10])
                bonus_data.plan_scheduler = data['ProjectProgress']
                # The plan text, e.g. '10转5送3派1.2' — parse the three parts.
                group_data = data['AllocationPlan']
                try:
                    bonus_data.cash_bonus_rate = '10' + next(
                        bonus_data.re_find(r'派[\d\.]+',
                                           group_data)).group() + '元(含税)'
                except Exception:
                    bonus_data.cash_bonus_rate = 'null'
                try:
                    transfer_rate1 = next(
                        bonus_data.re_find(r'转[\d\.]+', group_data)).group()
                except Exception:
                    transfer_rate1 = ''
                try:
                    transfer_rate2 = next(
                        bonus_data.re_find(r'送[\d\.]+', group_data)).group()
                except Exception:
                    transfer_rate2 = ''
                if not transfer_rate1 and not transfer_rate2:
                    bonus_data.transfer_rate = 'null'
                else:
                    bonus_data.transfer_rate = '10' + transfer_rate2 + transfer_rate1
                bonus_data.data_save()
                print(
                    f'{bonus_data.bonus_report_date}-{bonus_data.code}-{bonus_data.name}-导入完成'
                )
            page += 1
    bonus_data.spider_end()
Esempio n. 25
0
def profession_report_spider():
    """Crawl industry research reports ("行业研报") from eastmoney.com.

    Pages through the JSON feed and stops as soon as it sees a report no
    newer than the latest ``up_date`` already stored in the
    ``profession_report`` table (assumes the feed is newest-first — the
    original code relied on this too).  Fetches each report's full text
    and stores every record through ``SuperSpider.data_save()``.

    Side effects only (DB inserts); returns ``None``.
    """
    profession_report = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='profession_report',
        field_list=('name', 'spider_date', 'up_date', 'up_down', 'report',
                    'grade', 'grade_change', 'institution'))
    # Resume point: anything at or before this date counts as already stored.
    latest_time = profession_report.sql_search(
        'select MAX(up_date) from profession_report')[0][0]
    if not latest_time:
        # Empty table: use "yesterday" as the cut-off so the first run
        # still imports data.
        latest_datetime = datetime.now() - timedelta(days=1)
    else:
        latest_datetime = datetime(latest_time.year, latest_time.month,
                                   latest_time.day)
    is_end = False
    for page in range(1, 1337):
        url = (
            'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx'
            f'?type=SR&sty=HYSR&mkt=0&stat=0&cmd=4&code=&sc=&ps=50&p={page}'
            '&js=var%20vMcgaFDg={%22data%22:[(x)],%22pages%22:%22(pc)%22,'
            '%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&rt=51553086')
        try:
            json_data = profession_report.get_html(url)
            data_list = profession_report.json_to_py(json_data,
                                                     deal=True)['data']
        except Exception as error:
            # Network / JSON failure for this page: report it and move on.
            # (The original also did a useless `page += 1` here — the for
            # loop rebinds `page` every iteration.)
            print(f'第{page}页获取失败')
            print(error)
            continue
        for data in data_list:
            data = data.split(',')
            time1 = data[1].split(' ')[0].replace('/', '-')
            profession_report.name = data[10]
            profession_report.up_date = time1
            if datetime.strptime(time1, '%Y-%m-%d') <= latest_datetime:
                print('暂无数据更新')
                is_end = True
                break
            infocode = data[2]
            time2 = time1.replace('-', '')
            profession_report.up_down = profession_report.to_null(data[11])
            # Reset first so a failed fetch cannot silently reuse the
            # previous record's report text (bug in the original code).
            # 'null' matches the project's null convention — TODO confirm
            # against SuperSpider's default_field handling.
            profession_report.report = 'null'
            try:
                profession_report.report = ''.join(
                    profession_report.data_search(
                        f'http://data.eastmoney.com/report/{time2}/{infocode}.html',
                        '//div[@class="newsContent"]/text()',
                        'gb2312')).strip()
            except Exception:
                pass  # best effort: save the record even without full text
            profession_report.grade = data[7]
            profession_report.grade_change = data[0]
            profession_report.institution = data[4]
            profession_report.data_save()
            print(
                f'行业研报:{profession_report.up_date}-{profession_report.name}-{profession_report.institution}-导入完成'
            )
        if is_end:
            break
    profession_report.spider_end()
    print('end:行业研报')
Esempio n. 26
0
def zggys_spider():
	"""Crawl supplier/company contact info from cn.china.cn (中国供应商).

	Walks every category listing from the home page, then every company
	detail page, scraping the company profile and contact numbers.  When
	a number is hidden, Selenium clicks the reveal button and reads the
	contact panel instead.  Records go through SuperSpider.data_save();
	returns None.

	NOTE(review): `page` starts at 10, not 1 — looks like a resume point
	left over from a previous run; confirm before reuse.
	NOTE(review): if a listing page keeps failing, the bare `except`
	around get_request makes the `while True` loop spin forever.
	"""
	zggys=SuperSpider(use_selenium=True)
	zggys.source='中国供应商'
	zggys.website='-'
	zggys.get_request('https://cn.china.cn/')
	# Category links from the home page; each becomes a pageable URL template.
	url_list1=(i+'?p={}' for i in zggys.data_search('xpath','//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a','href'))
	for url1 in url_list1:
		page=10
		while True:
			print(f'第{page}页')
			try:
				zggys.get_request(url1.format(page))
			except:
				print(f'获取第{page}页失败')
				page+=1
				continue
			# Company detail links on this listing page; none -> past the last page.
			url_list2=zggys.data_search('find','h3.title a','href')
			if not url_list2:
				break
			for url2 in url_list2:
				try:
					zggys.get_request(url2)
					zggys.company_name=zggys.data_search('find','.column_xx p a','title').__next__()
				except:
					continue
				# Profile block lines look like "label|value"; build a label -> value map.
				company_info_list=(i for i in zggys.data_search('find','.business_xx').__next__().split('\n') if '|' in i)
				company_info_dict={i.split('|')[0]:i.split('|')[1] for i in company_info_list}
				zggys.business_mode=company_info_dict.get('经营模式','-')
				zggys.register_money=company_info_dict.get('注册资本','-')
				zggys.company_type=company_info_dict.get('企业类型','-')
				zggys.main_product=company_info_dict.get('主营产品','-')
				zggys.address=company_info_dict.get('公司地址','-')
				#print(business_mode,register_money,company_type,main_product,address)
				zggys.person_name=zggys.data_search('find','.personal_top .t span').__next__()
				phone_list=zggys.data_search('find','.personal_bottom span')
				#print(phone_list)
				cell_phone_list=[]
				phone_code_list=[]
				for phone in phone_list:
					if not phone:
						# Empty span -> the number is hidden: click the reveal
						# button via Selenium and parse the contact panel instead.
						js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
						zggys.selenium_js(url2,js)
						zggys.cell_phone=zggys.selenium_search('css_selector','.inactive_top .number').__next__()
						phone_info_dict={i.split('\n')[0]:i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector','.inactive_right .txt p')}
						zggys.phone_code=phone_info_dict.get('电话','-')
						zggys.fax=phone_info_dict.get('传真','-')
						zggys.qq=phone_info_dict.get('Q  Q','-')
					else:
						# Numbers starting with '1' are presumably mobiles
						# (Chinese numbering plan); everything else is treated
						# as a landline.
						if not phone.startswith('1'):
							phone_code_list.append(phone)
						else:
							cell_phone_list.append(phone)
				if cell_phone_list or phone_code_list:
					# Visible numbers override whatever the Selenium branch set.
					zggys.phone_code='/'.join(phone_code_list) if phone_code_list else '-'
					zggys.cell_phone='/'.join(cell_phone_list) if cell_phone_list else '-'
					zggys.fax='-'
					zggys.qq='-'
				zggys.data_save()
				print(f'中国供应商——{zggys.company_name}信息导入完成')
			page+=1
	zggys.spider_end()
Esempio n. 27
0
def xarcw_spider():
    """Crawl job-posting contact info from goodjobs.cn (新安人才网).

    Logs in, searches each keyword across city codes 1043-1060 and up to
    60 result pages each, then scrapes company name, contact person and
    a phone number (OCR'd from an image via tesseract) from every
    posting.  Records go through SuperSpider.data_save(); returns None.
    """
    word_list = ['网络']
    xarcw = SuperSpider(host='192.168.0.172', default_field='-')
    xarcw.source_name = '新安人才网'
    data = {'memberName': '13155291086', 'password': '******'}
    xarcw.post_request('https://login.goodjobs.cn/index.php/action/UserLogin',
                       data=data)
    for word in word_list:
        for city_code in range(1043, 1061):
            for page in range(1, 61):
                print(f'{word}-{city_code}-第{page}页')
                try:
                    url_list = xarcw.data_search(
                        f'https://search.goodjobs.cn/index.php?keyword={word}&boxwp=c{city_code}&page={page}',
                        '//div[@class="dw_table"]//span[@class="e1"]/a/@href')
                except Exception:
                    print(f'{word}-{city_code}-第{page}页获取失败')
                    continue
                if not url_list:
                    print(f'{word}-{city_code}-第{page}页-爬取结束')
                    break
                for url in url_list:
                    xarcw.source_page = url
                    time.sleep(1)  # throttle per-posting requests
                    data_list = xarcw.data_search(url, [
                        '//p[@class="cname"]/a/text()',
                        '//p[@class="msg ltype"]/text()',
                        '//div[@class="w706 clearfix"]/text()',
                        '//div[@class="w706 clearfix"]/img/@src',
                        '//div[@class="comadress clearfix"]/text()'
                    ])
                    used_fallback = False
                    if not data_list[0]:
                        # Alternate page layout.  In the original code this
                        # branch was unreachable: the posting was skipped by
                        # the `not data_list[0]` guard before reaching it.
                        used_fallback = True
                        data_list = xarcw.data_search(url, [
                            '//div[@class="w240 whitespace pb16"]//a[@class="org"]/text()',
                            '//div[@class="w240 whitespace pb16"]//p[@class="grey lh28"]/span[@class="black"]/text()',
                            '//p[@class="duol mt20"]/text()',
                            '//p[@class="duol mt20"]/img/@src',
                            '//div[@class="comadress clearfix"]/text()'
                        ])
                    if not data_list[0] or not data_list[3]:
                        # No company name in either layout, or no phone image
                        # to OCR: nothing to save.
                        continue
                    if used_fallback:
                        xarcw.company_type = data_list[1][0]
                        xarcw.main_product = data_list[1][2]
                    else:
                        # First info line is "type | staff-count | product"
                        # separated by '|'.
                        company_info_list = [
                            i.strip('\xa0\xa0\n ')
                            for i in data_list[1][0].split('|')
                        ]
                        xarcw.company_type = company_info_list[0]
                        for j in company_info_list[1:]:
                            if '-' in j:
                                xarcw.staff_number = j  # e.g. "50-100人"
                            else:
                                xarcw.main_product = j
                    xarcw.company_name = data_list[0][0]
                    xarcw.person_name = [i for i in data_list[2]
                                         if i.strip()][0]
                    try:
                        # The phone number is published as an image; OCR it.
                        xarcw.phone_number = xarcw.use_tesseract(
                            url=data_list[3][0], lang=None)
                    except Exception:
                        continue
                    xarcw.address = data_list[4][0].strip('工作地点:\u3000\n ')
                    xarcw.data_save()
                    print(
                        f'{xarcw.company_name}-{xarcw.person_name}-{xarcw.phone_number}-导入完成'
                    )
    # Release the spider's resources — the original omitted this call,
    # which every sibling spider in this file makes.
    xarcw.spider_end()
Esempio n. 28
0
def zgcpw_spider():
    """Crawl company and contact info from pe168.com (中国产品网).

    Walks every industry category and its listing pages, then each
    company's profile ("公司介绍") and contact ("联系方式") pages.  Each
    company is saved twice via data_save(): once with the landline as
    phone_number and once with the mobile.  Returns None.
    """

    def _field(info, label):
        # Value that follows `label` in the flat [label, value, ...] cell
        # list, or '-' when the label is absent.
        return info[info.index(label) + 1] if label in info else '-'

    zgcpw = SuperSpider()
    # Short memory of recently processed names so duplicates that repeat
    # across consecutive listing pages are skipped.
    company_list = deque([], maxlen=35)
    zgcpw.source_name = '中国产品网'
    zgcpw.get_request('http://www.pe168.com/')
    url_list1 = zgcpw.data_search('find', 'td div:nth-child(2) a', 'href')
    profession_list = zgcpw.data_search('find', 'td div:nth-child(2) a')
    for profession, url1 in zip(profession_list, url_list1):
        try:
            zgcpw.get_request(url1)
            # The pager prints the total like ".../N页"; extract N.
            page_all = next(zgcpw.data_search('find', '.pages cite'))
            page_all_number = next(zgcpw.re_find(r'/(\d+)页',
                                                 page_all)).group(1)
        except Exception:
            continue
        for page in range(1, int(page_all_number) + 1):
            print(f'{profession}——第{page}页')
            url2 = url1.replace('.html', f'-{page}.html')
            try:
                zgcpw.get_request(url2)
            except Exception:
                continue
            url_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a', 'href')
            company_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a')
            for company_name, url3 in zip(company_list3, url_list3):
                if company_name in company_list:
                    print('信息重复')
                    continue
                company_list.append(company_name)
                zgcpw.company_name = company_name
                try:
                    zgcpw.get_request(url3)
                except Exception:
                    continue
                zgcpw.source_page = url3
                try:
                    company_info_url = next(
                        zgcpw.data_search('find', 'a[title="公司介绍"]',
                                          'href'))
                except Exception:
                    company_list.append(company_name)
                    continue
                try:
                    zgcpw.get_request(company_info_url)
                except Exception:
                    continue
                company_info_list = list(
                    zgcpw.data_search('find',
                                      '.main_body:nth-last-child(1) td'))
                zgcpw.company_type = _field(company_info_list, '公司类型:')
                zgcpw.staff_number = _field(company_info_list, '公司规模:')
                zgcpw.register_money = _field(company_info_list, '注册资本:')
                zgcpw.business_mode = _field(company_info_list, '经营模式:')
                zgcpw.main_product = _field(company_info_list, '经营范围:')
                try:
                    phone_info_url = next(
                        zgcpw.data_search('find', 'a[title="联系方式"]',
                                          'href'))
                except Exception:
                    company_list.append(company_name)
                    continue
                try:
                    zgcpw.get_request(phone_info_url)
                except Exception:
                    continue
                phone_info_list = list(
                    zgcpw.data_search('find', '.px13.lh18 td'))
                zgcpw.address = _field(phone_info_list, '公司地址:')
                zgcpw.fax = _field(phone_info_list, '公司传真:')
                zgcpw.website = _field(phone_info_list, '公司网址:')
                zgcpw.person_name = _field(phone_info_list, '联 系 人:')
                # One row with the landline, then a second with the mobile.
                zgcpw.phone_number = _field(phone_info_list, '公司电话:')
                zgcpw.data_save()
                zgcpw.phone_number = _field(phone_info_list, '手机号码:')
                zgcpw.data_save()
                print(f'{profession}——第{page}页——{company_name}导入完成')
    zgcpw.spider_end()
Esempio n. 29
0
def business_detail_spider():
    """Crawl the daily per-desk trading breakdown ("每日成交明细") from eastmoney.com.

    Reads the active-statistics JSON feed for today (``spider_date``) and,
    for every stock listed, scrapes its detail page, saving one row per
    (stock, trading department) via data_save().  When today's rows for a
    stock already exist they are deleted first and re-crawled.  Returns
    None; side effects are DB inserts/deletes.
    """
    seen_stocks = []  # stock names already handled today (the feed repeats them)
    business_detail = SuperSpider(host='47.102.40.81',
                                  passwd='Abc12345',
                                  db='bryframe',
                                  table_name='business_detail',
                                  field_list=('spider_date', 'up_date', 'code',
                                              'name', 'department_name',
                                              'amount'))
    business_detail.up_date = business_detail.spider_date
    page = 1
    while True:
        try:
            json_data = business_detail.get_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={business_detail.spider_date},endDate={business_detail.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25861061',
                'GB2312')
            data_list = business_detail.json_to_py(json_data,
                                                   deal=True)['data']
        except Exception:
            print(f'第{page}页获取失败')
            page += 1
            continue
        # Empty page means we are done; page 500 is a hard safety cap.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            if not data['SName']:
                continue
            stock_data_list = business_detail.json_to_py(data['SName'])
            for stock_data in stock_data_list:
                if stock_data['CodeName'] in seen_stocks:
                    continue
                seen_stocks.append(stock_data['CodeName'])
                business_detail.name = stock_data['CodeName']
                business_detail.code = stock_data['SCode']
                sql = f'select code from business_detail where code="{business_detail.code}" and spider_date="{business_detail.spider_date}"'
                if business_detail.sql_search(sql):
                    # Already stored today: wipe the old rows and re-crawl.
                    business_detail.sql_search(
                        f'delete from business_detail where code="{business_detail.code}" and spider_date="{business_detail.spider_date}"'
                    )
                    print(
                        f'重新爬取-{business_detail.spider_date}-{business_detail.code}-{business_detail.name}'
                    )
                try:
                    # The detail URL wants only the numeric part of the code.
                    url_code = next(
                        business_detail.re_find(
                            r'\d+', business_detail.code)).group()
                except Exception:
                    continue
                url = f'http://data.eastmoney.com/stock/lhb,{business_detail.spider_date},{url_code}.html'
                try:
                    detail_data_list = [
                        i for i in business_detail.data_search(
                            url, '//div[@class="content-sepe"]//td//text()',
                            'gb2312') if i.strip() and '\r' not in i
                    ]
                    # Drop every header cell "(买入前5名与卖出前5名)" plus the 5
                    # cells after it.  (The original capped removal at 6
                    # occurrences, leaving misaligned rows beyond that.)
                    while '(买入前5名与卖出前5名)' in detail_data_list:
                        error_index = detail_data_list.index(
                            '(买入前5名与卖出前5名)')
                        del detail_data_list[error_index:error_index + 6]
                except Exception:
                    print(
                        f'{business_detail.code}-{business_detail.name}-获取失败')
                    continue
                department_list = []
                # Rows are 8 cells wide: desk name at offset 1, amount at 7.
                for i, j in zip(range(1, 1000, 8), range(7, 1000, 8)):
                    try:
                        business_detail.department_name = detail_data_list[i]
                        if business_detail.department_name in department_list:
                            print(
                                f'{business_detail.name}-{business_detail.department_name}-信息重复'
                            )
                            continue
                        department_list.append(
                            business_detail.department_name)
                        business_detail.amount = detail_data_list[j]
                    except IndexError:
                        break  # ran past the last row
                    business_detail.data_save()
                    print(
                        f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成'
                    )
        page += 1
    business_detail.spider_end()