Code Example #1
def ip_check_run():
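    # del_same() presumably removes duplicate rows from ip_pool; every stored
    # proxy is then validated asynchronously by ip_check_task.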
    del_same()
    ip = SuperSpider(host='192.168.0.172')
    proxies_list = ip.sql_search('select ip from ip_pool')
    loop = asyncio.get_event_loop()
    loop.run_until_complete(ip_check_task(proxies_list, ip))
    ip.spider_end()
Code Example #2
def get_phone_number():
    wx = SuperSpider()
    html = wx.get_html(
        'http://192.168.30.200/api/check_wx/get_mobile.html?max_id=6000000&num=5000'
    )
    data_list = wx.json_to_py(html)
    print(data_list)
Code Example #3
def ip_spider3_run():
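    # Build browser-like headers (including a hard-coded session cookie) for
    # ip.zdaye.com and run ip_spider3_task, which presumably scrapes new proxies into ip_pool.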
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Cookie':
        'acw_tc=792b121315526282356885461e0c5a3c57c2f3644ca7854ccdbc9c502d1ef5; ASPSESSIONIDSADQRDTR=ANOPOIJBEFGBOCBAJPPMKEDH; __51cke__=; Hm_lvt_8fd158bb3e69c43ab5dd05882cf0b234=1552552479,1552628110; __tins__16949115=%7B%22sid%22%3A%201552628109404%2C%20%22vd%22%3A%2018%2C%20%22expires%22%3A%201552630850318%7D; __51laig__=18; Hm_lpvt_8fd158bb3e69c43ab5dd05882cf0b234=1552629050',
        'Host':
        'ip.zdaye.com',
        'Referer':
        'http://ip.zdaye.com/dayProxy/3.html',
        'Upgrade-Insecure-Requests':
        '1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ],
                     headers=headers)
    proxies_list = ip.sql_search('select ip from ip_pool')
    loop = asyncio.get_event_loop()
    loop.run_until_complete(ip_spider3_task(ip, proxies_list))
    ip.spider_end()
Code Example #4
File: stw_spider.py Project: cwy1019120542/MySpiders
def stw_spider():
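	# Collect the category links on cn.made-in-china.com and print the hidden
	# 'code' form value found on each category page.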
	stw=SuperSpider()
	stw_list=deque([],maxlen=30)
	stw.source_name='商泰网'
	url_list1=stw.data_search(r'https://cn.made-in-china.com/','//div[@class="sub-cata"]//dd[@class="sub-cata-item-bd"]//a/@href')
	# print(url_list1)
	for url1 in url_list1:
		url1='https://cn.made-in-china.com'+url1
		code=stw.data_search(url1,'//input[@name="code"]/@value')
		print(code)
Code Example #5
def ip_spider6_run():
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=[
                         'spider_datetime', 'source_name', 'source_page', 'ip',
                         'address'
                     ])
    proxies_list = ip.sql_search('select ip from ip_pool')
    loop = asyncio.get_event_loop()
    loop.run_until_complete(ip_spider6_task(ip, proxies_list))
    ip.spider_end()
Code Example #6
File: lhb_spider.py Project: cwy1019120542/MySpiders
def run_all_spider():
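    # Abbreviated run: check on eastmoney.com whether today's data has been
    # published, then run only profession_report_spider() (the other sub-spiders are commented out).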
    run_all = SuperSpider(host='47.102.40.81',
                          passwd='Abc12345',
                          db='bryframe')
    while True:
        now = datetime.now()
        aim_date = f"{now.strftime('%m')}月{now.strftime('%d')}日"
        run_all.get_request('http://data.eastmoney.com/stock/tradedetail.html')
        data_date = run_all.data_search(
            'find', '.cate_type_ul.cate_type_date .at').__next__()
        if aim_date != data_date:
            print('————今日数据未更新————')
            return
        else:
            break
    # lhb_rank_spider()
    # institution_business_spider()
    # stock_count_spider()
    # department_track_spider()
    # active_department_spider()
    # business_detail_spider()
    # department_count_spider()
    # stock_info_spider()
    # stock_report_spider()
    profession_report_spider()
    run_all.spider_end()
Code Example #7
def run_all_spider():
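    # Full run: same freshness check via XPath, then execute all twelve
    # sub-spiders three times with a 30-minute pause between rounds.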
    run_all = SuperSpider(host='47.102.40.81',
                          passwd='Abc12345',
                          db='bryframe')
    while True:
        now = datetime.now()
        aim_date = f"{now.strftime('%m')}月{now.strftime('%d')}日"
        data_date = run_all.data_search(
            'http://data.eastmoney.com/stock/tradedetail.html',
            '//ul[contains(@class,"cate_type_date")]/li[@class="at"]/text()',
            'gb2312')[0]
        print(data_date)
        if aim_date != data_date:
            print('————今日数据未更新————')
            run_all.spider_end()
            return
        else:
            break
    for i in range(3):
        lhb_rank_spider()
        institution_business_spider()
        stock_count_spider()
        department_track_spider()
        active_department_spider()
        business_detail_spider()
        department_count_spider()
        stock_info_spider()
        stock_report_spider()
        profession_report_spider()
        bonus_data_spider()
        stock_data_spider()
        time.sleep(1800)
    run_all.spider_end()
Code Example #8
def xarcw_spider():
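	# Test-data generator rather than a crawler: attaches 2-5 random existing
	# tags to every post and saves the post_id/tag_id pairs (the Faker instance and word_list are unused leftovers).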
	f=Faker(locale='zh_CN')
	word_list=['python','web','数据库','运维']
	xarcw=SuperSpider(db='supery',table_name='post_tag',default_field='null',field_list=('post_id','tag_id'))
	# data={
	# 'memberName': '13155291086',
	# 'password': '******'}
	# xarcw.post_request('https://login.goodjobs.cn/index.php/action/UserLogin',data=data)
	post_list=xarcw.sql_search('select id from post')
	tag_list=xarcw.sql_search('select id from tag')
	number_list=(2,3,4,5)
	for post in post_list:
		number=random.choice(number_list)
		aim_tag=random.sample(tag_list,number)
		for tag in aim_tag:
			xarcw.post_id=post[0]
			xarcw.tag_id=tag[0]
			xarcw.data_save()
			print(f'{xarcw.post_id}-{xarcw.tag_id}-导入完成')
Code Example #9
File: lhb_spider.py Project: cwy1019120542/MySpiders
def stock_count_spider():
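    # Page through eastmoney's per-stock 龙虎榜 statistics for the last 30 days
    # (50 rows per page, hard stop at page 500) and save one row per stock.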
    stock_count = SuperSpider(host='47.102.40.81',
                              passwd='Abc12345',
                              db='bryframe',
                              table_name='stock_count',
                              field_list=('spider_date', 'up_date', 'code',
                                          'name', 'list_time', 'buy_sum',
                                          'sell_sum', 'buy_amount'))
    month_ago = stock_count.date_ago(30)
    page = 1
    while True:
        try:
            json_data = stock_count.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/StockStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={stock_count.spider_date},gpfw=0,js=var%20data_tab_3.html?rt=25754758',
                'GB2312')
            data_list = stock_count.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            stock_count.up_date = data['Tdate']
            stock_count.code = data['SCode']
            stock_count.name = data['SName']
            stock_count.list_time = stock_count.to_null(data['SumCount'])
            stock_count.buy_sum = stock_count.to_null(data['Bmoney'])
            stock_count.sell_sum = stock_count.to_null(data['Smoney'])
            stock_count.buy_amount = stock_count.to_null(data['JmMoney'])
            stock_count.data_save()
            print(
                f'个股龙虎榜统计:{stock_count.up_date}-{stock_count.code}-{stock_count.name}-导入完成'
            )
        page += 1
    stock_count.spider_end()
    print('end:个股龙虎榜统计')
Code Example #10
File: ht_spider.py Project: cwy1019120542/MySpiders
def ht_spider(start_time='2019-01-02 00:00:00',end_time='2019-04-02 00:00:00'):
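	# Selenium workflow against the EC2 call-record console: log in, set the
	# date range, then save 16 fields per call record from every result page
	# (columns 9 and 10 of the 18-column table are dropped).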
	ht=SuperSpider(use_selenium=True,default_field='null',field_list=('start_time','call_duration','connect_duration','talk_duration','ring_duration','call_direction','connect_status','sound_file','customer_id','caller','called','caller_department','caller_number','caller_user_name','project_name','call_type'),table_name='ht_data')
	ht.selenium_get(r'http://210.13.87.106:8088/ec2')
	ht.selenium_click('//td[@tabindex="-1"]//div[@class="v-captiontext"]',3)
	ht.selenium_input('//input[@class="v-textfield"]','mgrdefault8',index=3)
	ht.selenium_input('//input[@class="v-textfield"]','fuyan2018',index=-1)
	ht.selenium_click('//span[@class="v-button-caption"]',3,index=1)
	ht.selenium_click('//span[@class="v-nativebutton-caption"]',3,index=2)
	ht.selenium_input('//input[@class="v-textfield v-datefield-textfield"]',start_time,index=0)
	ht.selenium_input('//input[@class="v-textfield v-datefield-textfield"]',end_time,index=0)
	ht.selenium_click('//div[@class="v-filterselect-button"]',index=2)
	ht.selenium_click('//td[@class="gwt-MenuItem"]/span',index=0)
	ht.selenium_click('//div[@class="v-button v-button-default default"]//span[@class="v-button-caption"]',3,index=0)
	page_all=ht.selenium_search('//*[@id="ec2-100180"]/div/div[2]/div/div[2]/div/div/div/div[1]/div/div/div/div[1]/div/div[2]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[7]/div/div')[0]
	page_number=ht.re_find(r'/(\d+)页',page_all).__next__().group(1)
	for page in range(1,int(page_number)):
		html=ht.page_source()
		data_list=ht.data_search(html=html,xpath='//td[@class="v-table-cell-content"]//text()')
		for i,index1,index2 in zip(range(1,1000),range(0,1000,18),range(18,1000,18)):
			split_list=data_list[index1:index2]
			if split_list:
				split_list.pop(8)
				split_list.pop(8)
				for field,data in zip(ht.field_list,split_list):
					exec(f'ht.{field}=data')
				ht.data_save()
				print(f'第{page}页——第{i}条数据——导入完成')
			else:
				break
		ht.selenium_click('//*[@id="ec2-100180"]/div/div[2]/div/div[2]/div/div/div/div[1]/div/div/div/div[1]/div/div[2]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[3]/div/div/span/span',3)
	ht.spider_end()
Code Example #11
File: lhb_spider.py Project: cwy1019120542/MySpiders
def lhb_rank_spider():
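    # Save the current day's 龙虎榜 ranking (close price, change, net buy amount,
    # turnover and float market value), 200 rows per page with a hard stop at page 500.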
    lhb_rank = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='lhb_rank',
        field_list=('spider_date', 'up_date', 'code', 'name', 'close_price',
                    'up_down', 'buy_amount', 'change_rate', 'currency_market'))
    page = 1
    while True:
        try:
            json_data = lhb_rank.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/TradeDetail/pagesize=200,page={page},sortRule=-1,sortType=,startDate={lhb_rank.spider_date},endDate={lhb_rank.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754497',
                'GB2312')
            data_list = lhb_rank.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            lhb_rank.up_date = lhb_rank.spider_date
            lhb_rank.code = data['SCode']
            lhb_rank.name = data['SName']
            lhb_rank.close_price = lhb_rank.to_null(data['ClosePrice'])
            lhb_rank.up_down = lhb_rank.to_null(data['Chgradio'])
            lhb_rank.buy_amount = lhb_rank.to_null(data['JmMoney'])
            lhb_rank.change_rate = lhb_rank.to_null(data['Dchratio'])
            lhb_rank.currency_market = lhb_rank.to_null(data['Ltsz'])
            lhb_rank.data_save()
            print(
                f'当日龙虎榜涨跌幅排名:{lhb_rank.up_date}-{lhb_rank.code}-{lhb_rank.name}-导入完成'
            )
        page += 1
    lhb_rank.spider_end()
    print('end:龙虎榜当日跌幅排名')
Code Example #12
File: lhb_spider.py Project: cwy1019120542/MySpiders
def profession_report_spider():
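    # Incremental crawl of industry research reports: stops as soon as a report
    # is no newer than the latest up_date already stored, and scrapes the full report text from each detail page.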
    profession_report = SuperSpider(
        table_name='profession_report',
        field_list=('name', 'spider_date', 'up_date', 'up_down', 'report',
                    'grade', 'grade_change', 'institution'))
    sql1 = 'select MAX(up_date) from profession_report'
    latest_time = profession_report.sql_search(sql1)[0][0]
    if not latest_time:
        latest_datetime = datetime.now() - timedelta(days=1)
    else:
        latest_datetime = datetime(latest_time.year, latest_time.month,
                                   latest_time.day)
    is_end = False
    for page in range(1, 1337):
        url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HYSR&mkt=0&stat=0&cmd=4&code=&sc=&ps=50&p=' + str(
            page
        ) + '&js=var%20vMcgaFDg={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&rt=51553086'
        try:
            json_data = profession_report.use_requests_to_html(url, 'utf8')
            data_list = profession_report.json_to_py(json_data,
                                                     deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        for data in data_list:
            data = data.split(',')
            time1 = data[1].split(' ')[0].replace('/', '-')
            datetime1 = datetime.strptime(time1, '%Y-%m-%d')
            if datetime1 <= latest_datetime:
                print('暂无数据更新')
                is_end = True
                break
            infocode = data[2]
            time2 = time1.replace('-', '')
            try:
                profession_report.get_request(
                    f'http://data.eastmoney.com/report/{time2}/{infocode}.html'
                )
            except:
                continue
            report = ''
            for par in profession_report.data_search('find', '.newsContent p'):
                report = report + par
            profession_report.name = data[10]
            profession_report.up_date = time1
            profession_report.up_down = profession_report.to_null(data[11])
            profession_report.report = report
            profession_report.grade = data[7]
            profession_report.grade_change = data[0]
            profession_report.institution = data[4]
            profession_report.data_save()
            print(
                f'行业研报:{profession_report.up_date}-{profession_report.name}-{profession_report.institution}-导入完成'
            )
        if is_end == True:
            break
    profession_report.spider_end()
    print('end:行业研报')
Code Example #13
File: lhb_spider.py Project: cwy1019120542/MySpiders
def institution_business_spider():
    institution_business = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='institution_business',
        field_list=('spider_date', 'up_date', 'code', 'name', 'buy_number',
                    'sell_number', 'buy_sum', 'sell_sum', 'buy_amount'))
    page = 1
    while True:
        try:
            json_data = institution_business.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/DailyStockListStatistics/pagesize=50,page={page},sortRule=-1,sortType=PBuy,startDate={institution_business.spider_date},endDate={institution_business.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754580',
                'GB2312')
            data_list = institution_business.json_to_py(json_data,
                                                        deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            institution_business.up_date = institution_business.spider_date
            institution_business.code = data['SCode']
            institution_business.name = data['SName']
            institution_business.buy_number = institution_business.to_null(
                data['BSL'])
            institution_business.sell_number = institution_business.to_null(
                data['SSL'])
            institution_business.buy_sum = institution_business.to_null(
                data['BMoney'])
            institution_business.sell_sum = institution_business.to_null(
                data['SMoney'])
            institution_business.buy_amount = institution_business.to_null(
                data['PBuy'])
            institution_business.data_save()
            print(
                f'机构买卖情况:{institution_business.up_date}-{institution_business.code}-{institution_business.name}-导入完成'
            )
        page += 1
    institution_business.spider_end()
    print('end:机构买卖情况')
Code Example #14
def profession_report_spider():
    profession_report_list = []
    profession_report = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='profession_report',
        field_list=('name', 'spider_date', 'up_date', 'up_down', 'report',
                    'grade', 'grade_change', 'institution'))
    sql1 = 'select MAX(up_date) from profession_report'
    latest_time = profession_report.sql_search(sql1)[0][0]
    if not latest_time:
        latest_datetime = datetime.now() - timedelta(days=1)
    else:
        latest_datetime = datetime(latest_time.year, latest_time.month,
                                   latest_time.day)
    is_end = False
    for page in range(1, 1337):
        url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HYSR&mkt=0&stat=0&cmd=4&code=&sc=&ps=50&p=' + str(
            page
        ) + '&js=var%20vMcgaFDg={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&rt=51553086'
        try:
            json_data = profession_report.get_html(url)
            data_list = profession_report.json_to_py(json_data,
                                                     deal=True)['data']
        except Exception as error:
            print(f'第{page}页获取失败')
            print(error)
            page += 1
            continue
        for data in data_list:
            data = data.split(',')
            time1 = data[1].split(' ')[0].replace('/', '-')
            profession_report.name = data[10]
            profession_report.up_date = time1
            datetime1 = datetime.strptime(time1, '%Y-%m-%d')
            if datetime1 <= latest_datetime:
                print('暂无数据更新')
                is_end = True
                break
            infocode = data[2]
            time2 = time1.replace('-', '')
            profession_report.up_down = profession_report.to_null(data[11])
            try:
                profession_report.report = (''.join(
                    profession_report.data_search(
                        f'http://data.eastmoney.com/report/{time2}/{infocode}.html',
                        '//div[@class="newsContent"]/text()',
                        'gb2312'))).strip()
            except:
                pass
            sql = f'select name from profession_report where name="{profession_report.name}" and spider_date="{profession_report.spider_date}" and up_date="{profession_report.up_date}" and report="{profession_report.report}"'
            same_data = profession_report.sql_search(sql)
            profession_report.grade = data[7]
            profession_report.grade_change = data[0]
            profession_report.institution = data[4]
            profession_report.data_save()
            print(
                f'行业研报:{profession_report.up_date}-{profession_report.name}-{profession_report.institution}-导入完成'
            )
        if is_end == True:
            break
    profession_report.spider_end()
    print('end:行业研报')
Code Example #15
File: lhb_spider.py Project: cwy1019120542/MySpiders
def stock_report_spider():
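    # Incremental crawl of per-stock research reports, saving the report text
    # plus two years of forecast figures taken from data['sys'] and data['syls'].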
    stock_report = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='stock_report',
        field_list=('code', 'name', 'spider_date', 'up_date', 'report',
                    'grade', 'grade_change', 'institution', 'income_2018',
                    'rate_2018', 'income_2019', 'rate_2019'))
    sql1 = 'select MAX(up_date) from stock_report'
    latest_time = stock_report.sql_search(sql1)[0][0]
    if not latest_time:
        latest_datetime = datetime.now() - timedelta(days=1)
    else:
        latest_datetime = datetime(latest_time.year, latest_time.month,
                                   latest_time.day)
    is_end = False
    for page in range(1, 254):
        url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20MILbIdwm={"data":[(x)],"pages":"(pc)","update":"(ud)","count":"(count)"}&ps=50&p=' + str(
            page) + '&mkt=0&stat=0&cmd=2&code=&rt=51552935'
        try:
            json_data = stock_report.use_requests_to_html(url, 'utf8')
            data_list = stock_report.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        for data in data_list:
            time1 = data['datetime'][:10]
            datetime1 = datetime.strptime(time1, '%Y-%m-%d')
            if datetime1 <= latest_datetime:
                print('暂无数据更新')
                is_end = True
                break
            infocode = data['infoCode']
            time2 = time1.replace('-', '')
            try:
                stock_report.get_request(
                    f'http://data.eastmoney.com/report/{time2}/{infocode}.html'
                )
            except:
                continue
            report = ''
            for par in stock_report.data_search('find',
                                                '#ContentBody .newsContent p'):
                report = report + par
            stock_report.code = data['secuFullCode']
            stock_report.name = data['secuName']
            stock_report.up_date = stock_report.spider_date
            stock_report.report = report
            stock_report.grade = data['rate']
            stock_report.grade_change = data['change']
            stock_report.institution = data['insName']
            stock_report.income_2018 = stock_report.to_null(data['sys'][0])
            stock_report.rate_2018 = stock_report.to_null(data['syls'][0])
            stock_report.income_2019 = stock_report.to_null(data['sys'][1])
            stock_report.rate_2019 = stock_report.to_null(data['syls'][1])
            stock_report.data_save()
            print(
                f'个股研报:{stock_report.spider_date}-{stock_report.code}-{stock_report.name}-导入完成'
            )
        if is_end == True:
            break
    stock_report.spider_end()
    print('end:个股研报')
Code Example #16
def zggys_spider():	
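	# Selenium-assisted crawl of cn.china.cn suppliers: parse company and
	# contact details from each listing, executing JavaScript to reveal the phone number when it is hidden.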
	zggys=SuperSpider(use_selenium=True)
	zggys.source='中国供应商'
	zggys.website='-'
	zggys.get_request('https://cn.china.cn/')
	url_list1=(i+'?p={}' for i in zggys.data_search('xpath','//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a','href'))	
	for url1 in url_list1:
		page=10
		while True:
			print(f'第{page}页')
			try:
				zggys.get_request(url1.format(page))
			except:
				print(f'获取第{page}页失败')
				page+=1
				continue
			url_list2=zggys.data_search('find','h3.title a','href')
			if not url_list2:
				break
			for url2 in url_list2:
				try:
					zggys.get_request(url2)
					zggys.company_name=zggys.data_search('find','.column_xx p a','title').__next__()
				except:
					continue
				company_info_list=(i for i in zggys.data_search('find','.business_xx').__next__().split('\n') if '|' in i)
				company_info_dict={i.split('|')[0]:i.split('|')[1] for i in company_info_list}
				zggys.business_mode=company_info_dict.get('经营模式','-') 
				zggys.register_money=company_info_dict.get('注册资本','-') 
				zggys.company_type=company_info_dict.get('企业类型','-') 
				zggys.main_product=company_info_dict.get('主营产品','-') 
				zggys.address=company_info_dict.get('公司地址','-') 
				#print(business_mode,register_money,company_type,main_product,address)
				zggys.person_name=zggys.data_search('find','.personal_top .t span').__next__()
				phone_list=zggys.data_search('find','.personal_bottom span')
				#print(phone_list)
				cell_phone_list=[]
				phone_code_list=[]
				for phone in phone_list:
					if not phone:
						js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
						zggys.selenium_js(url2,js)
						zggys.cell_phone=zggys.selenium_search('css_selector','.inactive_top .number').__next__()
						phone_info_dict={i.split('\n')[0]:i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector','.inactive_right .txt p')}	
						zggys.phone_code=phone_info_dict.get('电话','-')
						zggys.fax=phone_info_dict.get('传真','-')
						zggys.qq=phone_info_dict.get('Q  Q','-')
					else:
						if not phone.startswith('1'):
							phone_code_list.append(phone)
						else:
							cell_phone_list.append(phone)
				if cell_phone_list or phone_code_list:
					zggys.phone_code='/'.join(phone_code_list) if phone_code_list else '-'
					zggys.cell_phone='/'.join(cell_phone_list) if cell_phone_list else '-'
					zggys.fax='-'
					zggys.qq='-'
				zggys.data_save()
				print(f'中国供应商——{zggys.company_name}信息导入完成')
			page+=1
	zggys.spider_end()
Code Example #17
File: lhb_spider.py Project: cwy1019120542/MySpiders
def department_count_spider():
    department_count = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='department_count',
        field_list=('spider_date', 'up_date', 'name', 'list_time', 'buy_time',
                    'buy_sum', 'sell_time', 'sell_sum'))
    month_ago = department_count.date_ago(30)
    page = 1
    while True:
        try:
            json_data = department_count.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/TraderStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={department_count.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754789',
                'GB2312')
            data_list = department_count.json_to_py(json_data,
                                                    deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            department_count.up_date = department_count.spider_date
            department_count.name = data['SalesName']
            department_count.list_time = department_count.to_null(
                data['UpCount'])
            department_count.buy_time = department_count.to_null(
                data['BCount'])
            department_count.buy_sum = department_count.to_null(
                data['SumActBMoney'])
            department_count.sell_time = department_count.to_null(
                data['SCount'])
            department_count.sell_sum = department_count.to_null(
                data['SumActSMoney'])
            department_count.data_save()
            print(
                f'证券营业部上榜统计:{department_count.up_date}-{department_count.name}-导入完成'
            )
        page += 1
    department_count.spider_end()
    print('end:证券营业部上榜统计')
Code Example #18
def zjmyqyw():
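	# Crawl zj123.com: for every member company, read the VIPContact page for
	# contact details and the VIPCompany page for business details, then save one row per company.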
	zjmyqyw=SuperSpider()
	zjmyqyw.source='浙江名营企业网'
	zjmyqyw.fax='-'
	zjmyqyw.get_request('http://www.zj123.com/')
	url_list1=('http://www.zj123.com/'+i.replace('1.','{}.') for i in zjmyqyw.data_search('find','.indsort dd a','href'))
	for url1 in url_list1:
		page=1
		while True:
			print(f'第{page}页')
			zjmyqyw.get_request(url1.format(page))
			page_judge=zjmyqyw.data_search('find','.sleft .m.m1 .fred').__next__().split()[0]
			if int(page_judge) != page:
				break
			print(page_judge)
			url_list2=('http://www.zj123.com/member/VIPContact/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			url_list3=('http://www.zj123.com/member/VIPCompany/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			#print(url_list2)
			for url2,url3 in zip(url_list2,url_list3):
				zjmyqyw.get_request(url2)
				contact_info_dict={i.split(':')[0].strip():i.split(':')[-1].strip().replace('\xa0','') for i in zjmyqyw.data_search('find','.rkbody table tr')}
				zjmyqyw.company_name=contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-'
				zjmyqyw.person_name=contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-'
				zjmyqyw.address=contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-'
				zjmyqyw.phone_code=contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-'
				zjmyqyw.cell_phone=contact_info_dict['手机'] if contact_info_dict['手机'] else '-'
				zjmyqyw.qq=contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-'
				zjmyqyw.website=contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-'
				zjmyqyw.get_request(url3)
				company_info_list=list(zjmyqyw.data_search('find','.rkbody table tr td'))
				company_info_dict={company_info_list[n].strip(': '):company_info_list[n+1].strip(': ') for n in range(0,24,2)}
				#print(company_info_dict)
				zjmyqyw.main_product=company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-'
				zjmyqyw.business_mode=company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-'
				zjmyqyw.company_type=company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-'
				zjmyqyw.register_money=company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-'
				zjmyqyw.data_save()
				print(f'浙江企业网——{zjmyqyw.company_name}信息导入完成')
			page+=1
	zjmyqyw.spider_end()
#zjmyqyw()

# test_obj=SuperSpider()
# js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
# test_obj.use_selenium()
# test_obj.selenium_js('https://www.china.cn/shukongjichuang/3746553522.html',js)
# test_obj.cell_phone=test_obj.selenium_search('css_selector','.inactive_top .number').__next__()
# print('aaaaaaa')
# print(test_obj.cell_phone)
Code Example #19
def zggys_spider():
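    # Proxy-rotating rewrite of the cn.china.cn crawler: every request is
    # retried up to 20 times with a random proxy from ip_pool, resuming from the '睡袋' category onwards.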
    zggys = SuperSpider(host='192.168.0.172', default_field='-')
    zggys.source_name = '中国供应商'
    proxies_list = zggys.sql_search('select ip from ip_pool')
    url_list1 = [
        i + '?p={}' for i in zggys.data_search(
            'https://cn.china.cn/',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/@href'
        )
    ]
    profession_list = zggys.data_search(
        'https://cn.china.cn/',
        '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/text()',
        'GBK')
    error_index = profession_list.index('睡袋')
    for url1, profession in zip(url_list1[error_index:],
                                profession_list[error_index:]):
        page = 1
        while True:
            time.sleep(2)
            print(f'{profession}——第{page}页')
            url_list2 = []
            for i in range(20):
                proxies = random.choice(proxies_list)[0]
                print(f'使用代理-{proxies}')
                key = 'http' if not proxies.startswith('https') else 'https'
                try:
                    url_list2 = zggys.data_search(
                        url1.format(page),
                        '//ul[@class="extension_ul"]//h3[@class="title"]/a/@href',
                        'GBK',
                        proxies={key: proxies},
                        timeout=5)
                except Exception as error:
                    print(error)
                    continue
                break  # stop retrying once the listing page has been fetched
            if not url_list2:
                print(f'{profession}——第{page}页——没有数据')
                break
            for url2 in url_list2:
                for i in range(20):
                    try:
                        time.sleep(2)
                        proxies = random.choice(proxies_list)[0]
                        print(f'使用代理-{proxies}')
                        key = 'http' if not proxies.startswith(
                            'https') else 'https'
                        html = zggys.get_html(url2,
                                              charset='GBK',
                                              proxies={key: proxies},
                                              timeout=5)
                        zggys.source_page = url2
                        if zggys.data_search(
                                html=html,
                                xpath='//div[@class="column_xx"]//p//a/text()'
                        ):
                            zggys.company_name = zggys.data_search(
                                html=html,
                                xpath='//div[@class="column_xx"]//p//a/text()'
                            )[0]
                        company_info_list = [
                            i for i in zggys.data_search(
                                html=html,
                                xpath='//ul[@class="business_xx"]//li//text()')
                            if i.strip('\r\n |')
                        ]
                        # print(company_info_list)
                    except Exception as error:
                        print(error)
                        continue
                    else:
                        try:
                            aim_index = company_info_list.index('经营模式')
                            zggys.business_mode = company_info_list[aim_index +
                                                                    1]
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('注册资本')
                            zggys.register_money = company_info_list[
                                aim_index + 1].strip()
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('企业类型')
                            zggys.company_type = company_info_list[aim_index +
                                                                   1]
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('主营产品')
                            zggys.main_product = company_info_list[aim_index +
                                                                   1]
                        except:
                            pass
                        try:
                            aim_index = company_info_list.index('公司地址')
                            zggys.address = company_info_list[aim_index + 1]
                        except:
                            pass
                        try:
                            zggys.person_name = zggys.data_search(
                                html=html,
                                xpath=
                                '//div[@class="personal_top"]//div[@class="t"]//span/text()'
                            )[0]
                        except:
                            pass
                        phone_list = zggys.data_search(
                            html=html,
                            xpath='//div[@class="personal_bottom"]//span/text()'
                        )
                        if not phone_list:
                            # js=['var btn=document.querySelector(".see_a.inactive_scode");btn.click();']
                            # try:
                            # 	zggys.selenium_open(url2)
                            # 	zggys.selenium_js(js,sleep_time=2)
                            # 	zggys.phone_number=zggys.selenium_search('css_selector','.inactive_top .number').__next__()
                            # 	phone_info_dict={i.split('\n')[0]:i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector','.inactive_right .txt p')}
                            # except:
                            # 	continue
                            # zggys.fax=phone_info_dict.get('传真','-').strip()
                            # zggys.qq=phone_info_dict.get('Q  Q','-').strip()
                            # zggys.data_save()
                            # zggys.phone_number=phone_info_dict.get('电话','-').strip()
                            # zggys.data_save()
                            break
                        for phone in phone_list:
                            zggys.phone_number = phone.strip()
                            zggys.data_save()
                        print(
                            f'{profession}—第{page}页—{zggys.company_name}信息导入完成'
                        )
                    break
            page += 1
    zggys.spider_end()
Code Example #20
def xarcw_spider():
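    # Log in to goodjobs.cn, walk the search results per keyword, city code and
    # page, and OCR the contact-phone image with Tesseract. Note that the second
    # 'if not data_list[0]' branch is unreachable: that condition already triggered 'continue' above.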
    word_list = ['网络']
    xarcw = SuperSpider(host='192.168.0.172', default_field='-')
    xarcw.source_name = '新安人才网'
    data = {'memberName': '13155291086', 'password': '******'}
    xarcw.post_request('https://login.goodjobs.cn/index.php/action/UserLogin',
                       data=data)
    for word in word_list:
        for city_code in range(1043, 1061):
            for page in range(1, 61):
                print(f'{word}-{city_code}-第{page}页')
                try:
                    url_list = xarcw.data_search(
                        f'https://search.goodjobs.cn/index.php?keyword={word}&boxwp=c{city_code}&page={page}',
                        '//div[@class="dw_table"]//span[@class="e1"]/a/@href')
                except:
                    print(f'{word}-{city_code}-第{page}页获取失败')
                    continue
                if not url_list:
                    print(f'{word}-{city_code}-第{page}页-爬取结束')
                    break
                for url in url_list:
                    # print(url)
                    xarcw.source_page = url
                    time.sleep(1)
                    data_list = xarcw.data_search(url, [
                        '//p[@class="cname"]/a/text()',
                        '//p[@class="msg ltype"]/text()',
                        '//div[@class="w706 clearfix"]/text()',
                        '//div[@class="w706 clearfix"]/img/@src',
                        '//div[@class="comadress clearfix"]/text()'
                    ])
                    if not data_list[0] or not data_list[3]:
                        continue
                    if not data_list[0]:
                        data_list = xarcw.data_search(url, [
                            '//div[@class="w240 whitespace pb16"]//a[@class="org"]/text()',
                            '//div[@class="w240 whitespace pb16"]//p[@class="grey lh28"]/span[@class="black"]/text()',
                            '//p[@class="duol mt20"]/text()',
                            '//p[@class="duol mt20"]/img/@src',
                            '//div[@class="comadress clearfix"]/text()'
                        ])
                        xarcw.company_type = data_list[1][0]
                        xarcw.main_product = data_list[1][2]
                    else:
                        company_info_list = [
                            i.strip('\xa0\xa0\n ')
                            for i in data_list[1][0].split('|')
                        ]
                        xarcw.company_type = company_info_list[0]
                        for j in company_info_list[1:]:
                            if '-' in j:
                                xarcw.staff_number = j
                            else:
                                xarcw.main_product = j
                    xarcw.company_name = data_list[0][0]
                    xarcw.person_name = [i for i in data_list[2]
                                         if i.strip()][0]
                    try:
                        xarcw.phone_number = xarcw.use_tesseract(
                            url=data_list[3][0], lang=None)
                    except:
                        continue
                    xarcw.address = data_list[4][0].strip('工作地点:\u3000\n ')
                    xarcw.data_save()
                    print(
                        f'{xarcw.company_name}-{xarcw.person_name}-{xarcw.phone_number}-导入完成'
                    )
Code Example #21
def stock_data_spider():
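    # Page through eastmoney's allotment (配股) feed; rows already saved today for
    # the same stock and date are deleted and re-imported, and the crawl stops
    # once a page returns the same leading rows as the previous one.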
    data_end = None
    stock_data = SuperSpider(host='139.224.115.44',
                             passwd='A9Vg+Dr*nP^fR=1V',
                             db='bryframe3',
                             table_name='stock_data',
                             field_list=('spider_date', 'up_date', 'code',
                                         'name', 'stock_rate', 'stock_price'))
    page = 1
    while True:
        print(f'第{page}页')
        url = 'http://datainterface.eastmoney.com/EM_DataCenter/JS.aspx?type=NS&sty=NSA&st=6&sr=-1&p=' + str(
            page) + '&ps=50&js=var%20inHqdtrZ={pages:(pc),data:[(x)]}&rt=5174'
        try:
            json_data = stock_data.get_html(url)
            data_list = stock_data.json_to_py(json_data, deal=True)['data']
            if data_list[:3] == data_end:
                break
            else:
                data_end = data_list[:3]
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        for data in data_list:
            field_list = data.split(',')
            stock_data.code = field_list[2]
            stock_data.name = field_list[3]
            stock_data.stock_rate = '10配' + field_list[6]
            stock_data.stock_price = stock_data.to_null(field_list[7])
            stock_data.up_date = field_list[14] if field_list[14] else 'null'
            sql = f'select code from stock_data where code="{stock_data.code}" and spider_date="{stock_data.spider_date}" and up_date="{stock_data.up_date}"'
            same_data = stock_data.sql_search(sql)
            if same_data:
                stock_data.sql_search(
                    f'delete from stock_data where code="{stock_data.code}" and spider_date="{stock_data.spider_date}" and up_date="{stock_data.up_date}"'
                )
                print(
                    f'重新爬取-{stock_data.spider_date}-{stock_data.code}-{stock_data.name}'
                )
            stock_data.data_save()
            print(
                f'{stock_data.up_date}-{stock_data.code}-{stock_data.name}-导入完成'
            )
        page += 1
    stock_data.spider_end()
Code Example #22
def bonus_data_spider():
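    # For every reporting period within the last year, save each stock's
    # dividend/transfer plan, parsing the plan string ('派', '转', '送') with
    # regexes; duplicate rows for the same day are deleted and re-imported.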
    bonus_data = SuperSpider(
        host='139.224.115.44',
        passwd='A9Vg+Dr*nP^fR=1V',
        db='bryframe3',
        table_name='bonus_data',
        field_list=('spider_date', 'bonus_report_date', 'code', 'name',
                    'cash_bonus_rate', 'transfer_rate', 'plan_announce_date',
                    'stock_register_date', 'remove_date', 'plan_scheduler',
                    'latest_announce_date'))
    date_list = bonus_data.data_search(
        'http://data.eastmoney.com/yjfp/201812.html',
        '//select[@id="sel_bgq"]/option/text()', 'gb2312')
    year_ago_datetime = bonus_data.to_datetime(bonus_data.date_ago(365))
    date_list2 = []
    for aim_date in date_list:
        if year_ago_datetime <= bonus_data.to_datetime(str(aim_date)):
            date_list2.append(aim_date)
        else:
            break
    for use_date in date_list2:
        bonus_data.bonus_report_date = use_date
        page = 1
        while True:
            print(f'第{page}页')
            try:
                json_data = bonus_data.get_html(
                    f'http://data.eastmoney.com/DataCenter_V3/yjfp/getlist.ashx?js=var%20aTnZIWfZ&pagesize=50&page={page}&sr=-1&sortType=YAGGR&mtk=%C8%AB%B2%BF%B9%C9%C6%B1&filter=(ReportingPeriod=^{use_date}^)&rt=51742239',
                    'GB2312')
                data_list = bonus_data.json_to_py(json_data, deal=True)['data']
            except:
                print(f'第{page}页获取失败')
                page += 1
                continue
            if not data_list or page == 500:
                break
            for data in data_list:
                bonus_data.code = data['Code']
                bonus_data.name = data['Name']
                bonus_data.latest_announce_date = bonus_data.to_null(
                    data['NoticeDate'][:10])
                sql = f'select code from bonus_data where code="{bonus_data.code}" and spider_date="{bonus_data.spider_date}" and latest_announce_date="{bonus_data.latest_announce_date}"'
                same_data = bonus_data.sql_search(sql)
                if same_data:
                    bonus_data.sql_search(
                        f'delete from bonus_data where code="{bonus_data.code}" and spider_date="{bonus_data.spider_date}" and latest_announce_date="{bonus_data.latest_announce_date}"'
                    )
                    print(
                        f'重新爬取-{bonus_data.spider_date}-{bonus_data.code}-{bonus_data.name}'
                    )
                bonus_data.plan_announce_date = bonus_data.to_null(
                    data['ResultsbyDate'][:10])
                bonus_data.stock_register_date = bonus_data.to_null(
                    data['GQDJR'][:10])
                bonus_data.remove_date = bonus_data.to_null(data['CQCXR'][:10])
                bonus_data.plan_scheduler = data['ProjectProgress']
                group_data = data['AllocationPlan']
                try:
                    bonus_data.cash_bonus_rate = '10' + bonus_data.re_find(
                        r'派[\d\.]+', group_data).__next__().group() + '元(含税)'
                except:
                    bonus_data.cash_bonus_rate = 'null'
                try:
                    transfer_rate1 = bonus_data.re_find(
                        r'转[\d\.]+', group_data).__next__().group()
                except:
                    transfer_rate1 = ''
                try:
                    transfer_rate2 = bonus_data.re_find(
                        r'送[\d\.]+', group_data).__next__().group()
                except:
                    transfer_rate2 = ''
                if not transfer_rate1 and not transfer_rate2:
                    bonus_data.transfer_rate = 'null'
                else:
                    bonus_data.transfer_rate = '10' + transfer_rate2 + transfer_rate1
                bonus_data.data_save()
                print(
                    f'{bonus_data.bonus_report_date}-{bonus_data.code}-{bonus_data.name}-导入完成'
                )
            page += 1
    bonus_data.spider_end()
Code Example #23
import aiohttp
import asyncio
from super_spider import SuperSpider
ip = SuperSpider(host='192.168.0.172')
session = aiohttp.ClientSession()


async def ip_check(sem, proxies):
    async with sem:
        try:
            key = 'http' if not proxies.startswith('https') else 'https'
            url = f'{key}://www.baidu.com'
            async with session.get(url,
                                   headers=ip.random_headers(),
                                   proxy=proxies,
                                   timeout=3) as response:
                status_code = response.status
                if status_code != 200:
                    ip.sql_search(f'delete from ip_pool where ip="{proxies}"')
                    print(f'{proxies}-不可用已删除')
                else:
                    print(f'{proxies}-可用')
        except:
            ip.sql_search(f'delete from ip_pool where ip="{proxies}"')
            print(f'{proxies}-不可用已删除')


proxies_list = ip.sql_search('select ip from ip_pool')


async def split_task():
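    # The original listing is truncated here. A minimal, hypothetical completion
    # (the semaphore limit of 100 is an assumption, not from the source):
    sem = asyncio.Semaphore(100)
    tasks = [ip_check(sem, row[0]) for row in proxies_list]
    await asyncio.gather(*tasks)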
Code Example #24
File: lhb_spider.py Project: cwy1019120542/MySpiders
def stock_info_spider():
    stock_info = SuperSpider(host='47.102.40.81',
                             passwd='Abc12345',
                             db='bryframe',
                             table_name='stock_info',
                             field_list=('code', 'name', 'spider_date',
                                         'up_date', 'highest', 'lowest',
                                         'today', 'yesterday'))
    for page in range(1, 181):
        try:
            json_data = stock_info.use_requests_to_html(
                f'http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?cb=jQuery11240974473783255319_1545290975192&type=CT&token=4f1862fc3b5e77c150a2b985b12db0fd&sty=FCOIATC&js=(%7Bdata%3A%5B(x)%5D%2CrecordsFiltered%3A(tot)%7D)&cmd=C._A&st=(ChangePercent)&sr=-1&p={page}&ps=20&_=1545290975206',
                'utf8')
            data_list = stock_info.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        print(f'第{page}页')
        for data_str in data_list:
            data = data_str.replace('-', 'null').split(',')
            stock_info.code = data[1]
            stock_info.name = data[2]
            stock_info.spider_date = stock_info.spider_date
            stock_info.up_date = stock_info.spider_date
            stock_info.highest = stock_info.to_null(data[9])
            stock_info.lowest = stock_info.to_null(data[10])
            stock_info.today = stock_info.to_null(data[11])
            stock_info.yesterday = stock_info.to_null(data[12])
            stock_info.data_save()
            print(
                f'行情中心:{stock_info.up_date}-{stock_info.code}-{stock_info.name}-导入完成'
            )
        page += 1
    stock_info.spider_end()
    print('end:行情中心')
Code Example #25
File: lhb_spider.py Project: cwy1019120542/MySpiders
def department_track_spider():
    department_track = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='department_track',
        field_list=('spider_date', 'up_date', 'code', 'name', 'list_time',
                    'buy_sum', 'buy_time', 'sell_time', 'buy_amount',
                    'up_down'))
    month_ago = department_track.date_ago(30)
    page = 1
    while True:
        try:
            json_data = department_track.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/JgStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={department_track.spider_date},gpfw=0,js=var%20data_tab_3.html?rt=25754592',
                'GB2312')
            data_list = department_track.json_to_py(json_data,
                                                    deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            department_track.up_date = department_track.spider_date
            department_track.code = data['SCode']
            department_track.name = data['SName']
            department_track.list_time = department_track.to_null(
                data['UPCount'])
            department_track.buy_sum = department_track.to_null(
                data['JGBMoney'])
            department_track.buy_time = department_track.to_null(
                data['JGBCount'])
            department_track.sell_time = department_track.to_null(
                data['JGSCount'])
            department_track.buy_amount = department_track.to_null(
                data['JGPBuy'])
            department_track.up_down = department_track.to_null(
                data['RChange1M'])
            department_track.data_save()
            print(
                f'机构席位买卖追踪:{department_track.up_date}-{department_track.code}-{department_track.name}-导入完成'
            )
        page += 1
    department_track.spider_end()
    print('end:机构席位买卖追踪')
Code Example #26
def business_detail_spider():
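    # For each entry in the daily active-department feed, open the per-stock
    # 龙虎榜 detail page and save every trading department's amount, skipping
    # stocks and departments that have already been seen.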
    business_detail_list = []
    business_detail = SuperSpider(host='47.102.40.81',
                                  passwd='Abc12345',
                                  db='bryframe',
                                  table_name='business_detail',
                                  field_list=('spider_date', 'up_date', 'code',
                                              'name', 'department_name',
                                              'amount'))
    business_detail.up_date = business_detail.spider_date
    page = 1
    while True:
        try:
            json_data = business_detail.get_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={business_detail.spider_date},endDate={business_detail.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25861061',
                'GB2312')
            data_list = business_detail.json_to_py(json_data,
                                                   deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            if not data['SName']:
                continue
            stock_data_list = business_detail.json_to_py(data['SName'])
            for stock_data in stock_data_list:
                if stock_data['CodeName'] not in business_detail_list:
                    business_detail_list.append(stock_data['CodeName'])
                else:
                    continue
                business_detail.name = stock_data['CodeName']
                business_detail.code = stock_data['SCode']
                sql = f'select code from business_detail where code="{business_detail.code}" and spider_date="{business_detail.spider_date}"'
                same_data = business_detail.sql_search(sql)
                if same_data:
                    business_detail.sql_search(
                        f'delete from business_detail where code="{business_detail.code}" and spider_date="{business_detail.spider_date}"'
                    )
                    print(
                        f'重新爬取-{business_detail.spider_date}-{business_detail.code}-{business_detail.name}'
                    )
                try:
                    url_code = business_detail.re_find(
                        r'\d+', business_detail.code).__next__().group()
                except:
                    continue
                url = f'http://data.eastmoney.com/stock/lhb,{business_detail.spider_date},{url_code}.html'
                try:
                    detail_data_list = [
                        i for i in business_detail.data_search(
                            url, '//div[@class="content-sepe"]//td//text()',
                            'gb2312') if i.strip() and '\r' not in i
                    ]
                    for i in range(6):
                        if '(买入前5名与卖出前5名)' in detail_data_list:
                            error_index = detail_data_list.index(
                                '(买入前5名与卖出前5名)')
                            del detail_data_list[error_index:error_index + 6]
                except:
                    print(
                        f'{business_detail.code}-{business_detail.name}-获取失败')
                    continue
                # print(detail_data_list)
                department_list = []
                for i, j in zip(range(1, 1000, 8), range(7, 1000, 8)):
                    try:
                        business_detail.department_name = detail_data_list[i]
                        if business_detail.department_name not in department_list:
                            department_list.append(
                                business_detail.department_name)
                        else:
                            print(
                                f'{business_detail.name}-{business_detail.department_name}-信息重复'
                            )
                            continue
                        business_detail.amount = detail_data_list[j]
                        # print(business_detail.amount)
                    except:
                        break
                    business_detail.data_save()
                    print(
                        f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成'
                    )
        page += 1
    business_detail.spider_end()
Code Example #27
def zjmyqyw_spdier():
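	# Reworked zj123.com crawler that keeps a 35-entry deque to skip recently
	# seen companies. Note that register_money is assigned twice near the end;
	# the second assignment ('员工人数') overwrites the first.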
	company_deque=deque([],maxlen=35)
	zjmyqyw=SuperSpider()
	zjmyqyw.source_name='浙江名营企业网'
	zjmyqyw.fax='-'
	zjmyqyw.get_request('http://www.zj123.com/')
	url_list1=['http://www.zj123.com/'+i.replace('1.','{}.') for i in zjmyqyw.data_search('find','.indsort dd a','href')]
	profession_list=list(zjmyqyw.data_search('find','.indsort dd a'))
	error_index=profession_list.index('特种印刷')
	for profession,url1 in zip(profession_list[error_index:],url_list1[error_index:]):
		for page in range(1,100):
			print(f'{profession}——第{page}页')
			try:
				zjmyqyw.get_request(url1.format(page))
				page_judge=zjmyqyw.data_search('find','.sleft .m.m1 .fred').__next__().split()[0]
			except:
				print(f'获取第{page}页失败')
				page+=1
				continue
			if int(page_judge) != page:
				break
			url_list2=('http://www.zj123.com/member/VIPContact/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			url_list3=('http://www.zj123.com/member/VIPCompany/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href'))
			#print(url_list2)
			for url2,url3 in zip(url_list2,url_list3):
				try:
					zjmyqyw.get_request(url2)
				except:
					continue
				contact_info_dict={i.split(':')[0].strip():i.split(':')[-1].strip().replace('\xa0','') for i in zjmyqyw.data_search('find','.rkbody table tr')}
				zjmyqyw.company_name=contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-'
				if zjmyqyw.company_name in company_deque:
					print('信息重复')
					continue
				zjmyqyw.person_name=contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-'
				zjmyqyw.address=contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-'
				zjmyqyw.phone_number=contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-'
				zjmyqyw.qq=contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-'
				zjmyqyw.website=contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-'
				try:
					zjmyqyw.get_request(url3)
				except:
					continue
				company_info_list=list(zjmyqyw.data_search('find','.rkbody table tr td'))
				company_info_dict={company_info_list[n].strip(': '):company_info_list[n+1].strip(': ') for n in range(0,24,2)}
				#print(company_info_dict)
				zjmyqyw.main_product=company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-'
				zjmyqyw.business_mode=company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-'
				zjmyqyw.company_type=company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-'
				zjmyqyw.register_money=company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-'
				zjmyqyw.staff_number=company_info_dict['员工人数'] if company_info_dict['员工人数'] else '-'  # assumed field name; the original line re-assigned register_money, overwriting the value set just above
				zjmyqyw.source_page=url2
				zjmyqyw.data_save()
				zjmyqyw.phone_number=contact_info_dict['手机'] if contact_info_dict['手机'] else '-'
				zjmyqyw.data_save()
				company_deque.append(zjmyqyw.company_name)
				print(f'{profession}——第{page}页——{zjmyqyw.company_name}信息导入完成')
	zjmyqyw.spider_end()
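The contact page and the company page scraped above are both plain 'key: value' tables. Below is a minimal standalone sketch of that parsing step; BeautifulSoup and the helper name parse_kv_rows are assumptions here, since the original goes through SuperSpider.data_search('find', ...) and dict comprehensions instead.

from bs4 import BeautifulSoup

def parse_kv_rows(html, selector):
    # Mirror the dict comprehensions above: split each row on the first ':'
    # and fall back to '-' for empty values, as the spider does.
    soup = BeautifulSoup(html, 'html.parser')
    info = {}
    for row in soup.select(selector):
        text = row.get_text().replace('\xa0', '').strip()
        if ':' not in text:
            continue
        key, _, value = text.partition(':')
        info[key.strip()] = value.strip() or '-'
    return info

With such a helper, contact_info_dict and company_info_dict would reduce to parse_kv_rows(html, '.rkbody table tr') style calls, and missing keys could be read with .get() instead of direct indexing.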
コード例 #28
0
ファイル: lhb_spider.py プロジェクト: cwy1019120542/MySpiders
def active_department_spider():
    active_department = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='active_department',
        field_list=('spider_date', 'up_date', 'name', 'buy_number',
                    'sell_number', 'buy_sum', 'sell_sum', 'business_amount',
                    'code', 'stock_name'))
    page = 1
    while True:
        try:
            json_data = active_department.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={active_department.spider_date},endDate={active_department.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754772',
                'GB2312')
            data_list = active_department.json_to_py(json_data,
                                                     deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            active_department.up_date = active_department.spider_date
            active_department.name = data['YybName']
            active_department.buy_number = active_department.to_null(
                data['YybBCount'])
            active_department.sell_number = active_department.to_null(
                data['YybSCount'])
            active_department.buy_sum = active_department.to_null(
                data['Bmoney'])
            active_department.sell_sum = active_department.to_null(
                data['Smoney'])
            active_department.business_amount = active_department.to_null(
                data['JmMoney'])
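            # 'SName' holds a JSON list of the stocks this department traded; fall back to null stock columns when it is empty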
            if not data['SName']:
                active_department.code = 'null'
                active_department.stock_name = 'null'
                active_department.data_save()
            else:
                for data_s in active_department.json_to_py(data['SName']):
                    active_department.code = data_s['SCode']
                    active_department.stock_name = data_s['CodeName']
                    active_department.data_save()
                    print(
                        f'每日活跃营业部:{active_department.up_date}-{active_department.name}-导入完成'
                    )
        page += 1
    active_department.spider_end()
    print('end:每日活跃营业部')
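active_department_spider pages through the Eastmoney ActiveStatistics endpoint 50 records at a time, and the response is a JavaScript assignment rather than bare JSON. A minimal sketch of that fetch-and-parse step, assuming plain requests and the hypothetical helper name fetch_active_statistics_page (the original uses SuperSpider.use_requests_to_html plus json_to_py(deal=True), which presumably strips the 'var data_tab_1=' prefix):

import json
import requests

def fetch_active_statistics_page(spider_date, page):
    # Build the paged ActiveStatistics URL the same way the spiders above do.
    url = (
        'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/'
        f'pagesize=50,page={page},sortRule=-1,sortType=JmMoney,'
        f'startDate={spider_date},endDate={spider_date},gpfw=0,'
        'js=var%20data_tab_1.html'
    )
    resp = requests.get(url, timeout=10)
    resp.encoding = 'gb2312'
    text = resp.text
    # Assumed: drop everything before the first '{' to peel off the JS prefix,
    # then parse the remaining JSON object.
    payload = text[text.find('{'):]
    return json.loads(payload).get('data') or []

The empty data_list check and the page == 500 cap above are the loop's only exit conditions, so a cap like this keeps a malformed response from paging forever.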
コード例 #29
0
def wl114_spider():
    wl114 = SuperSpider()
    wl114.source_name = '网络114'
    wl114.business_mode = '-'
    wl114.register_money = '-'
    wl114.website = '-'
    wl114.qq = '-'
    wl114.get_request('http://www.net114.com/')
    url_list1 = [
        i.replace('.html', '-p-{}.html') for i in wl114.data_search(
            'xpath',
            '//*[@id="product_center_content"]/div/ul/li/p/a',
            attr='href') if i.endswith('.html')
    ]
    profession_list1 = [
        i for i in wl114.data_search(
            'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a')
        if i != '更多>>'
    ]
    error_index = profession_list1.index('维护工具')
    url_list2 = (i for i in wl114.data_search(
        'xpath',
        '//*[@id="product_center_content"]/div/ul/li/p/a',
        attr='href') if not i.endswith('.html'))
    profession_list2 = (i for i in wl114.data_search(
        'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a')
                        if i == '更多>>')
    for url1, profession1 in zip(url_list1[error_index:],
                                 profession_list1[error_index:]):
        try:
            wl114.get_request(url1.format(1))
            all_page = wl114.data_search(
                'find', '.page_p:not(span)').__next__().split('\xa0')[1]
        except:
            continue
        for page in range(1, int(all_page) + 1):
            print(f'{profession1}——第{page}页')
            try:
                wl114.get_request(url1.format(page))
            except:
                continue
            url_list3 = list(
                wl114.data_search('find', '.product_list_div_h143 h2 a',
                                  'href'))
            if not url_list3:
                break
            for url3 in url_list3:
                try:
                    wl114.get_request(url3)
                    company_info_dict = {
                        i.split(':')[0].strip(): i.split(':')[-1].strip()
                        for i in wl114.data_search(
                            'find', '.right.w_250 .border.p_8 li') if ':' in i
                    }
                    phone_url = wl114.data_search(
                        'find', '.right.w_250 .border.p_8 li a',
                        'href').__next__()
                except:
                    continue
                wl114.company_type = company_info_dict.get('企业性质', '-')
                wl114.main_product = company_info_dict.get('企业主营', '-')
                wl114.address = company_info_dict.get('企业地址', '-')
                try:
                    wl114.get_request(phone_url)
                except:
                    continue
                phone_info_data = wl114.data_search(
                    'find', 'td[valign="top"]:first-child')
                try:
                    phone_info_list = phone_info_data.__next__().split('\n')
                    phone_info_dict = {
                        i.split(':')[0].strip(): i.split(':')[-1].strip()
                        for i in phone_info_list if ':' in i
                    }
                except:
                    continue
                wl114.company_name = phone_info_dict.get('公司名称', '-')
                if wl114.company_name == '-':
                    wl114.company_name = phone_info_dict.get('企业名称', '-')
                wl114.person_name = phone_info_dict.get('联系人', '-')
                wl114.fax = phone_info_dict.get('传真', '-')
                wl114.phone_number = phone_info_dict.get('手机', '-')
                wl114.source_page = url3
                wl114.data_save()
                wl114.phone_number = phone_info_dict.get('联系电话', '-')
                wl114.data_save()
                print(f'{profession1}——第{page}页——{wl114.company_name}信息导入完成')
    for url2 in url_list2:
        try:
            wl114.get_request(url2)
        except:
            continue
        url_list4 = (i.replace('.html', '-p-{}.html')
                     for i in wl114.data_search(
                         'find', '.product_w369_list a[href]', 'href'))
        profession_list4 = wl114.data_search('find',
                                             '.product_w369_list a[href]')
        for profession4, url4 in zip(profession_list4, url_list4):
            try:
                wl114.get_request(url4.format(1))
                all_page = wl114.data_search(
                    'find', '.page_p:not(span)').__next__().split('\xa0')[1]
            except:
                continue
            for page in range(1, int(all_page) + 1):
                print(f'{profession4}——第{page}页')
                try:
                    wl114.get_request(url4.format(page))
                except:
                    continue
                url_list3 = list(
                    wl114.data_search('find', '.product_list_div_h143 h2 a',
                                      'href'))
                if not url_list3:
                    break
                for url3 in url_list3:
                    try:
                        wl114.get_request(url3)
                        company_info_dict = {
                            i.split(':')[0].strip(): i.split(':')[-1].strip()
                            for i in wl114.data_search(
                                'find', '.right.w_250 .border.p_8 li')
                            if ':' in i
                        }
                        phone_url = wl114.data_search(
                            'find', '.right.w_250 .border.p_8 li a',
                            'href').__next__()
                    except:
                        continue
                    wl114.company_type = company_info_dict.get('企业性质', '-')
                    wl114.main_product = company_info_dict.get('企业主营', '-')
                    wl114.address = company_info_dict.get('企业地址', '-')
                    try:
                        wl114.get_request(phone_url)
                    except:
                        continue
                    phone_info_data = wl114.data_search(
                        'find', 'td[valign="top"]:first-child')
                    try:
                        phone_info_list = phone_info_data.__next__().split(
                            '\n')
                        phone_info_dict = {
                            i.split(':')[0].strip(): i.split(':')[-1].strip()
                            for i in phone_info_list if ':' in i
                        }
                    except:
                        continue
                    wl114.company_name = phone_info_dict.get('公司名称', '-')
                    if wl114.company_name == '-':
                        wl114.company_name = phone_info_dict.get('企业名称', '-')
                    wl114.person_name = phone_info_dict.get('联系人', '-')
                    wl114.fax = phone_info_dict.get('传真', '-')
                    wl114.phone_number = phone_info_dict.get('手机', '-')
                    wl114.source_page = url3
                    wl114.data_save()
                    wl114.phone_number = phone_info_dict.get('联系电话', '-')
                    wl114.data_save()
                    print(
                        f'{profession4}——第{page}页——{wl114.company_name}信息导入完成')
    wl114.spider_end()
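The second half of wl114_spider repeats the per-page crawl almost verbatim for the '更多>>' categories. A hedged sketch of how that shared loop could be pulled into one helper; crawl_pages and handle_detail are made-up names, not part of the original:

def crawl_pages(wl114, profession, page_url_template, all_page, handle_detail):
    # Shared pagination loop used twice in wl114_spider above; handle_detail
    # would hold the common company/contact parsing and data_save calls.
    for page in range(1, all_page + 1):
        print(f'{profession}——第{page}页')
        try:
            wl114.get_request(page_url_template.format(page))
        except Exception:
            continue
        detail_urls = list(
            wl114.data_search('find', '.product_list_div_h143 h2 a', 'href'))
        if not detail_urls:
            break
        for url in detail_urls:
            handle_detail(wl114, url)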
コード例 #30
0
ファイル: lhb_spider.py プロジェクト: cwy1019120542/MySpiders
def business_detail_spider():
    stock_list = []
    business_detail = SuperSpider(host='47.102.40.81',
                                  passwd='Abc12345',
                                  db='bryframe',
                                  table_name='business_detail',
                                  field_list=('spider_date', 'up_date', 'code',
                                              'name', 'department_name',
                                              'amount'))
    business_detail.up_date = business_detail.spider_date
    page = 1
    while True:
        try:
            json_data = business_detail.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={business_detail.spider_date},endDate={business_detail.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25861061',
                'GB2312')
            data_list = business_detail.json_to_py(json_data,
                                                   deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            if not data['SName']:
                continue
            stock_data_list = business_detail.json_to_py(data['SName'])
            for stock_data in stock_data_list:
                if stock_data['CodeName'] not in stock_list:
                    stock_list.append(stock_data['CodeName'])
                else:
                    continue
                business_detail.name = stock_data['CodeName']
                business_detail.code = stock_data['SCode']
                try:
                    url_code = business_detail.re_find(
                        r'\d+', business_detail.code).__next__().group()
                except:
                    continue
                print(url_code)
                url = f'http://data.eastmoney.com/stock/lhb,{business_detail.spider_date},{url_code}.html'
                try:
                    business_detail.get_request(url)
                except:
                    continue
                detail_data_list = list(
                    business_detail.data_search('find', 'table tbody td'))
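                # here the cells come 7 per department row (at most 10 rows): index 1 holds the department name, index 6 the amount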
                for i, j in zip(range(1, 71, 7), range(6, 71, 7)):
                    try:
                        business_detail.department_name = detail_data_list[
                            i].split('\n')[0]
                        business_detail.amount = detail_data_list[j]
                    except:
                        break
                    business_detail.data_save()
                    print(
                        f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成'
                    )
        page += 1
    business_detail.spider_end()
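Both business_detail variants derive the per-stock detail URL from the numeric part of SCode plus the crawl date. A small illustration of that step using the standard re module instead of SuperSpider.re_find; build_lhb_detail_url is a hypothetical name:

import re

def build_lhb_detail_url(scode, spider_date):
    # Pull the digits out of SCode (which may carry extra non-digit characters)
    # and build the Eastmoney lhb detail page URL used above.
    match = re.search(r'\d+', scode)
    if match is None:
        return None
    return f'http://data.eastmoney.com/stock/lhb,{spider_date},{match.group()}.html'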