import sys
import os

# Make the project root importable before the project-local imports below.
sys.path.append(os.path.dirname(__file__) + os.sep + '../')

import time
from datetime import datetime

from import_data_to_mysql import con_db
from common_tool import get_response, redis_return_operation, get_log
from setting import db_setting
from import_data_to_redis import RedisCache_checkAPI

"""
League of Legends official-site schedule crawler.
"""
match_yxlmgw_log = get_log('match_yxlmgw')
# LPL team abbreviations used by this crawler.
team_list = ['RNG', 'ES', 'EDG', 'LGD', 'IG', 'BLG', 'TES', 'SN', 'WE', 'OMG',
             'DMO', 'LNG', 'JDG', 'FPX', 'RW', 'VG', 'V5']
# Official-site url headers ---> headers_yxlmgw
# Normally, on the schedule page: url_finish_1 shows 2 finished matches;
# url_matching is the in-progress feed (status '-1' means no match is live);
# url_unfinish is the not-yet-started feed.
# url_finish_2, url_finish_3: finished matches within one week.
# url_unfinish_2, url_unfinish_3: the original comment also said "finished
# within one week" — likely a copy-paste; presumably upcoming matches. Confirm.
url_finish_1 = 'https://apps.game.qq.com/lol/match/apis/searchBMatchInfo_bak.php?p8=5&p1=134&p4=3&p2=%C8%AB%B2%BF&p9=&p10=&p6=2&p11=&p12=&page=1&pagesize=2&_='
url_finish_2 = 'https://apps.game.qq.com/lol/match/apis/searchBMatchInfo_bak.php?p8=5&p1=134&p4=&p2=%C8%AB%B2%BF&p9=&p10=&p6=2&p11={0}&p12=&page=1&pagesize=8&_='
import sys
import os

# Make the project root importable before the project-local imports below.
sys.path.append(os.path.dirname(__file__) + os.sep + '../')

from datetime import datetime

from common_tool import get_response, get_log, redis_check
from import_data_to_mysql import con_db
from import_data_to_redis import RedisCache_checkAPI
from setting import db_setting

"""
Leijingji (ray83.com) League of Legends odds crawler.
url: https://www.ray83.com/match/37198305
"""
leijingji_log = get_log('leijingji')
# Crawler flow:
# first load two pages of schedule urls: start_url, second_url
#   start_url = 'https://incpgameinfo.esportsworldlink.com/v2/match?page=1&match_type=2'
#   second_url = 'https://incpgameinfo.esportsworldlink.com/v2/match?page=2&match_type=2'
# take match ids from start_url/second_url and piece together the detail url;
# from the detail url get the corresponding odds url:
#   https://incpgameinfo.esportsworldlink.com/v2/odds?match_id=37219633
# Today's matches.
start_url = 'https://incpgameinfo.esportsworldlink.com/v2/match?page=1&match_type=2'
# Live ("rolling") odds feeds.
gunpan_url1 = 'https://incpgameinfo.esportsworldlink.com/v2/match?page=1&match_type=1'
gunpan_url2 = 'https://incpgameinfo.esportsworldlink.com/v2/match?page=1&match_type=0'
gunpan_url3 = 'https://incpgameinfo.esportsworldlink.com/v2/match?page=2&match_type=0'
gunpan_urls = [gunpan_url1, gunpan_url2, gunpan_url3]
""" start_url = 'https://www.scoregg.com/services/api_url.php' header = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' ' Chrome/84.0.4147.89 Safari/537.36' } league_exclude = [ '2020 LCK夏季升降级赛', '2019KeSPA杯', '2019拉斯维加斯全明星', 'LPL公开训练赛', '2017 KPL秋季赛' ] position_dict = {'上单': 1, '打野': 2, '中单': 3, 'ADC': 4, '辅助': 5, 'None': 6} lol_player_log = get_log('lol_player') # 请求英雄联盟联赛id的form_data url:https://www.scoregg.com/services/api_url.php form_data_yxlm = { 'api_path': '/services/match/web_tournament_group_list.php', 'method': 'GET', 'platform': 'web', 'api_version': '9.9.9', 'language_id': 1, 'gameID': 1, 'type': 'all', 'page': 1, 'limit': 18, 'year': '' }
# 淘汰赛 url_knockout1 = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A%2BalA%3D&begin_time=1596067200&end_time=1596153600&seasonid=KCC2020S' url_knockout2 = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A%2BalA%3D&begin_time=1596153600&end_time=1596240000&seasonid=KCC2020S' url_knockout3 = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A%2BalA%3D&begin_time=1596240000&end_time=1596326400&seasonid=KCC2020S' url_knockout4 = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A%2BalA%3D&begin_time=1596326400&end_time=1596412800&seasonid=KCC2020S' url_knockout5 = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A%2BalA%3D&begin_time=1596844800&end_time=1596931200&seasonid=KCC2020S' url_knockout6 = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A%2BalA%3D&begin_time=1596931200&end_time=1597017600&seasonid=KCC2020S' url_knockout7 = 'https://app.tga.qq.com/openapi/tgabank/getSchedules?appid=10005&sign=K8tjxlHDt7HHFSJTlxxZW4A%2BalA%3D&begin_time=1597536000&end_time=1597622400&seasonid=KCC2020S' url_knockout_list = [ url_knockout1, url_knockout2, url_knockout3, url_knockout4, url_knockout5, url_knockout6, url_knockout7 ] match_wzry_log = get_log('match_wzry') redis = RedisCache_checkAPI() db = con_db(db_setting['host'], db_setting['user'], db_setting['password'], db_setting['db']) def parse_wzry(url, headers, propertys, db): try: responses = get_response(url, headers) results = responses['data'] # print(len(results), results) game_name = '王者荣耀' source_from = '王者荣耀官网' # 爬虫源网站 types = 2 for result in results: # print('赛程数据1:', type(result), result)
""" 五大联赛资讯爬虫 抓取规则:五大联赛对应的start_url找到article_id,拼接成咨询详情页的url 用xpath提取 """ redis = RedisCache_urldistict() db = con_db(db_sport_setting['host'], db_sport_setting['user'], db_sport_setting['password'], db_sport_setting['db']) England_url = 'http://www.ppsport.com/premierleague' Spain_url = 'http://www.ppsport.com/laliga' German_url = 'http://www.ppsport.com/bundesliga' Italy_url = 'http://www.ppsport.com/seriea' France_url = 'http://www.ppsport.com/ligue1' information_pptv_log = get_log('information_pptv') # 五大联赛对应联赛归属 start_url_O = { England_url:'英超联赛', Spain_url:'西甲联赛', German_url:'德甲联赛', Italy_url:'意甲联赛', France_url:'法甲联赛' } # 五大联赛的网页对应的提取资讯id的xpath规则: # 英超是两个板块,其他联赛是三个板块(三个板块xpath规则目前看起来是一样) # 提取的资讯id格式为('/article/news/1042662.html'),直接凭借成咨询详情页 league_xpath_rule = { England_url: { '/html/body/div[1]/div[4]/div[2]/div[3]/dl[1]/dd/a/@href', '/html/body/div[1]/div[4]/div[6]/div[2]/div/div/div/div/a/@href' }, Spain_url:{
form_data has two main variable parameters: tournament_id (league id) and page (page number)
"""
# NOTE(review): the text above is the tail of a module docstring that began before this chunk.
start_url = 'https://www.scoregg.com/services/api_url.php'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/84.0.4147.89 Safari/537.36'
}
# League names to exclude from the crawl (runtime values — kept verbatim).
league_exclude = [
    '2020 LCK夏季升降级赛', '2019KeSPA杯', '2019拉斯维加斯全明星', 'LPL公开训练赛', '2017 KPL秋季赛'
]
lol_team_log = get_log('lol_team_log')
# form_data for requesting LoL tournament ids; url: https://www.scoregg.com/services/api_url.php
form_data_yxlm = {
    'api_path': '/services/match/web_tournament_group_list.php',
    'method': 'GET',
    'platform': 'web',
    'api_version': '9.9.9',
    'language_id': 1,
    'gameID': 1,
    'type': 'all',
    'page': 1,
    'limit': 18,
    'year': ''
}
# Build unix timestamps bracketing the current week.
today = datetime.now()
week_day = today.weekday()
today_str = today.strftime('%Y-%m-%d 00:00:00')
str_today = datetime.strptime(today_str, '%Y-%m-%d %H:%M:%S')
# 00:00:00 of today as a unix timestamp.
today_stamp = str_today.timestamp()
# 00:00:00 timestamp of this week's Monday.
monday_stamp = int(today_stamp - 86400 * week_day)
# 00:00:00 timestamp of last Sunday.
last_weekstamp = int(monday_stamp - 86400)
# 00:00:00 timestamp of next Monday.
next_weekstamp = int(monday_stamp + 86400 * 7)
time_list = [monday_stamp, next_weekstamp]
match_wanplus_log = get_log('match_wanplus')
redis = RedisCache_checkAPI()
db = con_db(db_setting['host'], db_setting['user'], db_setting['password'], db_setting['db'])


def parse_wanplus(url, data, db, headers):
    """POST *data* to the wanplus schedule api at *url* and walk the
    entries of responses['data']['scheduleList'], labelling them with the
    fixed game name/source and types=1.

    NOTE(review): the source chunk is truncated below — the loop body
    continues outside this view, so this definition is incomplete here.
    """
    try:
        responses = post_response(url, data, headers)
        results = responses['data']['scheduleList']
        game_name = '英雄联盟'
        source_from = 'wanplus'  # source website of this crawl
        types = 1
        for key_list, result in results.items():
            date_time = result['time']
import sys
import os

# Make the project root importable before the project-local imports below.
sys.path.append(os.path.dirname(__file__) + os.sep + '../')

import requests
from lxml import etree
from import_data_to_mysql import con_db
from setting import db_sport_setting
from common_tool import get_log

"""
China Sports Lottery (jingcai) football odds crawler.
"""
# Database handle built from the sport DB settings.
db = con_db(db_sport_setting['host'], db_sport_setting['user'], db_sport_setting['password'], db_sport_setting['db'])
sport_bet_log = get_log('sport_bet')
# CSS class -> bet flag (presumably: circled=2, banker=1, empty=0 — TODO confirm against the parser).
bet_list = {'u-cir': 2, 'u-dan': 1, 'u-kong': 0}
headers_bet = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/84.0.4147.125 Safari/537.36'
}
start_url = 'https://info.sporttery.cn/football/match_list.php'
# Detail-page url prefix; the caller appends a match id.
match_betdetail = 'https://www.lottery.gov.cn/football/match_hhad.jspx?mid='
# NOTE: this request runs at import time (kept for backward compatibility).
# A timeout is set so a hung server cannot block the import forever.
response = requests.get(url=start_url, headers=headers_bet, timeout=30)
# 'gbk' is a strict superset of the page's declared 'gb2312': it decodes all
# gb2312 text identically while also handling GBK-only characters that would
# otherwise raise UnicodeDecodeError.
response = response.content.decode('gbk')
match_url_pre = 'https://img1.famulei.com/tr_round/{0}.json?_={1}' # 战队排名url type_url_pre = 'https://img1.famulei.com/match/teamrank/{0}.json?_={1}' # 此时的时间戳 now_time_match = datetime.now() timestamps_match = int(now_time_match.timestamp() * 1000) source = 'score' # 键值对:‘网站战队名:分组名’ team_type_name = {} # 键值对:‘后端返回战队名:分组名’ realteam_type_name = {} league_board_log = get_log('league_board') db = con_db(db_setting['host'], db_setting['user'], db_setting['password'], db_setting['db']) redis = RedisCache_checkAPI() def parse(form_data_yxlm, types): game_name = '英雄联盟' if types == 1 else '王者荣耀' league_id = 0 try: responses = post_response(start_url, form_data_yxlm, header) responses = responses['data']['list'] # print('源数据:', responses) for response in responses: # 拿到联赛id tournamentID = response['tournamentID']
import json
from datetime import datetime, timedelta
from common_tool import redis_check, get_response_proxy, get_log
from import_data_to_mysql import con_db
from import_data_to_redis import RedisCache_checkAPI
from setting import db_setting

"""
Shangniu esports match-detail crawler (data unstable — deprecated).
"""
# Database handle.
db = con_db(db_setting['host'], db_setting['user'], db_setting['password'], db_setting['db'])
# Redis handle.
redis = RedisCache_checkAPI()
detail_log = get_log('match_detail')
# LPL team abbreviation list.
LPL_list = [
    'RNG', 'ES', 'EDG', 'LGD', 'IG', 'BLG', 'TES', 'SN', 'WE', 'OMG', 'DMO',
    'LNG', 'JDG', 'FPX', 'RW', 'VG', 'V5'
]
now_date = datetime.now()
now_stamp = int(now_date.timestamp())
# print(now_stamp)
# Cookie string for the request headers; the Hm_lpvt_* field carries the
# current timestamp.
cookie_message = 'UM_distinctid=172d9950ded60f-00916514fef24f-4353761-e1000-172d9950deea7b; ' \
                 'Hm_lvt_c95eb6bfdfb2628993e507a9f5e0ea01=1594349716,1594629849,1594689821,1594950270; ' \
                 'CNZZDATA1278221275=1183247664-1592785074-%7C1594948928; ' \
                 'Hm_lpvt_c95eb6bfdfb2628993e507a9f5e0ea01={}'.format(now_stamp)
headers = {
    # NOTE(review): source chunk is truncated here — the headers dict
    # continues outside this view.
from import_data_to_redis import RedisCache_checkAPI
from datetime import datetime, timedelta
from setting import db_setting
import json

"""
League of Legends match-detail crawler, scraped from the score site.
# start_url: https://www.scoregg.com/schedule
Flow:
from start_url take yesterday's and today's matches on the site, filtering by
schedule type to choose which match details to scrape. The 'result' field is
how many games the match ran; take each resultID inside it and build the
per-game detail url, e.g.
'https://img1.famulei.com/match/result/21952.json?_=1596076077396',
then parse the returned api data into the database.
"""
match_detail_score_log = get_log('match_detail_score')
# Tournaments currently needed (id -> display name).
tournamentID = {
    '170': '2020 LCS夏季赛',
    '171': '2020 LEC夏季赛',
    '172': '2020 LPL夏季赛',
    '173': '2020 LCK夏季赛',
    '174': '2020 LDL夏季赛'
}
# Hero slot key -> position number (slots a..e map to 1..5 on both sides).
position_dict = {
    'blue_hero_a_name': 1,
    'blue_hero_b_name': 2,
    'blue_hero_c_name': 3,
    'blue_hero_d_name': 4,
    'blue_hero_e_name': 5,
    'red_hero_a_name': 1,
    'red_hero_b_name': 2,
    'red_hero_c_name': 3,
    'red_hero_d_name': 4,
    'red_hero_e_name': 5
}
Hero leaderboard crawler (League of Legends & Honor of Kings).
Scrape rule:
every league has a tournament_id; POST to https://www.scoregg.com/services/api_url.php
form_data has two main variable parameters: tournament_id (league id) and page (page number)
"""
# NOTE(review): the text above is the tail of a module docstring that began before this chunk.
start_url = 'https://www.scoregg.com/services/api_url.php'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/84.0.4147.89 Safari/537.36'
}
# League names to exclude from the crawl (runtime values — kept verbatim).
league_exclude = ['2020 LCK夏季升降级赛', '2019KeSPA杯', '2019拉斯维加斯全明星', 'LPL公开训练赛', '2017 KPL秋季赛']
lol_heros_log = get_log('lol_heros')
# form_data for requesting LoL tournament ids; url: https://www.scoregg.com/services/api_url.php
form_data_yxlm = {
    'api_path': '/services/match/web_tournament_group_list.php',
    'method': 'GET',
    'platform': 'web',
    'api_version': '9.9.9',
    'language_id': 1,
    'gameID': 1,
    'type': 'all',
    'page': 1,
    'limit': 18,
    'year': ''
}