def lst4objs_txt_az(xobjs,fltLst=[]):
    """Build an 8-slot odds/handicap list from the text of ``xobjs``.

    The text of every element (``get_text()``) is filtered through
    ``zstr.str_flt`` with ``fltLst`` and split on single spaces.  Empty
    tokens and the markers '升' / '降' are dropped and the remaining tokens
    are pooled.  Only the first three and the last three pooled tokens are
    used:

    * slots 0 and 1 -- tokens 0 and 2 of the leading triple;
    * slots 2 and 3 -- tokens 0 and 2 of the trailing triple;
    * slots 4 / 5   -- the middle token of the leading triple, looked up in
      ``tfsys.pan`` and converted to float: a value >= 0 is stored in slot 5,
      otherwise its negation is stored in slot 4;
    * slots 6 / 7   -- same rule for the middle token of the trailing triple
      (slot 7 when >= 0, slot 6 otherwise).

    Slots that are not written keep the integer 0.

    NOTE(review): a second function with this exact name is defined later in
    this file and replaces this one when the module is imported.

    Raises:
        IndexError: if fewer than three tokens were collected.
        KeyError: if a handicap token is not a key of ``tfsys.pan``.
    """
    ignored = ('', '升', '降')
    tokens = []
    for node in xobjs:
        filtered = zstr.str_flt(node.get_text(), fltLst)
        tokens.extend(tok for tok in filtered.split(' ') if tok not in ignored)

    picked = tokens[:3] + tokens[-3:]
    result = [picked[0], picked[2], picked[3], picked[5], 0, 0, 0, 0]

    opening = float(tfsys.pan[picked[1]])
    closing = float(tfsys.pan[picked[4]])

    # The sign selects the slot; the stored value is always the magnitude.
    slot, value = (5, opening) if opening >= 0 else (4, -opening)
    result[slot] = value
    slot, value = (7, closing) if closing >= 0 else (6, -closing)
    result[slot] = value
    return result
def lst4objs_txt2(text, fltLst=[]):
    """Return the non-empty, space-separated tokens of ``text``.

    ``text`` is first passed through ``zstr.str_flt`` together with
    ``fltLst``; the result is split on single spaces and empty pieces are
    discarded.
    """
    filtered = zstr.str_flt(text, fltLst)
    return [piece for piece in filtered.split(' ') if piece != '']
def lst4objs_txt(xobjs, fltLst=[]):
    """Collect the non-empty text tokens of every element in ``xobjs``.

    For each element, ``get_text()`` is filtered through ``zstr.str_flt``
    with ``fltLst`` and split on single spaces; all non-empty pieces are
    returned in encounter order as one flat list.
    """
    tokens = []
    for node in xobjs:
        filtered = zstr.str_flt(node.get_text(), fltLst)
        tokens.extend(piece for piece in filtered.split(' ') if piece != '')
    return tokens
def fb_get_team_dataset(htm):
    """Parse the team list and the standings table embedded in a league page.

    Two JavaScript array literals are read from ``htm``:

    * ``var arrTeam = [[...],[...]]`` -- field 0 of each entry is the team
      id and field 1 the team name (single quotes are stripped with
      ``zstr.str_flt``);
    * ``var totalScore = [[...],[...]]`` -- one entry per team; the fields
      used are listed next to each assignment below.

    After all rows are collected, ``goal_score`` and ``goal_conceded`` are
    divided by their respective column maxima.

    Args:
        htm: page source as a string.

    Returns:
        ``(teamDict, scoresDf)`` where ``teamDict`` maps team id to team
        name and ``scoresDf`` is an object-dtype DataFrame with one row per
        ``totalScore`` entry.  Columns not assigned here keep the value
        taken from ``tfsys.scNil``.

    Raises:
        IndexError: if either JavaScript variable is missing from ``htm``
            or an entry has fewer fields than expected.
        ValueError: if a numeric field cannot be converted.
        ZeroDivisionError: if the round count (field 4) is 0.
    """
    # 1. Teams taking part in the league: id -> name.
    teamDict = {}
    result = re.findall(r"var arrTeam = \[.*\]", htm)
    res = result[0].split(' = ')
    teams = re.findall(r"\[\[(.*)\]\]", res[1])
    teams = teams[0].split('],[')
    for team in teams:
        res = team.split(',')
        team_id = res[0]
        team_name = zstr.str_flt(res[1], '\'')
        teamDict[team_id] = team_name

    # 2. Per-team standings: points, goals scored/conceded, recent results.
    ds = pd.Series(tfsys.scNil, index=tfsys.scSgn, dtype=str)
    result = re.findall(r"var totalScore = \[.*\]", htm)
    res = result[0].split(' = ')
    scores = re.findall(r"\[\[(.*)\]\]", res[1])
    scores = scores[0].split('],[')

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for score in scores:
        res = score.split(',')
        ds['teamPL'] = int(res[1])  # league position
        ds['team_id'] = res[2]  # team id
        ds['MW'] = int(res[4])  # rounds played
        ds['wins'] = int(res[5])  # wins
        ds['draws'] = int(res[6])  # draws
        ds['losts'] = int(res[7])  # losses
        ds['goal_score'] = float(res[8])  # goals scored
        ds['goal_conceded'] = float(res[9])  # goals conceded
        ds['goal_diff'] = float(res[10])  # goal difference
        ds['GS'] = float(res[14])  # goals scored per game
        ds['GC'] = float(res[15])  # goals conceded per game
        ds['TP'] = float(res[16]) / int(res[4])  # points per game
        ds['M1'] = get_sign(res[24])  # result 1 round ago
        ds['M2'] = get_sign(res[23])  # result 2 rounds ago
        ds['M3'] = get_sign(res[22])  # result 3 rounds ago
        ds['M4'] = get_sign(res[21])  # result 4 rounds ago
        ds['M5'] = get_sign(res[20])  # result 5 rounds ago
        # Snapshot the reused Series so later iterations do not alias it.
        rows.append(ds.to_dict())

    # dtype=object keeps the mixed str/int/float cells exactly as assigned.
    scoresDf = pd.DataFrame(rows, dtype=object)

    # Scale goal totals by the league-wide maximum.
    gs_max = (scoresDf['goal_score']).max()
    gc_max = (scoresDf['goal_conceded']).max()
    scoresDf['goal_score'] = scoresDf['goal_score'] / gs_max
    scoresDf['goal_conceded'] = scoresDf['goal_conceded'] / gc_max
    return teamDict, scoresDf
def lst4objs_txt_az(xobjs, fltLst=[]):
    """Return the first and last three text tokens of ``xobjs`` as one list.

    The text of every element (``get_text()``) is filtered through
    ``zstr.str_flt`` with ``fltLst`` and split on single spaces; empty
    tokens and the markers '升' / '降' are dropped.  The first three and the
    last three remaining tokens are concatenated, and the tokens at
    positions 1 and 4 (the middle of each triple) are replaced by their
    ``tfsys.pan`` lookup.

    Raises:
        IndexError: if too few tokens were collected.
        KeyError: if a middle token is not a key of ``tfsys.pan``.
    """
    skipped = ('', '升', '降')
    tokens = [
        tok
        for node in xobjs
        for tok in zstr.str_flt(node.get_text(), fltLst).split(' ')
        if tok not in skipped
    ]
    picked = tokens[:3] + tokens[-3:]
    # Translate the two handicap labels through the lookup table.
    for pos in (1, 4):
        picked[pos] = tfsys.pan[picked[pos]]
    return picked
def fb_league_gids(htm, league, fgExt=True):
    """Extract finished games per round from a league page into ``tfsys.gids``.

    The page is expected to contain one ``jh["R_<n>"] = [[...],[...]]``
    JavaScript assignment per round.  The number of such assignments is the
    round count; rounds 6 through that count are parsed (the loop starts at
    index 5 and looks up round ``index + 1``).  Each game becomes one row
    with the id, league, final score, outcome code, team ids/names, kick-off
    field and round number.

    Side effects:
        * the new rows are concatenated onto ``tfsys.gids`` and duplicates
          by ``gid`` are dropped, keeping the newest row;
        * ``fb_gid_getExt`` (``fgExt`` true; single-threaded per the
          original comment) or ``fb_gid_getExtPool`` (multi-threaded per the
          original comment) is called with the new rows;
        * when ``tfsys.gidsFN`` is non-empty, the tail of ``tfsys.gids`` is
          printed and the table is written to that path as gb18030 CSV.

    Args:
        htm: page source as a string.
        league: value stored in the ``gset`` column of every row.
        fgExt: selects which follow-up fetch function is called.

    Returns:
        None.
    """
    ds = pd.Series(tfsys.gidNil, index=tfsys.gidSgn, dtype=str)
    # Number of rounds present in the page.
    nround = re.findall(r"jh\[\"R_.*\"\] = \[.*\]", htm)
    n_round = len(nround)

    # Rows are collected in a list and framed once: DataFrame.append was
    # removed in pandas 2.0.
    rows = []
    for n in range(5, n_round):
        round_no = str(n + 1)
        # Raw string pieces avoid the invalid "\[" escapes of the original.
        pattern = r'jh\["R_' + round_no + r'"\] = \[.*\]'
        result = re.findall(pattern, htm)
        res = result[0].split(' = ')
        games = re.findall(r"\[\[(.*)\]\]", res[1])
        games = games[0].split('],[')
        for game in games:
            res = game.split(',')
            ds['gid'] = res[0]
            ds['gset'] = league
            ds['kend'] = '1'  # game is finished
            score = zstr.str_flt(res[6], '\'')
            goals = score.split('-')
            ds['qj'] = goals[0]
            ds['qs'] = goals[1]
            qj = int(ds['qj'])
            qs = int(ds['qs'])
            # Outcome code: '3' first side scored more, '0' fewer, '1' level.
            if qj > qs:
                ds['kwin'] = '3'
            elif qj < qs:
                ds['kwin'] = '0'
            else:
                ds['kwin'] = '1'
            ds['mplay'] = tfsys.teamIds[res[4]]
            ds['gplay'] = tfsys.teamIds[res[5]]
            ds['mtid'] = res[4]
            ds['gtid'] = res[5]
            ds['tplay'] = res[3]
            ds['tweek'] = round_no  # round number
            ds['tsell'] = round_no
            # Snapshot the reused Series so later games do not alias it.
            rows.append(ds.to_dict())

    if rows:
        df = pd.DataFrame(rows, dtype=object)
    else:
        df = pd.DataFrame(columns=tfsys.gidSgn, dtype=str)

    tfsys.gids = pd.concat([tfsys.gids, df])
    tfsys.gids.drop_duplicates(subset='gid', keep='last', inplace=True)

    if fgExt:
        fb_gid_getExt(df)  # single-threaded
    else:
        fb_gid_getExtPool(df)  # multi-threaded

    if tfsys.gidsFN != '':
        print('+++++')
        print(tfsys.gids.tail())
        tfsys.gids.to_csv(tfsys.gidsFN, index=False, encoding='gb18030')
def get_score_data(htm, keyword):
    """Build a one-row standings-feature frame for the two teams of a match.

    The home and away team ids are read from the ``var h2h_home = ...`` and
    ``var h2h_away = ...`` assignments in ``htm``.  ``keyword`` is a regular
    expression whose first match in ``htm`` is split on ``","`` into one
    record per team; everything before the first ``|`` of a record is
    discarded and the remainder is split on ``|``.  Fields used (index after
    the split): 1 position, 2 team id, 5 rounds played, 15 / 16 the two goal
    totals stored as ``*GS`` / ``*GC``, 17 points.

    Args:
        htm: page source as a string.
        keyword: regular expression locating the standings data.

    Returns:
        ``(scoreDF, MW)``: ``scoreDF`` is a one-row object-dtype DataFrame
        with columns HTGS, ATGS, HTGC, ATGC (divided by the maximum over all
        records), HTP, ATP (points divided by rounds played), HTGD, ATGD,
        DiffPts and DiffLP; ``MW`` is the rounds-played value of the *last*
        record processed.

    Raises:
        IndexError: if a required pattern is not found in ``htm``.
        ValueError: if a record contains no ``|`` or a field is not numeric.
        UnboundLocalError: if the home or the away team id does not occur in
            the records (position and goal difference are only assigned when
            the team is found).
        ZeroDivisionError: if rounds played is 0 for a matched team or every
            record has a goal total of 0.
    """
    cols = [
        'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HTP', 'ATP', 'HTGD', 'ATGD',
        'DiffPts', 'DiffLP'
    ]

    home = re.findall(r"var h2h_home = .*", htm)
    home = zstr.str_flt(home[0], [';', ' ', '\n', '\t', '\r'])
    homeId = home.split('=')[1]
    away = re.findall(r"var h2h_away = .*", htm)
    away = away[0].split('=')[1]
    awayId = zstr.str_flt(away, [';', ' ', '\n', '\t', '\r'])

    HTGS = 0.0
    HTGC = 0.0
    HTP = 0.0
    ATGS = 0.0
    ATGC = 0.0
    ATP = 0.0
    GS_MAX = 0
    GC_MAX = 0

    datas = re.findall(keyword, htm)
    datas = datas[0].split('\",\"')
    for data in datas:
        # Drop the prefix before the first '|'; fields are '|'-separated.
        res = data[data.index('|'):].split('|')
        teamId = res[2]
        MW = float(res[5])  # rounds played
        scored = float(res[15])
        conceded = float(res[16])
        if teamId == homeId:
            HomeTeamLP = float(res[1])
            HTGS = scored
            HTGC = conceded
            HTP = float(res[17]) / MW
            HTGD = (HTGS - HTGC)
        if teamId == awayId:
            AwayTeamLP = float(res[1])
            ATGS = scored
            ATGC = conceded
            ATP = float(res[17]) / MW
            ATGD = (ATGS - ATGC)
        # Track the maxima over all records for scaling.
        if scored > GS_MAX:
            GS_MAX = scored
        if conceded > GC_MAX:
            GC_MAX = conceded

    DiffPts = HTP - ATP
    DiffLP = HomeTeamLP - AwayTeamLP

    score = {
        'HTGS': HTGS / GS_MAX,
        'ATGS': ATGS / GS_MAX,
        'HTGC': HTGC / GC_MAX,
        'ATGC': ATGC / GC_MAX,
        'HTP': HTP,
        'ATP': ATP,
        'HTGD': HTGD,
        'ATGD': ATGD,
        'DiffPts': DiffPts,
        'DiffLP': DiffLP,
    }
    # Built directly: DataFrame.append was removed in pandas 2.0.
    scoreDF = pd.DataFrame([score], columns=cols, dtype=object)
    return scoreDF, MW
def fb_get_features(htm, bars, ftg=''):
    """Derive one feature row for a match and add it to ``tfsys.samples``.

    The game id and the two team ids are read from the ``ScheduleID=``,
    ``var hometeamID=`` and ``var guestteamID=`` assignments in ``htm``.
    Both teams are then looked up by ``team_id`` in ``tfsys.league_sc`` and
    the feature row is assembled from their standings rows.  The row is
    passed through ``HM_one_hot_encoder``, tagged with the game id in a
    ``gid`` column and concatenated onto ``tfsys.samples``.

    Args:
        htm: match page source as a string.
        bars: not used by this function.
        ftg: not used by this function.

    Returns:
        None.  When either team is missing from ``tfsys.league_sc`` the
        function returns early without touching ``tfsys.samples``.

    Raises:
        IndexError: if one of the id assignments is not found in ``htm``.
    """
    cols = [
        'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HTP', 'ATP', 'HM1', 'HM2', 'HM3',
        'HM4', 'HM5', 'AM1', 'AM2', 'AM3', 'AM4', 'AM5', 'HTWinStreak3',
        'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5', 'ATWinStreak3',
        'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5', 'HTGD', 'ATGD',
        'DiffPts', 'DiffFormPts', 'teamPL'
    ]

    game = re.findall(r"var ScheduleID=.*", htm)
    if len(game) == 0:
        game = re.findall(r"ScheduleID=.*", htm)
    game = zstr.str_flt(game[0], [';', ' ', '\n', '\t', '\r'])
    gid = game.split('=')[1]

    home = re.findall(r"var hometeamID=.*", htm)
    home = zstr.str_flt(home[0], [';', ' ', '\n', '\t', '\r'])
    homeId = home.split('=')[1]
    away = re.findall(r"var guestteamID=.*", htm)
    away = away[0].split('=')[1]
    awayId = zstr.str_flt(away, [';', ' ', '\n', '\t', '\r'])

    league_sc = tfsys.league_sc
    home_hits = np.where(league_sc['team_id'] == homeId)[0]
    away_hits = np.where(league_sc['team_id'] == awayId)[0]
    # Both teams are required.  The original tested "and", so a single
    # missing team fell through and raised IndexError on the lookup below.
    if len(home_hits) == 0 or len(away_hits) == 0:
        return

    # np.where yields positions, so they are used with iloc directly; the
    # original converted them to index labels first, which is only
    # equivalent for a default RangeIndex.
    home_row = league_sc.iloc[home_hits[0]]
    away_row = league_sc.iloc[away_hits[0]]

    fea = {}
    fea['HTGS'] = home_row.goal_score
    fea['ATGS'] = away_row.goal_score
    fea['HTGC'] = home_row.goal_conceded
    fea['ATGC'] = away_row.goal_conceded
    fea['HTP'] = home_row.TP / home_row.MW
    # NOTE(review): the away value is divided by the *home* team's MW, as in
    # the original -- confirm whether away_row.MW was intended.
    fea['ATP'] = away_row.TP / home_row.MW
    fea['HM1'] = home_row.M1
    fea['HM2'] = home_row.M2
    fea['HM3'] = home_row.M3
    fea['HM4'] = home_row.M4
    fea['HM5'] = home_row.M5
    fea['AM1'] = away_row.M1
    fea['AM2'] = away_row.M2
    fea['AM3'] = away_row.M3
    fea['AM4'] = away_row.M4
    fea['AM5'] = away_row.M5

    # Last-five results joined, most recent first.
    HTFormPtsStr = (home_row.M1 + home_row.M2 + home_row.M3 + home_row.M4 +
                    home_row.M5)
    ATFormPtsStr = (away_row.M1 + away_row.M2 + away_row.M3 + away_row.M4 +
                    away_row.M5)
    fea['HTWinStreak3'] = get_3game_ws(HTFormPtsStr)
    fea['HTWinStreak5'] = get_5game_ws(HTFormPtsStr)
    fea['HTLossStreak3'] = get_3game_ls(HTFormPtsStr)
    fea['HTLossStreak5'] = get_5game_ls(HTFormPtsStr)
    fea['ATWinStreak3'] = get_3game_ws(ATFormPtsStr)
    fea['ATWinStreak5'] = get_5game_ws(ATFormPtsStr)
    fea['ATLossStreak3'] = get_3game_ls(ATFormPtsStr)
    fea['ATLossStreak5'] = get_5game_ls(ATFormPtsStr)

    fea['HTGD'] = home_row.goal_diff / home_row.MW
    fea['ATGD'] = away_row.goal_diff / away_row.MW
    fea['DiffPts'] = (home_row.TP - away_row.TP) / away_row.MW
    HTFormPts = get_form_points(HTFormPtsStr)
    ATFormPts = get_form_points(ATFormPtsStr)
    DiffFormPts = HTFormPts - ATFormPts
    fea['DiffFormPts'] = DiffFormPts / away_row.MW
    fea['teamPL'] = home_row.teamPL - away_row.teamPL

    # Built directly and concatenated: DataFrame.append was removed in
    # pandas 2.0.
    features = pd.DataFrame([fea], columns=cols, dtype=object)
    features = HM_one_hot_encoder(features)
    features['gid'] = gid
    tfsys.samples = pd.concat([tfsys.samples, features])