def scrape_league_history(country):
    """Re-scrape every archived season of the league mapped to ``country``.

    The stored league of that name is deleted first, then the browser is
    driven to the league's 'Archive' page. A fresh ``League`` is saved and
    each archived season is passed to ``scrape_table`` and
    ``scrape_results`` in reverse page order.
    """
    league_name = countries_leagues[country]
    db.delete_league_by_name(league_name)

    # Navigate: main page -> expand the country list -> league -> archive.
    browser.get(main_url)
    show_more = browser.find_element_by_class_name('show-more')
    execute_script_click(show_more.find_element_by_link_text('More'))
    click_league(country, league_name)
    browser.find_element_by_link_text('Archive').click()

    # The first two matching elements are skipped
    # (NOTE(review): presumably header / current-season rows - confirm).
    season_cells = browser.find_elements_by_class_name('leagueTable__season')[2:]
    season_anchors = [cell.find_element_by_tag_name('a') for cell in season_cells]
    season_anchors.reverse()

    league = League(name=league_name, country=country)
    db.save_league(league)

    # Read all names and links up front, before the scrapers navigate away.
    seasons = [
        Season(name=get_years_from_season_name(anchor.text), league=league)
        for anchor in season_anchors
    ]
    links = [anchor.get_attribute('href') for anchor in season_anchors]

    for season, link in zip(seasons, links):
        scrape_table(link, league, season)
        scrape_results(link, league, season)
def parseLeague(self):
    """Build the league from the LEAGUE sheet and store it on the loader.

    Row 0 is a header whose columns from index 3 onward name the uniformity
    groups. Every following row holds: division id, pool, team name, then one
    integer count per uniformity group. Teams are grouped by division; a
    division with one pool becomes a ``Division``, one with two pools becomes
    a ``DivisionHalf``.

    Side effects: resets ``self.teamSrv``, sets ``rank``, ``unifCountD`` and
    ``division`` on each team, sets ``self.league`` and calls
    ``self.loadLeague``.

    Raises:
        Exception: if a division contains more than two distinct pools.
    """
    sh = self.wb.sheet_by_name(LEAGUE)
    self.teamSrv = ObjSrv()

    header = self.getRow(sh, 0, True)
    unifGrpIdL = header[3:]
    print(unifGrpIdL)

    divD = {}
    for i in range(1, sh.nrows):
        valL = self.getRow(sh, i, True)
        # Numeric division ids are normalised to int; anything else is
        # kept as text.
        try:
            div = int(valL[0])
        except ValueError:
            div = str(valL[0])
        pool = valL[1]
        team = self.parseTeam(valL[2])
        team.rank = i  # rank is the 1-based row position in the sheet
        countL = [int(e) for e in valL[3:]]
        team.unifCountD = dict(zip(unifGrpIdL, countL))
        divD.setdefault(div, []).append((pool, team))

    divL = []
    for divId, valL in iteritems(divD):
        teamL = tuple(team for _pool, team in valL)

        # Distinct pools in order of first appearance. Unpacking a set here
        # would make the assignment of pools to the two halves depend on
        # arbitrary set iteration order.
        poolL = []
        for pool, _team in valL:
            if pool not in poolL:
                poolL.append(pool)

        divName = 'division-%s' % str(divId)
        if len(poolL) == 1:
            division = Division(divName, teamL)
        elif len(poolL) == 2:
            pool1, pool2 = poolL
            teamL1 = [team for pool, team in valL if pool == pool1]
            teamL2 = [team for pool, team in valL if pool == pool2]
            division = DivisionHalf(divName, teamL1, teamL2)
        else:
            raise Exception(
                "Can't have more than two pools in the same division (%s)."
                % (', '.join(map(str, poolL))))

        for team in teamL:
            team.division = division
        divL.append(division)

    self.league = League(divL)
    self.loadLeague(self.league)
def parseLeague(self):
    """Parse the LEAGUE sheet into divisions and register the league.

    The header row names the uniformity groups from column 3 onward. Each
    data row gives a division id, a pool, a team name and one integer count
    per uniformity group. A division with a single pool is built as a
    ``Division``; with two pools it is built as a ``DivisionHalf``.

    Side effects: resets ``self.teamSrv``, annotates each team with ``rank``,
    ``unifCountD`` and ``division``, sets ``self.league`` and calls
    ``self.loadLeague``.

    Raises:
        Exception: if a division contains more than two distinct pools.
    """
    sh = self.wb.sheet_by_name(LEAGUE)
    self.teamSrv = ObjSrv()

    header = self.getRow(sh, 0, True)
    unifGrpIdL = header[3:]
    print(unifGrpIdL)

    divD = {}
    for i in range(1, sh.nrows):
        valL = self.getRow(sh, i, True)
        try:
            div = int(valL[0])
        except ValueError:
            # Non-numeric division ids are kept as text.
            div = str(valL[0])
        pool = valL[1]
        team = self.parseTeam(valL[2])
        team.rank = i  # 1-based row position in the sheet
        countL = [int(e) for e in valL[3:]]
        team.unifCountD = dict(zip(unifGrpIdL, countL))
        divD.setdefault(div, []).append((pool, team))

    divL = []
    for divId, valL in iteritems(divD):
        teamL = tuple(team for _pool, team in valL)

        # Keep pools in sheet order so that the two halves of a
        # DivisionHalf are assigned deterministically; the order of a set
        # is arbitrary.
        poolL = []
        for pool, _team in valL:
            if pool not in poolL:
                poolL.append(pool)

        divName = 'division-%s' % str(divId)
        if len(poolL) == 1:
            division = Division(divName, teamL)
        elif len(poolL) == 2:
            pool1, pool2 = poolL
            teamL1 = [team for pool, team in valL if pool == pool1]
            teamL2 = [team for pool, team in valL if pool == pool2]
            division = DivisionHalf(divName, teamL1, teamL2)
        else:
            raise Exception(
                "Can't have more than two pools in the same division (%s)."
                % (', '.join(map(str, poolL))))

        for team in teamL:
            team.division = division
        divL.append(division)

    self.league = League(divL)
    self.loadLeague(self.league)
def _fixture_columns(header):
    """Return the data-row column indices for a known fixture header row.

    The result is the tuple
    ``(round, date, home, home_xg, score, away_xg, away)``; ``round`` and
    the two xG entries are ``None`` when the layout has no such column.

    Raises:
        RuntimeError: if ``header`` matches none of the known layouts.
    """
    tail = ['Attendance', 'Venue', 'Referee', 'Match Report', 'Notes']
    layouts = [
        (['Wk', 'Day', 'Date', 'Time', 'Home', 'Score', 'Away'],
         (None, 2, 4, None, 5, None, 6)),
        # A leading 'Round' column without 'Wk' is read like the plain
        # layout: its round value is not used.
        (['Round', 'Day', 'Date', 'Time', 'Home', 'Score', 'Away'],
         (None, 2, 4, None, 5, None, 6)),
        (['Wk', 'Day', 'Date', 'Time', 'Home', 'xG', 'Score', 'xG', 'Away'],
         (None, 2, 4, 5, 6, 7, 8)),
        (['Round', 'Wk', 'Day', 'Date', 'Time', 'Home', 'Score', 'Away'],
         (0, 3, 5, None, 6, None, 7)),
        (['Round', 'Wk', 'Day', 'Date', 'Time', 'Home', 'xG', 'Score', 'xG',
          'Away'],
         (0, 3, 5, 6, 7, 8, 9)),
    ]
    for lead, columns in layouts:
        if header == lead + tail:
            return columns
    raise RuntimeError(
        "Unable to parse table with header row: {}".format(header))


def parseSeason(teamlist: TeamList,
                gamelist: GameList,
                prev_leagues,
                filename,
                leaguename,
                leagueyear,
                initial=False,
                is_lg=True):
    """Read one season's fixture CSV and add its teams and games.

    Args:
        teamlist: receives every home/away team via ``getOrAdd``.
        gamelist: receives one ``Game`` per row with a valid date.
        prev_leagues: passed to ``season_map.default_quality`` when
            ``initial`` is false.
        filename: path of the CSV file; its first row must be one of the
            known header layouts.
        leaguename: league name; ``"champs"`` triggers stripping of the
            last word of the home name and the first word of the away name.
        leagueyear: year passed to ``League``.
        initial: when true, new teams get ``(PPG, PPG)`` as default.
        is_lg: forwarded to ``getOrAdd``; when true, rows whose round is
            set and is not ``"Regular Season"`` are filed under
            ``EXTRA_LEAGUE``.

    Returns:
        The tuple ``(teamlist, league, gamelist)``.

    Raises:
        RuntimeError: if the header row is not a known layout.
    """
    league = League(leaguename, leagueyear)
    if initial:
        default = (PPG, PPG)
    else:
        default = season_map.default_quality(league, prev_leagues)

    # newline='' is what the csv module expects of files handed to it.
    with open(filename, newline='') as games_in:
        game_rdr = csv.reader(games_in)
        (round_col, date_col, home_col, homexg_col,
         score_col, awayxg_col, away_col) = _fixture_columns(next(game_rdr))

        for row in game_rdr:
            # away_col is the highest index read; blank or truncated rows
            # cannot describe a game, so skip them instead of raising.
            if len(row) <= away_col:
                continue

            gamedate = row[date_col]
            homename = row[home_col]
            score = row[score_col]
            awayname = row[away_col]
            rg_season = row[round_col] if round_col is not None else None
            homexg = row[homexg_col] if homexg_col is not None else None
            awayxg = row[awayxg_col] if awayxg_col is not None else None

            if leaguename == "champs":
                homename = " ".join(homename.split(" ")[:-1])
                awayname = " ".join(awayname.split(" ")[1:])

            # Rows without a parseable date are skipped.
            try:
                date = datetime.datetime.strptime(gamedate, "%Y-%m-%d").date()
            except ValueError:
                continue

            hometeam = teamlist.getOrAdd(homename, default, is_lg)
            awayteam = teamlist.getOrAdd(awayname, default, is_lg)

            # Scores are "home–away" separated by an en dash; anything
            # else is treated as a game without a score.
            try:
                homescore, awayscore = score.split('–')
                homescore, awayscore = int(homescore), int(awayscore)
            except ValueError:
                homescore = awayscore = None

            home_xg = away_xg = None
            if homexg and awayxg:
                try:
                    home_xg, away_xg = float(homexg), float(awayxg)
                except ValueError:
                    # Reset both so a bad cell never leaks the previous
                    # row's xG values into this game.
                    home_xg = away_xg = None
                    print("Error in row with xg values", "/n", row)

            if is_lg and rg_season and rg_season != "Regular Season":
                lg = EXTRA_LEAGUE
            else:
                lg = league
            game = Game(date, lg, hometeam, awayteam)
            if homescore is not None:
                game.score(homescore, awayscore, home_xg, away_xg)

            gamelist.add(game)

    return teamlist, league, gamelist
class ConfigLoader:
    """Read a scheduling configuration out of an Excel workbook.

    Constructing the loader opens the workbook with xlrd and parses its
    sheets in a fixed order; ``getConfig`` then bundles the parsed pieces
    into a ``Config``.
    """

    def __init__(self, fileName):
        """Open ``fileName`` and parse every sheet.

        The call order matters: ``loadGeneralConf`` sets
        ``self.matchDuration`` used by ``getFieldL``, and ``parseLeague``
        creates ``self.teamSrv`` / ``self.divSrv`` used by the later steps.
        """
        # Maps a column type to the callable that parses one cell value.
        self.parserD = {
            DATE: self.parseDate,
            TIME: parseTime,
            FNAME: identity,
            STADIUM: identity,
            'action': identity,
            TIME_MAX: parseTime,
            TIME_MIN: parseTime,
            TEAM: self.parseTeam,
        }
        self.dateSrv = ObjSrv()
        self.wb = xlrd.open_workbook(fileName)
        self.loadGeneralConf()
        self.parseLeague()
        self.getGroups()
        self.getFieldL()
        self.getRestriction()
        self.getTeamGrpD()
        self.parseUniformity()

    def parseDate(self, date):
        """Return the ``Date`` registered in ``self.dateSrv`` for ``date``.

        ``date`` is either a ``datetime.date`` or a ``'month/day/year'``
        string.
        """
        if isinstance(date, datetime.date):
            d = Date(date.year, date.month, date.day)
        else:
            month, day, year = date.split('/', 2)
            d = Date(int(year), int(month), int(day))
        return self.dateSrv[d]

    def parseTeam(self, teamName):
        """Return the entry of ``self.teamSrv`` for a team name or ``Team``."""
        if isinstance(teamName, Team):
            team = teamName
        else:
            team = Team(teamName)
        return self.teamSrv[team]

    def loadLeague(self, league):
        """Rebuild ``self.divSrv`` and ``self.teamSet`` from ``league``."""
        self.divSrv = ObjSrv()
        self.teamSet = set()
        for division in league.divisionL:
            division = self.divSrv[division]
            for team in division.teamL:
                self.teamSet.add(team)

    def xl2py(self, val, fmt, strip=False):
        """Convert one xlrd cell value to a Python value.

        Date cells become a ``date`` when the time part is zero, a ``time``
        when the date part is zero, and a ``datetime`` otherwise. Numbers
        and text are returned as-is (text stripped when ``strip`` is true)
        and empty cells become ``None``.

        Raises:
            Exception: for any other cell type.
        """
        if fmt == xlrd.XL_CELL_DATE:
            dateTuple = xlrd.xldate_as_tuple(val, self.wb.datemode)
            if dateTuple[3:] == (0, 0, 0):
                return datetime.date(*dateTuple[:3])
            elif dateTuple[:3] == (0, 0, 0):
                return datetime.time(*dateTuple[3:])
            else:
                return datetime.datetime(*dateTuple)
        elif fmt == xlrd.XL_CELL_NUMBER:
            return val
        elif fmt == xlrd.XL_CELL_TEXT:
            if strip:
                val = val.strip()
            return val
        elif fmt == xlrd.XL_CELL_EMPTY:
            return None
        else:
            raise Exception('Unknown format %d' % fmt)

    def getRow(self, sh, idx, strip=False):
        """Return row ``idx`` of sheet ``sh`` converted with ``xl2py``."""
        return [
            self.xl2py(val, fmt, strip)
            for val, fmt in zip(sh.row_values(idx), sh.row_types(idx))
        ]

    def getGroups(self):
        """Build ``self.groupD``, mapping a group name to ``(type, items)``.

        One TEAM group named ``'<division name>'`` is created per division;
        the rest come from the 'groupes' sheet (type, name, then the items).

        Raises:
            ConfigException: for an unknown type or a malformed group name.
        """
        sh = self.wb.sheet_by_name(u'groupes')
        groupD = {}
        for division in self.divSrv.objD.values():
            groupD['<%s>' % division.name] = (TEAM, division.teamL)
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            type_ = valL[0]
            name = valL[1]
            # NOTE(review): typeL is not defined in this method; presumably
            # a module-level list of valid types - confirm.
            if type_ not in typeL:
                raise ConfigException(
                    "%s n'est pas un type valide. Les valeurs possibles sont : %s."
                    % (type_, ', '.join(typeL)))
            if not isGroupName(name):
                raise ConfigException(
                    "Un nom de groupe doit commencer par '<' et terminer par '>', ce qui n'est pas le cas de :%s"
                    % name)
            groupD[name] = (type_, [val for val in valL[2:] if val is not None])
        self.groupD = groupD

    def expand(self, field, type_):
        """Expand one cell into a list of parsed values of type ``type_``.

        ``None`` stays ``None``. A known group name is replaced by the
        group's items, a comma-separated string is split, and anything else
        is a single value; each value then goes through
        ``self.parserD[type_]``.

        Raises:
            Exception: if a group of another type is used.
        """
        if field is None:
            return None
        # NOTE(review): `unicode` only exists on Python 2 unless the module
        # provides a compatibility alias - confirm.
        if not isinstance(field, (str, unicode)):
            valL = [field]
        else:
            if isGroupName(field) and field in self.groupD:
                (type__, valL) = self.groupD[field]
                if type__ != type_:
                    raise Exception(
                        "Vous avez utilisé un groupe de type %s dans un champ de type %s"
                        % (type__, type_))
            elif ',' in field:
                valL = [val.strip() for val in field.split(',')]
            else:
                valL = [field]
        return [self.parserD[type_](val) for val in valL]

    def expandAll(self, valL, type2idx, typeL):
        """Expand the cells of ``valL`` for each type in ``typeL``, in order."""
        return [self.expand(valL[type2idx[type_]], type_) for type_ in typeL]

    def getRestriction(self):
        """Build ``self.restrD``: ``(team, date)`` -> list of ``Restriction``.

        One restriction is created per (team, date) pair of every row of
        the RESTR sheet; a row without a date applies to every date in
        ``self.dateSrv``. ``lock`` is set on the team and date services
        before any row is read.
        """
        sh = self.wb.sheet_by_name(RESTR)
        typeL, type2idx = self.getType(sh)
        restrD = {}
        self.teamSrv.lock = True
        self.dateSrv.lock = True
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            include = signD[str(valL[0]).strip()]
            value = valL[1]
            teamL, dateL = self.expandAll(valL, type2idx, (TEAM, DATE))
            timeMinL, timeMaxL, stadiumL, fNameL = self.expandAll(
                valL, type2idx, (TIME_MIN, TIME_MAX, STADIUM, FNAME))
            if dateL is None:
                dateL = self.dateSrv.objD.values()
            for team in teamL:
                for date in dateL:
                    restr = Restriction(include, value, team, date)
                    # NOTE(review): expand() already ran these values
                    # through parseTime, so it is applied twice here -
                    # confirm parseTime accepts its own output.
                    if timeMinL is not None:
                        restr.timeMin = parseTime(timeMinL[0])
                    if timeMaxL is not None:
                        restr.timeMax = parseTime(timeMaxL[0])
                    if stadiumL is not None:
                        restr.stadiumS = set(stadiumL)
                    if fNameL is not None:
                        restr.fNameS = set(fNameL)
                    restrD.setdefault((team, date), []).append(restr)
        self.restrD = restrD

    def getType(self, sh):
        """Return the header of ``sh`` as strings and a name -> index map."""
        typeL = [str(type_) for type_ in self.getRow(sh, 0, True)]
        type2idx = dict(zip(typeL, range(len(typeL))))
        return typeL, type2idx

    def getFieldL(self):
        """Build ``self.fieldL`` from the 'terrains' sheet.

        Each row holds an action followed by one cell per header type. The
        cells are expanded and combined with a cartesian product; '+' adds
        the combinations to the field set and '-' removes them. The result
        is sorted and turned into ``Field`` objects.
        """
        sh = self.wb.sheet_by_name(u'terrains')
        typeL, _type2idx = self.getType(sh)
        fieldSet = set()
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            action = str(valL[0]).strip()
            fieldPropLL = [
                self.expand(val, type_)
                for val, type_ in zip(valL[1:], typeL[1:])
            ]
            subSet = set(itertools.product(*fieldPropLL))
            if action == '+':
                fieldSet.update(subSet)
            elif action == '-':
                fieldSet.difference_update(subSet)
        # The unused list of distinct dates formerly computed here was
        # dropped: it raised IndexError when no field was left.
        self.fieldL = [
            Field(stadium, fName, time, date, self.matchDuration)
            for stadium, date, time, fName in sorted(fieldSet)
        ]

    def getTeamGrpD(self):
        """Build ``self.teamGrpD``: group name -> ``(penalty, teams)``.

        Teams not present in ``self.teamSet`` are reported on stdout and
        left out.
        """
        sh = self.wb.sheet_by_name(TEAM_GROUP)
        teamGrpD = {}
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            name = valL[0]
            pen = valL[1]
            teamL = []
            for teamName in valL[2:]:
                if teamName is None:
                    continue
                team = self.parseTeam(teamName)
                if team in self.teamSet:
                    teamL.append(team)
                else:
                    print('Unknown team : %s' % str(team))
            teamGrpD[name] = (pen, teamL)
        self.teamGrpD = teamGrpD

    def parseLeague(self):
        """Build ``self.league`` from the LEAGUE sheet.

        The header names the uniformity groups from column 3 onward; each
        row gives division id, pool, team name and one integer count per
        uniformity group. A division with one pool becomes a ``Division``,
        one with two pools a ``DivisionHalf``.

        Raises:
            Exception: if a division contains more than two pools.
        """
        sh = self.wb.sheet_by_name(LEAGUE)
        self.teamSrv = ObjSrv()
        header = self.getRow(sh, 0, True)
        unifGrpIdL = header[3:]
        print(unifGrpIdL)

        divD = {}
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            try:
                div = int(valL[0])
            except ValueError:
                # Non-numeric division ids are kept as text.
                div = str(valL[0])
            pool = valL[1]
            team = self.parseTeam(valL[2])
            team.rank = i  # 1-based row position in the sheet
            countL = [int(e) for e in valL[3:]]
            team.unifCountD = dict(zip(unifGrpIdL, countL))
            divD.setdefault(div, []).append((pool, team))

        divL = []
        for divId, valL in iteritems(divD):
            teamL = tuple(team for _pool, team in valL)
            # Pools in order of first appearance: unpacking a set would
            # assign the two halves in arbitrary order.
            poolL = []
            for pool, _team in valL:
                if pool not in poolL:
                    poolL.append(pool)

            divName = 'division-%s' % str(divId)
            if len(poolL) == 1:
                division = Division(divName, teamL)
            elif len(poolL) == 2:
                pool1, pool2 = poolL
                teamL1 = [team for pool, team in valL if pool == pool1]
                teamL2 = [team for pool, team in valL if pool == pool2]
                division = DivisionHalf(divName, teamL1, teamL2)
            else:
                raise Exception(
                    "Can't have more than two pools in the same division (%s)."
                    % (', '.join(map(str, poolL))))
            for team in teamL:
                team.division = division
            divL.append(division)

        self.league = League(divL)
        self.loadLeague(self.league)

    def parseUniformity(self):
        """Build ``self.unifL``: a list of ``(key, penalty, Restriction)``.

        Each row of the UNIF sheet holds key, penalty, max count, time
        range, stadium and field; the stored penalty is
        ``pen / 2**maxCount``. A warning is printed for every uniformity
        id used by a team that has no row in the sheet.
        """
        sh = self.wb.sheet_by_name(UNIF)
        unifL = []
        for i in range(1, sh.nrows):
            restr = Restriction(False, 1, None, None)
            key, pen, maxCount, timeMin, timeMax, stadium, field = self.getRow(
                sh, i, True)
            if stadium is not None:
                restr.stadiumS = set([stadium])
            if field is not None:
                restr.fNameS = set([field])
            restr.timeMin = parseTime(timeMin)
            restr.timeMax = parseTime(timeMax)
            unifL.append((key, float(pen) / 2**maxCount, restr))

        # Built directly from the rows so a header-only sheet gives an
        # empty set instead of an IndexError.
        keySet = set(key for key, _pen, _restr in unifL)
        # str() keeps the warning from failing on numeric keys.
        knownKeys = ', '.join(sorted(str(key) for key in keySet))
        for team in self.league.getTeamL():
            if hasattr(team, 'unifCountD'):
                for id_ in team.unifCountD.keys():
                    if id_ not in keySet:
                        print("Warning : %s not in [%s]" % (id_, knownKeys))
        self.unifL = unifL

    def loadGeneralConf(self):
        """Read the key/value rows of CONF_SHEET into loader attributes.

        Sets ``self.matchDuration``, ``self.nbMatch`` and
        ``self.penDateFactor``; sheet keys are translated through
        ``confKeyMap``.
        """
        sh = self.wb.sheet_by_name(CONF_SHEET)
        confD = {}
        for i in range(sh.nrows):
            key, val = self.getRow(sh, i, True)
            confD[confKeyMap[key]] = val
        self.matchDuration = parseTime(confD['matchDuration'])
        self.nbMatch = int(confD['nbMatch'])
        self.penDateFactor = confD['penDateFactor']

    def getConfig(self):
        """Return a ``Config`` built from everything parsed so far."""
        config = Config(self.league, self.fieldL, self.restrD, self.teamGrpD,
                        self.unifL, self.matchDuration, self.nbMatch,
                        self.penDateFactor)
        return config
class ConfigLoader:
    """Load a scheduling configuration from an Excel workbook.

    The constructor opens the workbook with xlrd and parses its sheets in a
    fixed order; ``getConfig`` packs the parsed data into a ``Config``.
    """

    def __init__(self, fileName):
        """Open ``fileName`` and parse every sheet.

        The order of the calls matters: ``loadGeneralConf`` provides
        ``self.matchDuration`` for ``getFieldL``, and ``parseLeague``
        creates ``self.teamSrv`` / ``self.divSrv`` for the later steps.
        """
        # Column type -> callable that parses a single cell value.
        self.parserD = {
            DATE: self.parseDate,
            TIME: parseTime,
            FNAME: identity,
            STADIUM: identity,
            'action': identity,
            TIME_MAX: parseTime,
            TIME_MIN: parseTime,
            TEAM: self.parseTeam,
        }
        self.dateSrv = ObjSrv()
        self.wb = xlrd.open_workbook(fileName)
        self.loadGeneralConf()
        self.parseLeague()
        self.getGroups()
        self.getFieldL()
        self.getRestriction()
        self.getTeamGrpD()
        self.parseUniformity()

    def parseDate(self, date):
        """Return the ``Date`` held by ``self.dateSrv`` for ``date``.

        ``date`` may be a ``datetime.date`` or a ``'month/day/year'``
        string.
        """
        if isinstance(date, datetime.date):
            d = Date(date.year, date.month, date.day)
        else:
            month, day, year = date.split('/', 2)
            d = Date(int(year), int(month), int(day))
        return self.dateSrv[d]

    def parseTeam(self, teamName):
        """Return the ``self.teamSrv`` entry for a team name or ``Team``."""
        if isinstance(teamName, Team):
            team = teamName
        else:
            team = Team(teamName)
        return self.teamSrv[team]

    def loadLeague(self, league):
        """Reset ``self.divSrv`` and ``self.teamSet`` and fill them from ``league``."""
        self.divSrv = ObjSrv()
        self.teamSet = set()
        for division in league.divisionL:
            division = self.divSrv[division]
            for team in division.teamL:
                self.teamSet.add(team)

    def xl2py(self, val, fmt, strip=False):
        """Translate an xlrd cell value into a Python value.

        A date cell yields a ``date`` if its time part is zero, a ``time``
        if its date part is zero, otherwise a ``datetime``. Numbers and
        text pass through (text is stripped when ``strip`` is true); an
        empty cell yields ``None``.

        Raises:
            Exception: for any other cell type.
        """
        if fmt == xlrd.XL_CELL_DATE:
            dateTuple = xlrd.xldate_as_tuple(val, self.wb.datemode)
            if dateTuple[3:] == (0, 0, 0):
                return datetime.date(*dateTuple[:3])
            elif dateTuple[:3] == (0, 0, 0):
                return datetime.time(*dateTuple[3:])
            else:
                return datetime.datetime(*dateTuple)
        elif fmt == xlrd.XL_CELL_NUMBER:
            return val
        elif fmt == xlrd.XL_CELL_TEXT:
            if strip:
                val = val.strip()
            return val
        elif fmt == xlrd.XL_CELL_EMPTY:
            return None
        else:
            raise Exception('Unknown format %d' % fmt)

    def getRow(self, sh, idx, strip=False):
        """Return the cells of row ``idx`` of ``sh`` converted by ``xl2py``."""
        return [
            self.xl2py(val, fmt, strip)
            for val, fmt in zip(sh.row_values(idx), sh.row_types(idx))
        ]

    def getGroups(self):
        """Fill ``self.groupD`` with ``name -> (type, items)`` entries.

        Every division contributes a TEAM group called
        ``'<division name>'``; further groups are read from the 'groupes'
        sheet, whose rows hold a type, a name and then the items.

        Raises:
            ConfigException: for an unknown type or a malformed group name.
        """
        sh = self.wb.sheet_by_name(u'groupes')
        groupD = {}
        for division in self.divSrv.objD.values():
            groupD['<%s>' % division.name] = (TEAM, division.teamL)
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            type_ = valL[0]
            name = valL[1]
            # NOTE(review): typeL is not bound inside this method;
            # presumably a module-level list of valid types - confirm.
            if type_ not in typeL:
                raise ConfigException(
                    "%s n'est pas un type valide. Les valeurs possibles sont : %s."
                    % (type_, ', '.join(typeL)))
            if not isGroupName(name):
                raise ConfigException(
                    "Un nom de groupe doit commencer par '<' et terminer par '>', ce qui n'est pas le cas de :%s"
                    % name)
            groupD[name] = (type_, [val for val in valL[2:] if val is not None])
        self.groupD = groupD

    def expand(self, field, type_):
        """Turn one cell into a list of values parsed as ``type_``.

        ``None`` is returned unchanged. A known group name expands to the
        group's items, a comma-separated string is split, and any other
        value stands alone; every value is then passed to
        ``self.parserD[type_]``.

        Raises:
            Exception: if the group has a different type than ``type_``.
        """
        if field is None:
            return None
        # NOTE(review): `unicode` is a Python 2 builtin; on Python 3 it
        # must come from a compatibility alias elsewhere - confirm.
        if not isinstance(field, (str, unicode)):
            valL = [field]
        else:
            if isGroupName(field) and field in self.groupD:
                (type__, valL) = self.groupD[field]
                if type__ != type_:
                    raise Exception(
                        "Vous avez utilisé un groupe de type %s dans un champ de type %s"
                        % (type__, type_))
            elif ',' in field:
                valL = [val.strip() for val in field.split(',')]
            else:
                valL = [field]
        return [self.parserD[type_](val) for val in valL]

    def expandAll(self, valL, type2idx, typeL):
        """Return ``expand`` applied to the cell of each type in ``typeL``."""
        return [self.expand(valL[type2idx[type_]], type_) for type_ in typeL]

    def getRestriction(self):
        """Fill ``self.restrD`` with ``(team, date) -> [Restriction, ...]``.

        Every row of the RESTR sheet produces one restriction per
        (team, date) pair; when the date cell is empty, all dates of
        ``self.dateSrv`` are used. ``lock`` is set on the team and date
        services before the rows are read.
        """
        sh = self.wb.sheet_by_name(RESTR)
        typeL, type2idx = self.getType(sh)
        restrD = {}
        self.teamSrv.lock = True
        self.dateSrv.lock = True
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            include = signD[str(valL[0]).strip()]
            value = valL[1]
            teamL, dateL = self.expandAll(valL, type2idx, (TEAM, DATE))
            timeMinL, timeMaxL, stadiumL, fNameL = self.expandAll(
                valL, type2idx, (TIME_MIN, TIME_MAX, STADIUM, FNAME))
            if dateL is None:
                dateL = self.dateSrv.objD.values()
            for team in teamL:
                for date in dateL:
                    restr = Restriction(include, value, team, date)
                    # NOTE(review): these values already went through
                    # parseTime inside expand(); confirm that parseTime
                    # accepts its own output.
                    if timeMinL is not None:
                        restr.timeMin = parseTime(timeMinL[0])
                    if timeMaxL is not None:
                        restr.timeMax = parseTime(timeMaxL[0])
                    if stadiumL is not None:
                        restr.stadiumS = set(stadiumL)
                    if fNameL is not None:
                        restr.fNameS = set(fNameL)
                    restrD.setdefault((team, date), []).append(restr)
        self.restrD = restrD

    def getType(self, sh):
        """Return the header row of ``sh`` as strings plus a name -> index dict."""
        typeL = [str(type_) for type_ in self.getRow(sh, 0, True)]
        type2idx = dict(zip(typeL, range(len(typeL))))
        return typeL, type2idx

    def getFieldL(self):
        """Fill ``self.fieldL`` from the 'terrains' sheet.

        A row is an action followed by one cell per header type. The
        expanded cells are combined by cartesian product; action '+' adds
        the combinations to the set of fields, '-' removes them. The sorted
        result is converted to ``Field`` objects.
        """
        sh = self.wb.sheet_by_name(u'terrains')
        typeL, _type2idx = self.getType(sh)
        fieldSet = set()
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            action = str(valL[0]).strip()
            fieldPropLL = [
                self.expand(val, type_)
                for val, type_ in zip(valL[1:], typeL[1:])
            ]
            subSet = set(itertools.product(*fieldPropLL))
            if action == '+':
                fieldSet.update(subSet)
            elif action == '-':
                fieldSet.difference_update(subSet)
        # An unused list of distinct dates used to be computed here; it was
        # removed because it raised IndexError when no field was left.
        self.fieldL = [
            Field(stadium, fName, time, date, self.matchDuration)
            for stadium, date, time, fName in sorted(fieldSet)
        ]

    def getTeamGrpD(self):
        """Fill ``self.teamGrpD`` with ``name -> (penalty, teams)``.

        A team that is not in ``self.teamSet`` is reported on stdout and
        omitted from its group.
        """
        sh = self.wb.sheet_by_name(TEAM_GROUP)
        teamGrpD = {}
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            name = valL[0]
            pen = valL[1]
            teamL = []
            for teamName in valL[2:]:
                if teamName is None:
                    continue
                team = self.parseTeam(teamName)
                if team in self.teamSet:
                    teamL.append(team)
                else:
                    print('Unknown team : %s' % str(team))
            teamGrpD[name] = (pen, teamL)
        self.teamGrpD = teamGrpD

    def parseLeague(self):
        """Create ``self.league`` from the LEAGUE sheet.

        Header columns from index 3 onward name the uniformity groups; a
        data row holds division id, pool, team name and one integer count
        per uniformity group. One pool in a division gives a ``Division``,
        two pools give a ``DivisionHalf``.

        Raises:
            Exception: if a division contains more than two pools.
        """
        sh = self.wb.sheet_by_name(LEAGUE)
        self.teamSrv = ObjSrv()
        header = self.getRow(sh, 0, True)
        unifGrpIdL = header[3:]
        print(unifGrpIdL)

        divD = {}
        for i in range(1, sh.nrows):
            valL = self.getRow(sh, i, True)
            try:
                div = int(valL[0])
            except ValueError:
                # Division ids that are not numeric stay as text.
                div = str(valL[0])
            pool = valL[1]
            team = self.parseTeam(valL[2])
            team.rank = i  # 1-based row position in the sheet
            countL = [int(e) for e in valL[3:]]
            team.unifCountD = dict(zip(unifGrpIdL, countL))
            divD.setdefault(div, []).append((pool, team))

        divL = []
        for divId, valL in iteritems(divD):
            teamL = tuple(team for _pool, team in valL)
            # Sheet order of the pools is used so the two halves are
            # assigned deterministically; set order is arbitrary.
            poolL = []
            for pool, _team in valL:
                if pool not in poolL:
                    poolL.append(pool)

            divName = 'division-%s' % str(divId)
            if len(poolL) == 1:
                division = Division(divName, teamL)
            elif len(poolL) == 2:
                pool1, pool2 = poolL
                teamL1 = [team for pool, team in valL if pool == pool1]
                teamL2 = [team for pool, team in valL if pool == pool2]
                division = DivisionHalf(divName, teamL1, teamL2)
            else:
                raise Exception(
                    "Can't have more than two pools in the same division (%s)."
                    % (', '.join(map(str, poolL))))
            for team in teamL:
                team.division = division
            divL.append(division)

        self.league = League(divL)
        self.loadLeague(self.league)

    def parseUniformity(self):
        """Fill ``self.unifL`` with ``(key, penalty, Restriction)`` tuples.

        A row of the UNIF sheet holds key, penalty, max count, time range,
        stadium and field; the stored penalty is ``pen / 2**maxCount``.
        A warning is printed for each uniformity id used by a team but
        missing from the sheet.
        """
        sh = self.wb.sheet_by_name(UNIF)
        unifL = []
        for i in range(1, sh.nrows):
            restr = Restriction(False, 1, None, None)
            key, pen, maxCount, timeMin, timeMax, stadium, field = self.getRow(
                sh, i, True)
            if stadium is not None:
                restr.stadiumS = set([stadium])
            if field is not None:
                restr.fNameS = set([field])
            restr.timeMin = parseTime(timeMin)
            restr.timeMax = parseTime(timeMax)
            unifL.append((key, float(pen) / 2**maxCount, restr))

        # Collected row by row so that a sheet with only a header yields an
        # empty set rather than an IndexError.
        keySet = set(key for key, _pen, _restr in unifL)
        # Keys are stringified so numeric ones cannot break the join.
        knownKeys = ', '.join(sorted(str(key) for key in keySet))
        for team in self.league.getTeamL():
            if hasattr(team, 'unifCountD'):
                for id_ in team.unifCountD.keys():
                    if id_ not in keySet:
                        print("Warning : %s not in [%s]" % (id_, knownKeys))
        self.unifL = unifL

    def loadGeneralConf(self):
        """Load the key/value rows of CONF_SHEET.

        Keys are mapped through ``confKeyMap``; the method sets
        ``self.matchDuration``, ``self.nbMatch`` and ``self.penDateFactor``.
        """
        sh = self.wb.sheet_by_name(CONF_SHEET)
        confD = {}
        for i in range(sh.nrows):
            key, val = self.getRow(sh, i, True)
            confD[confKeyMap[key]] = val
        self.matchDuration = parseTime(confD['matchDuration'])
        self.nbMatch = int(confD['nbMatch'])
        self.penDateFactor = confD['penDateFactor']

    def getConfig(self):
        """Return a ``Config`` assembled from the parsed attributes."""
        config = Config(self.league, self.fieldL, self.restrD, self.teamGrpD,
                        self.unifL, self.matchDuration, self.nbMatch,
                        self.penDateFactor)
        return config