def get_seasons(tournament_id, overwrite=False):
    """Scrape a tournament page and upsert its seasons into ``seasons``.

    The same page is also used to back-fill an empty tournament name and to
    register every tournament offered in the page's tournament dropdown.

    Args:
        tournament_id: Value of ``tournamentId`` identifying the tournament.
        overwrite: When false, nothing is fetched if a season for this
            tournament is already stored.

    Returns:
        ``True`` if seasons already exist and ``overwrite`` is false,
        ``False`` if the tournament is unknown or the page did not answer
        with HTTP 200, and ``None`` after a completed scrape.
    """
    if seasons.find_one({'tournamentId': tournament_id}) and not overwrite:
        print('Seasons already exist')
        return True

    tournament = tournaments.find_one({'tournamentId': tournament_id})
    if tournament is None:
        # Without the stored document there is no regionId to build the URL.
        print('Tournament {0} not found'.format(tournament_id))
        return False

    page = SITE + '/Regions/{regionId}/Tournaments/{tournamentId}'.format(**tournament)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        return False

    content = html.fromstring(r.text)

    season_links = content.xpath('//select[@id="seasons"]/option/@value')
    season_names = content.xpath('//select[@id="seasons"]/option/text()')
    for season_link, season_name in zip(season_links, season_names):
        # The season id is the last path segment of the option's link.
        season_id = int(season_link.split('/')[-1])
        seasons.update_one(
            {'seasonId': season_id},
            {'$setOnInsert': {
                'name': season_name,
                'regionId': tournament['regionId'],
                'tournamentId': tournament['tournamentId'],
            }},
            upsert=True)

    # Sometimes the tournament doesn't have a name in the main menu - use the
    # title on the page.
    if tournament['name'] == '':
        headings = content.xpath('//h1[@class="tournament-header"]/text()')
        # Skip the back-fill when the page carries no such heading.
        if headings:
            tournaments.update_one(
                {'tournamentId': tournament['tournamentId']},
                {'$set': {'name': headings[0].strip()}})

    # Some tournaments don't show up in the main menu - take a fuller list
    # from the dropdown menu.
    tournament_links = content.xpath('//select[@id="tournaments"]/option/@value')
    tournament_names = content.xpath('//select[@id="tournaments"]/option/text()')
    for tournament_link, tournament_name in zip(tournament_links, tournament_names):
        tournaments.update_one(
            {'tournamentId': int(tournament_link.split('/')[-1])},
            {'$setOnInsert': {
                'name': tournament_name,
                'regionId': tournament['regionId'],
            }},
            upsert=True)

    # NOTE(review): wait() is defined elsewhere; presumably it throttles
    # requests to the site - confirm.
    wait()
def get_stages(season_id, overwrite=False):
    """Scrape a season page and upsert its stages into ``stages``.

    Stages are read from the page's stage dropdown. When the dropdown has no
    options, a single stage is derived from the "Fixtures" navigation link
    and named after the page's first ``h1``.

    Args:
        season_id: Value of ``seasonId`` identifying the season.
        overwrite: When false, nothing is fetched if a stage for this season
            is already stored.

    Returns:
        ``True`` if stages already exist and ``overwrite`` is false,
        ``False`` if the page did not answer with HTTP 200, and ``None``
        after a completed scrape.
    """
    already_stored = stages.find_one({'seasonId': season_id})
    if already_stored and not overwrite:
        print('Stages already exist')
        return True

    season = seasons.find_one({'seasonId': season_id})
    page = SITE + '/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}'.format(**season)
    response = requests.get(page, headers=HEADERS)
    print(response.url)
    if response.status_code != 200:
        return False

    document = html.fromstring(response.text)

    def upsert_stage(stage_id, stage_name):
        # Insert the stage only if it is not stored yet; existing documents
        # are left untouched.
        stages.update_one(
            {'stageId': stage_id},
            {'$setOnInsert': {
                'name': stage_name,
                'regionId': season['regionId'],
                'tournamentId': season['tournamentId'],
                'seasonId': season['seasonId'],
            }},
            upsert=True)

    stage_links = document.xpath("//select[@id='stages']/option/@value")
    stage_names = document.xpath("//select[@id='stages']/option/text()")
    for link, name in zip(stage_links, stage_names):
        # The stage id is the last path segment of the option's link.
        upsert_stage(int(link.split('/')[-1]), name)

    if not stage_links:
        # No dropdown: the stage id is the third segment from the end of the
        # "Fixtures" link.
        fixture_link = document.xpath(
            "//div[@id='sub-navigation']/ul/li/a[text()='Fixtures']/@href")[0]
        upsert_stage(
            int(fixture_link.split("/")[-3]),
            document.xpath('//h1/text()')[0].strip())

    # NOTE(review): wait() is defined elsewhere; presumably it throttles
    # requests to the site - confirm.
    wait()
def load_data(limit=0):
    """Build a DataFrame with one row per shot stored in ``events``.

    Own goals are excluded (events carrying an ``isOwnGoal`` field). Each row
    holds the scaled shot position, its distance and angle to the goal, the
    recognised shot qualifiers, and the names of the related region,
    tournament, season, stage, team, player and match.

    Args:
        limit: Passed to the cursor's ``limit()``; with PyMongo a limit of 0
            means no limit.

    Returns:
        A ``DataFrame`` built from the list of shot dictionaries. Lookups
        that find no document yield ``None`` in the corresponding column
        instead of aborting the export.
    """
    BODYPARTS = ['RightFoot', 'LeftFoot', 'Head', 'OtherBodyPart']
    PATTERNOFPLAY = ['RegularPlay', 'FastBreak', 'SetPiece', 'FromCorner',
                     'Penalty', 'DirectFreekick', 'ThrowinSetPiece']
    SHOTLOCATION = ['SmallBoxLeft', 'SmallBoxCentre', 'SmallBoxRight',
                    'DeepBoxLeft', 'BoxLeft', 'BoxCentre', 'BoxRight',
                    'DeepBoxRight', 'OutOfBoxDeepLeft', 'OutOfBoxLeft',
                    'OutOfBoxCentre', 'OutOfBoxRight', 'OutOfBoxDeepRight',
                    'ThirtyFivePlusLeft', 'ThirtyFivePlusCentre',
                    'ThirtyFivePlusRight']

    # NOTE(review): presumably the two goal posts and the goal centre on a
    # 104 x 76 pitch (x and y are scaled by 1.04 and 0.76 below) - confirm.
    left_post, goal_centre, right_post = array([104, 34]), array([104, 38]), array([104, 42])

    shots = []
    shot_events = events.find({'isShot': True, 'isOwnGoal': {'$exists': False}})
    for event in shot_events.limit(limit):
        shot = {}
        shot['id'] = int(event['id'])
        shot['Goal'] = event.get('isGoal') is True
        shot['X'] = 1.04 * event['x']
        shot['Y'] = 0.76 * event['y']

        position = array([shot['X'], shot['Y']])
        to_left = position - left_post
        to_right = position - right_post
        shot['Distance'] = norm(position - goal_centre)
        # Angle between the lines from the shot position to each post.
        shot['Angle'] = arccos(dot(to_left, to_right) / norm(to_left) / norm(to_right))

        # Map qualifier display name -> value; later duplicates overwrite.
        shot_qualifiers = {q['type']['displayName']: q.get('value')
                           for q in event['qualifiers']}
        for qualifier, value in shot_qualifiers.items():
            if qualifier in BODYPARTS:
                shot['BodyPart'] = qualifier
            elif qualifier in PATTERNOFPLAY:
                shot['PatternOfPlay'] = qualifier
            elif qualifier in SHOTLOCATION:
                shot['ShotLocation'] = qualifier
            elif qualifier == 'Zone':
                shot['Zone'] = value
            elif qualifier == 'RelatedEventId':
                # The lookup key is the event's own relatedEventId field, not
                # the qualifier value.
                related_event = events.find_one({
                    'eventId': event['relatedEventId'],
                    'matchId': event['matchId'],
                    'teamId': event['teamId']})
                shot['RelatedEventType'] = (
                    related_event['type']['displayName'] if related_event else None)

        region = regions.find_one({'regionId': event['regionId']})
        shot['Region'] = region['name'] if region else None
        tournament = tournaments.find_one({'tournamentId': event['tournamentId']})
        shot['Tournament'] = tournament['name'] if tournament else None
        season = seasons.find_one({'seasonId': event['seasonId']})
        shot['Season'] = season['name'] if season else None
        stage = stages.find_one({'stageId': event.get('stageId')})
        shot['Stage'] = stage['name'] if stage else None
        team = teams.find_one({'teamId': event['teamId']})
        shot['Team'] = team['name'] if team else None
        player = players.find_one({'playerId': event['playerId']})
        shot['Player'] = player['name'] if player else None

        match = matches.find_one({'matchId': event['matchId']})
        # A missing team or match must not abort the export: fall back to
        # None, like the other lookups above.
        if team and match:
            is_home = team['name'] == match['home']['name']
            shot['Side'] = 'home' if is_home else 'away'
            shot['Opponent'] = match['away' if is_home else 'home']['name']
        else:
            shot['Side'] = None
            shot['Opponent'] = None
        shot['Date'] = match['startDate'] if match else None

        shot['Period'] = event['period']['displayName']
        shot['Minute'] = event['minute']
        shots.append(shot)

        # Progress report every ten shots.
        if len(shots) % 10 == 0:
            print('{0} shots in data set'.format(len(shots)))

    print('{0} shots in data set'.format(len(shots)))
    return DataFrame(shots)
def get_fixtures(tournament_id, season_id):
    """Scrape the full fixture list of a season and upsert teams and matches.

    For every fixture row both teams are inserted into ``teams`` if unknown,
    and the match is upserted into ``matches`` with its date, kick-off time,
    team ids and score text.

    Args:
        tournament_id: Value of the ``tournament`` field of the tournament.
        season_id: Value of the ``season`` field of the season.

    Returns:
        ``False`` if the season or tournament is not stored or the page did
        not answer with HTTP 200; ``None`` after a completed scrape.
    """
    season = seasons.find_one({'tournament': tournament_id, 'season': season_id})
    tournament = tournaments.find_one({'tournament': tournament_id})
    if season is None or tournament is None:
        # Nothing to build the URL from; no request is made.
        print('Season {0} of tournament {1} not found'.format(season_id, tournament_id))
        return False

    # Cup competitions live under a different path segment.
    competition = 'pokalwettbewerb' if tournament.get('cup') == 1 else 'wettbewerb'
    url = '{0}/spielplan/gesamtspielplan/{1}/{tournament}/saison_id/{season}'.format(
        SITE, competition, **season)

    r = requests.get(url, headers=HEADERS)
    print(r.url, tournament['name'], season['name'])
    if r.status_code != 200:
        wait()
        return False

    content = html.fromstring(r.text)

    # Rows without a kick-off time reuse the time of the previous row.
    timestamp = time.min

    for row in content.xpath('//div[@class="box"]/table/tbody/tr[not(td/@colspan)]'):
        # Home team is in column 3, away team in column 7.
        team_ids = {}
        for side, column in (('home', 3), ('away', 7)):
            team_id = int(row.xpath('td[{0}]/a/@id'.format(column))[0])
            team_name = row.xpath('td[{0}]/a/text()'.format(column))[0]
            teams.update_one(
                {'team': team_id},
                {'$setOnInsert': {
                    'name': team_name,
                    'region': tournament['region'],
                    'national': False,
                }},
                upsert=True)
            team_ids[side] = team_id

        kickoff_text = row.xpath('td[2]/text()')[0].strip()
        if kickoff_text:
            timestamp = datetime.strptime(kickoff_text, '%I:%M %p').time()

        date_links = row.xpath('td[1]/a/@href')
        if date_links:
            datestring = date_links[0].split('/')[-1]
            if datestring == '0000-00-00':
                # Placeholder for an unknown date. Must be a datetime (not a
                # date) so that .date() below works and the value can be
                # stored.
                datestamp = datetime.min
            else:
                datestamp = datetime.strptime(datestring, '%Y-%m-%d')
        else:
            # No link: the date is the last space-separated token of the cell.
            datestamp = datetime.strptime(
                row.xpath('td[1]/text()')[0].strip().split(' ')[-1], '%m/%d/%y')

        match_id = int(row.xpath('td[5]/a/@href')[0].split('/')[-1])
        score_text = row.xpath('td[5]/a/text()')[0]
        matches.update_one(
            {'match': match_id},
            {
                '$setOnInsert': {
                    'season': season['season'],
                    'tournament': tournament['tournament'],
                    'region': tournament['region'],
                },
                '$set': {
                    'date': datestamp,
                    'time': datetime.combine(datestamp.date(), timestamp),
                    'home': {'team': team_ids['home']},
                    'away': {'team': team_ids['away']},
                    'score': score_text,
                },
            },
            upsert=True)

    # NOTE(review): wait() is defined elsewhere; presumably it throttles
    # requests to the site - confirm.
    wait()