def get_stages(season_id, overwrite=False):
    """Scrape the stages of one season and upsert them into ``stages``.

    The season page is fetched and every ``<option>`` of the ``stages``
    select box becomes one stage document.  When the page has no such
    select box, a single stage is derived from the "Fixtures" link of the
    sub-navigation and named after the page's ``<h1>`` text.

    Stage documents are written with ``$setOnInsert``, so fields of stages
    that already exist are left untouched even when ``overwrite`` is true;
    ``overwrite`` only bypasses the "already scraped" shortcut.

    Args:
        season_id: value of ``seasonId`` identifying the season document.
        overwrite: when false, return early if any stage of this season is
            already stored.

    Returns:
        ``True`` if stages were already stored and ``overwrite`` is false;
        ``False`` if the season document is missing, the page request did
        not return HTTP 200, or no stage could be located on the page;
        ``None`` after stage documents have been written.
    """
    if stages.find_one({'seasonId': season_id}) and not overwrite:
        print('Stages already exist')
        return True

    season = seasons.find_one({'seasonId': season_id})
    if season is None:
        # The URL below is built from the season document; without it the
        # original code crashed with a TypeError in str.format(**None).
        print('Season {0} not found'.format(season_id))
        return False

    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}'.format(**season)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        # get_fixtures calls wait() after a failed request as well; do the
        # same here so every request is followed by wait().
        wait()
        return False

    def stage_document(name):
        """Return the fields stored for a newly inserted stage."""
        return {
            'name': name,
            'regionId': season['regionId'],
            'tournamentId': season['tournamentId'],
            'seasonId': season['seasonId'],
        }

    content = html.fromstring(r.text)
    stage_links = content.xpath("//select[@id='stages']/option/@value")
    stage_names = content.xpath("//select[@id='stages']/option/text()")

    for stage_link, stage_name in zip(stage_links, stage_names):
        # The stage id is the last path segment of the option's value.
        stages.update_one({'stageId': int(stage_link.split('/')[-1])},
                          {'$setOnInsert': stage_document(stage_name)},
                          upsert=True)

    if not stage_links:
        # No stage selector on the page: fall back to the Fixtures link,
        # whose third-from-last path segment carries the stage id.
        fixture_links = content.xpath("//div[@id='sub-navigation']/ul/li/a[text()='Fixtures']/@href")
        titles = content.xpath('//h1/text()')
        if not fixture_links or not titles:
            print('No stages found')
            wait()
            return False
        stages.update_one({'stageId': int(fixture_links[0].split("/")[-3])},
                          {'$setOnInsert': stage_document(titles[0].strip())},
                          upsert=True)

    wait()
def load_data(limit=0):
    """Build a shot-level ``DataFrame`` from the ``events`` collection.

    Every event with ``isShot`` set and no ``isOwnGoal`` field becomes one
    row holding the shot position, distance and angle to goal, selected
    qualifiers, and the names of the region, tournament, season, stage,
    team, player and opponent looked up from their collections.

    Args:
        limit: passed to the cursor's ``limit()``; PyMongo treats ``0`` as
            "no limit".

    Returns:
        A ``DataFrame`` with one row per shot.  Names whose document cannot
        be found are ``None``; ``Side`` and ``Opponent`` are ``None`` when
        either the team or the match document is missing, and ``Date`` is
        ``None`` when the match document is missing.
    """
    BODYPARTS = ['RightFoot', 'LeftFoot', 'Head', 'OtherBodyPart']
    PATTERNOFPLAY = ['RegularPlay', 'FastBreak', 'SetPiece', 'FromCorner',
                     'Penalty', 'DirectFreekick', 'ThrowinSetPiece']
    SHOTLOCATION = ['SmallBoxLeft', 'SmallBoxCentre', 'SmallBoxRight',
                    'DeepBoxLeft', 'BoxLeft', 'BoxCentre', 'BoxRight', 'DeepBoxRight',
                    'OutOfBoxDeepLeft', 'OutOfBoxLeft', 'OutOfBoxCentre',
                    'OutOfBoxRight', 'OutOfBoxDeepRight',
                    'ThirtyFivePlusLeft', 'ThirtyFivePlusCentre', 'ThirtyFivePlusRight']

    # Reference points on the goal line x = 104 used for distance and angle.
    # NOTE(review): presumably the two posts and the goal centre on a
    # 104 x 76 pitch, with event x/y given as percentages - confirm.
    post_a, goal_centre, post_b = array([104, 34]), array([104, 38]), array([104, 42])

    # The same region/team/match/... ids recur on many shots; remember each
    # lookup so every document is fetched at most once per call.
    lookup_cache = {}

    def find_cached(collection, key, value):
        """Return ``collection.find_one({key: value})``, memoised."""
        cache_key = (key, value)
        if cache_key not in lookup_cache:
            lookup_cache[cache_key] = collection.find_one({key: value})
        return lookup_cache[cache_key]

    shots = []
    for event in events.find({'isShot': True, 'isOwnGoal': {'$exists': False}}).limit(limit):
        shot = dict()
        shot['id'] = int(event['id'])
        shot['Goal'] = event.get('isGoal') is True
        shot['X'] = 1.04 * event['x']
        shot['Y'] = 0.76 * event['y']

        p = array([shot['X'], shot['Y']])
        shot['Distance'] = norm(p - goal_centre)
        # Angle at the shot position between the lines to the two posts.
        shot['Angle'] = arccos(dot(p - post_a, p - post_b) / norm(p - post_a) / norm(p - post_b))

        shot_qualifiers = {q['type']['displayName']: q.get('value') for q in event['qualifiers']}
        for qualifier in shot_qualifiers:
            if qualifier in BODYPARTS:
                shot['BodyPart'] = qualifier
            elif qualifier in PATTERNOFPLAY:
                shot['PatternOfPlay'] = qualifier
            elif qualifier in SHOTLOCATION:
                shot['ShotLocation'] = qualifier
            elif qualifier == 'Zone':
                shot['Zone'] = shot_qualifiers[qualifier]
            elif qualifier == 'RelatedEventId':
                related_event = events.find_one({'eventId': event['relatedEventId'],
                                                 'matchId': event['matchId'],
                                                 'teamId': event['teamId']})
                shot['RelatedEventType'] = related_event['type']['displayName'] if related_event else None

        region = find_cached(regions, 'regionId', event['regionId'])
        shot['Region'] = region['name'] if region else None
        tournament = find_cached(tournaments, 'tournamentId', event['tournamentId'])
        shot['Tournament'] = tournament['name'] if tournament else None
        season = find_cached(seasons, 'seasonId', event['seasonId'])
        shot['Season'] = season['name'] if season else None
        stage = find_cached(stages, 'stageId', event.get('stageId'))
        shot['Stage'] = stage['name'] if stage else None
        team = find_cached(teams, 'teamId', event['teamId'])
        shot['Team'] = team['name'] if team else None
        player = find_cached(players, 'playerId', event['playerId'])
        shot['Player'] = player['name'] if player else None

        match = find_cached(matches, 'matchId', event['matchId'])
        if team and match:
            is_home = team['name'] == match['home']['name']
            shot['Side'] = 'home' if is_home else 'away'
            shot['Opponent'] = match['away']['name'] if is_home else match['home']['name']
        else:
            # A missing team or match document used to raise a TypeError
            # and abort the whole export; record the gap instead.
            shot['Side'] = None
            shot['Opponent'] = None
        shot['Date'] = match['startDate'] if match else None

        shot['Period'] = event['period']['displayName']
        shot['Minute'] = event['minute']
        shots.append(shot)

        # Progress report every ten shots.
        if len(shots) % 10 == 0:
            print('{0} shots in data set'.format(len(shots)))

    print('{0} shots in data set'.format(len(shots)))
    return DataFrame(shots)
def _parse_fixture_feed(text):
    """Decode fixture rows from the site's JavaScript-style array literal.

    Empty slots (``,,``) are filled with ``null`` and single quotes are
    swapped for double quotes so that ``json.loads`` accepts the text.
    """
    filled = re.sub(r',(?=,)', r',null', text)
    return json.loads(filled.replace("'", '"'))


def _store_fixture_rows(rows, stage, overwrite):
    """Turn raw fixture rows into match headers and upsert them.

    Rows whose ``matchId`` is already stored are skipped unless
    ``overwrite`` is true.  Every key of ``stage`` containing ``'Id'`` is
    copied onto the match document.
    """
    for row in rows:
        match = {'matchId': row[0],
                 'statusCode': row[1],
                 'startDate': row[2],
                 'startTime': row[3],
                 'home': {'teamId': row[4], 'name': row[5], 'field': 'home'},
                 'away': {'teamId': row[7], 'name': row[8], 'field': 'away'},
                 'score': row[10],
                 'elapsed': row[14],
                 'result': row[15],
                 'international': row[16],
                 'hasKeyEvents': row[12],
                 'hasPreview': row[13],
                 'isOpta': row[17],
                 'isOtherOpta': row[19],
                 }

        if matchheaders.find_one({'matchId': match['matchId']}) and not overwrite:
            print('Match already exists')
            continue

        start_date = datetime.strptime(match['startDate'], '%A, %b %d %Y')
        kick_off = datetime.strptime(match['startTime'], '%H:%M')
        match['startDate'] = start_date
        # startTime becomes the full date-and-time of the kick-off.
        match['startTime'] = datetime.combine(start_date.date(), kick_off.time())
        for k, v in stage.items():
            if 'Id' in k:
                match[k] = v
        matchheaders.replace_one({'matchId': match['matchId']}, match, upsert=True)


def get_fixtures(stage_id, overwrite=False):
    """Scrape the fixtures of one stage and upsert them into ``matchheaders``.

    The stage's Fixtures page is fetched first.  If it embeds a month
    calendar, the fixtures feed is requested once per listed year/month;
    otherwise the fixture rows embedded in the page itself are used.

    Args:
        stage_id: value of ``stageId`` identifying the stage document.
        overwrite: when false, return early if any match of this stage is
            already stored, and skip individual matches already stored.

    Returns:
        ``True`` if matches were already stored and ``overwrite`` is false;
        ``False`` if the stage document is missing, a request did not
        return HTTP 200, or the calendar lists no months; ``None`` after
        the fixtures have been processed.
    """
    if matchheaders.find_one({'stageId': stage_id}) and not overwrite:
        print('Matches already exist')
        return True

    stage = stages.find_one({'stageId': stage_id})
    if stage is None:
        # The URL below is built from the stage document; without it the
        # original code crashed with a TypeError in str.format(**None).
        print('Stage {0} not found'.format(stage_id))
        return False

    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}/Stages/{stageId}/Fixtures'.format(**stage)
    r = requests.get(page, headers=HEADERS)
    print(r.url)
    if r.status_code != 200:
        wait()
        return False

    # Headers for the feed requests: the token embedded in the page (when
    # there is one) plus the referer and XHR marker.
    headers = HEADERS.copy()
    tokens = re.findall("'Model-Last-Mode': '([^']+)'", r.text)
    if tokens:
        headers['Model-Last-Mode'] = tokens[0]
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'

    dates = re.findall("'Month', ([^ ]+), min, max", r.text)
    if dates:
        # Quote the bare numeric keys so the calendar literal parses as JSON.
        calendar = json.loads(re.sub(r'(\d+)(?=:)', r'"\1"', dates[0]))
        if not calendar:
            print('No matches')
            wait()
            return False

        # Calendar months are keyed '0'..'11'; the feed wants '01'..'12'.
        months = {format(i): format(i+1, '02') for i in range(0, 12)}
        params = {'isAggregate': 'false'}
        feed = SITE+'/tournamentsfeed/{0}/Fixtures/'.format(stage_id)
        for y in calendar:
            for m in calendar[y]:
                params['d'] = '{0}{1}'.format(y, months[m])
                wait()
                r = requests.get(feed, params=params, headers=headers, allow_redirects=False)
                print(r.url, r.status_code)
                if r.status_code != 200:
                    wait()
                    return False
                _store_fixture_rows(_parse_fixture_feed(r.text), stage, overwrite)
    else:
        # No calendar: the rows are embedded in the page.  Check that the
        # pattern matched *before* indexing, so a page without fixtures
        # yields no rows instead of an IndexError.
        embedded = re.findall(r"calendarParameter\), ([^;]*)\);", r.text)
        rows = _parse_fixture_feed(embedded[0]) if embedded and embedded[0] else []
        _store_fixture_rows(rows, stage, overwrite)

    wait()