Beispiel #1
0
def _upsert_stage(stage_id, name, season):
    """Insert a stage document for *season* unless *stage_id* is already stored.

    The fields are written with ``$setOnInsert``, so a document that already
    exists for ``stageId`` is left untouched.
    """
    stages.update_one({'stageId': stage_id},
                      {'$setOnInsert': {
                          'name': name,
                          'regionId': season['regionId'],
                          'tournamentId': season['tournamentId'],
                          'seasonId': season['seasonId']}},
                      upsert=True)


def get_stages(season_id, overwrite=False):
    """Scrape the season page and store every stage listed on it.

    Stage ids and names are read from the ``<select id="stages">`` options.
    When the page has no such options, a single stage is derived from the
    "Fixtures" link in the sub-navigation and named after the page heading.

    Args:
        season_id: ``seasonId`` of a document in the ``seasons`` collection.
        overwrite: When false, do nothing if any stage is already stored for
            this season.

    Returns:
        ``True`` if stages were already stored and ``overwrite`` is false,
        ``False`` if the season is unknown, the request did not return
        HTTP 200 or the page could not be parsed, and ``None`` after the
        page has been processed.
    """
    if stages.find_one({'seasonId': season_id}) and not overwrite:
        print('Stages already exist')
        return True

    season = seasons.find_one({'seasonId': season_id})
    if season is None:
        # Without the season document the page URL cannot be built.
        print('Unknown season {0}'.format(season_id))
        return False

    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}'.format(**season)
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        # Pause after a failed request as well, as get_fixtures does.
        wait()
        return False

    content = html.fromstring(r.text)
    stage_links = content.xpath("//select[@id='stages']/option/@value")
    stage_names = content.xpath("//select[@id='stages']/option/text()")

    for stage_link, stage_name in zip(stage_links, stage_names):
        # The stage id is the last path segment of the option value.
        _upsert_stage(int(stage_link.split('/')[-1]), stage_name, season)

    if not stage_links:
        # No stage selector on the page: fall back to the Fixtures link.
        fixture_links = content.xpath("//div[@id='sub-navigation']/ul/li/a[text()='Fixtures']/@href")
        headings = content.xpath('//h1/text()')
        if not fixture_links or not headings:
            print('No stages found')
            wait()
            return False

        # The stage id is the third path segment from the end of the link.
        _upsert_stage(int(fixture_links[0].split("/")[-3]), headings[0].strip(), season)

    wait()
def load_data(limit=0):
    """Build a DataFrame with one row per shot event.

    Reads every event with ``isShot`` set and no ``isOwnGoal`` field from the
    ``events`` collection. Each shot gets scaled coordinates, distance and
    angle to the fixed points defined below, selected qualifiers, and the
    names of its region, tournament, season, stage, team and player. Side,
    opponent and date come from the ``matches`` collection.

    Args:
        limit: Passed to the cursor's ``limit()``; the default ``0`` is
            passed through unchanged.

    Returns:
        A ``DataFrame`` built from the list of shot dictionaries. Names of
        referenced documents that cannot be found are ``None``. ``Side`` and
        ``Opponent`` are ``None`` when the team or the match is missing, and
        ``Date`` is ``None`` when the match is missing.
    """
    BODYPARTS = ['RightFoot', 'LeftFoot', 'Head', 'OtherBodyPart']
    PATTERNOFPLAY = ['RegularPlay', 'FastBreak', 'SetPiece', 'FromCorner', 'Penalty', 'DirectFreekick', 'ThrowinSetPiece']
    SHOTLOCATION = ['SmallBoxLeft', 'SmallBoxCentre', 'SmallBoxRight',
                    'DeepBoxLeft', 'BoxLeft', 'BoxCentre', 'BoxRight', 'DeepBoxRight',
                    'OutOfBoxDeepLeft', 'OutOfBoxLeft', 'OutOfBoxCentre', 'OutOfBoxRight', 'OutOfBoxDeepRight',
                    'ThirtyFivePlusLeft', 'ThirtyFivePlusCentre', 'ThirtyFivePlusRight']

    # NOTE(review): presumably the two goal posts and the goal centre on a
    # 104 x 76 pitch -- confirm against the coordinate scaling below.
    l, c, r = array([104, 34]), array([104, 38]), array([104, 42])

    # The same few regions/teams/players recur across many shots, so remember
    # each looked-up name instead of querying once per shot.
    name_cache = {}

    def lookup_name(collection, key, value):
        """Return ``name`` of the document where *key* equals *value*, or None."""
        cache_key = (key, value)
        if cache_key not in name_cache:
            doc = collection.find_one({key: value})
            name_cache[cache_key] = doc['name'] if doc else None
        return name_cache[cache_key]

    shots = []
    for event in events.find({'isShot': True, 'isOwnGoal': {'$exists': False}}).limit(limit):
        shot = dict()
        shot['id'] = int(event['id'])
        shot['Goal'] = event.get('isGoal') is True
        shot['X'] = 1.04 * event['x']
        shot['Y'] = 0.76 * event['y']

        p = array([shot['X'], shot['Y']])
        shot['Distance'] = norm(p - c)
        # Angle at the shot position between the directions to l and r.
        shot['Angle'] = arccos(dot(p - l, p - r) / norm(p - l) / norm(p - r))

        shot_qualifiers = {q['type']['displayName']: q.get('value') for q in event['qualifiers']}
        for qualifier in shot_qualifiers:
            if qualifier in BODYPARTS:
                shot['BodyPart'] = qualifier
            elif qualifier in PATTERNOFPLAY:
                shot['PatternOfPlay'] = qualifier
            elif qualifier in SHOTLOCATION:
                shot['ShotLocation'] = qualifier
            elif qualifier == 'Zone':
                shot['Zone'] = shot_qualifiers[qualifier]
            elif qualifier == 'RelatedEventId':
                related_event = events.find_one({'eventId': event['relatedEventId'],
                                                 'matchId': event['matchId'],
                                                 'teamId': event['teamId']})
                shot['RelatedEventType'] = related_event['type']['displayName'] if related_event else None

        shot['Region'] = lookup_name(regions, 'regionId', event['regionId'])
        shot['Tournament'] = lookup_name(tournaments, 'tournamentId', event['tournamentId'])
        shot['Season'] = lookup_name(seasons, 'seasonId', event['seasonId'])
        shot['Stage'] = lookup_name(stages, 'stageId', event.get('stageId'))
        team_name = lookup_name(teams, 'teamId', event['teamId'])
        shot['Team'] = team_name
        shot['Player'] = lookup_name(players, 'playerId', event['playerId'])

        match = matches.find_one({'matchId': event['matchId']})
        if match is not None and team_name is not None:
            is_home = team_name == match['home']['name']
            shot['Side'] = 'home' if is_home else 'away'
            shot['Opponent'] = match['away']['name'] if is_home else match['home']['name']
        else:
            # Side cannot be determined without both the team and the match.
            shot['Side'] = None
            shot['Opponent'] = None
        shot['Date'] = match['startDate'] if match is not None else None

        shot['Period'] = event['period']['displayName']
        shot['Minute'] = event['minute']

        shots.append(shot)

        if len(shots) % 10 == 0:
            print('{0} shots in data set'.format(len(shots)))

    print('{0} shots in data set'.format(len(shots)))

    return DataFrame(shots)
Beispiel #3
0
def _parse_fixture_rows(raw):
    """Decode a JavaScript-style array literal of fixtures into Python lists.

    Empty slots between consecutive commas become ``null`` and single quotes
    are replaced by double quotes so the text can be parsed as JSON.
    """
    filled = re.sub(r',(?=,)', r',null', raw)
    return json.loads(filled.replace("'", '"'))


def _store_fixture_rows(rows, stage, overwrite):
    """Upsert one ``matchheaders`` document per fixture row.

    Rows whose ``matchId`` is already stored are skipped unless *overwrite*
    is true. Every key of *stage* containing ``'Id'`` is copied onto the
    match document.
    """
    for row in rows:
        match = {'matchId': row[0], 'statusCode': row[1], 'startDate': row[2], 'startTime': row[3],
                 'home': {'teamId': row[4], 'name': row[5], 'field': 'home'},
                 'away': {'teamId': row[7], 'name': row[8], 'field': 'away'},
                 'score': row[10], 'elapsed': row[14], 'result': row[15], 'international': row[16],
                 'hasKeyEvents': row[12], 'hasPreview': row[13], 'isOpta': row[17], 'isOtherOpta': row[19],
                 }

        if matchheaders.find_one({'matchId': match['matchId']}) and not overwrite:
            print('Match already exists')
            continue

        start_date = datetime.strptime(match['startDate'], '%A, %b %d %Y')
        kick_off = datetime.strptime(match['startTime'], '%H:%M')
        match['startDate'] = start_date
        # startTime holds the full kick-off timestamp: match date plus time of day.
        match['startTime'] = datetime.combine(start_date.date(), kick_off.time())
        match.update({k: v for k, v in stage.items() if 'Id' in k})

        matchheaders.replace_one({'matchId': match['matchId']}, match, upsert=True)


def get_fixtures(stage_id, overwrite=False):
    """Scrape the fixtures of a stage and store them in ``matchheaders``.

    If the Fixtures page embeds a month calendar, the fixtures feed is
    requested once per listed year/month. Otherwise the fixture rows are read
    from the page itself.

    Args:
        stage_id: ``stageId`` of a document in the ``stages`` collection.
        overwrite: When false, do nothing if any match is already stored for
            this stage, and skip individual matches that already exist.

    Returns:
        ``True`` if matches were already stored and ``overwrite`` is false,
        ``False`` if the stage is unknown, a request did not return HTTP 200
        or the calendar is empty, and ``None`` after the fixtures have been
        processed.
    """
    if matchheaders.find_one({'stageId': stage_id}) and not overwrite:
        print('Matches already exist')
        return True

    stage = stages.find_one({'stageId': stage_id})
    if stage is None:
        # Without the stage document the page URL cannot be built.
        print('Unknown stage {0}'.format(stage_id))
        return False

    page = SITE+'/Regions/{regionId}/Tournaments/{tournamentId}/Seasons/{seasonId}/Stages/{stageId}/Fixtures'.format(**stage)
    r = requests.get(page, headers=HEADERS)
    print(r.url)

    if r.status_code != 200:
        wait()
        return False

    headers = HEADERS.copy()
    # The token is only sent with the feed requests, so a page that lacks it
    # must not abort the branch that reads fixtures from the page itself.
    token = re.search(r"'Model-Last-Mode': '([^']+)'", r.text)
    if token:
        headers['Model-Last-Mode'] = token.group(1)
    headers['Referer'] = r.url
    headers['X-Requested-With'] = 'XMLHttpRequest'

    dates = re.findall(r"'Month', ([^ ]+), min, max", r.text)
    if dates:
        # Quote the bare numeric keys so the calendar literal parses as JSON.
        calendar = json.loads(re.sub(r'(\d+)(?=:)', r'"\1"', dates[0]))

        if not calendar:
            print('No matches')
            wait()
            return False

        # Calendar months '0'..'11' map to the two-digit '01'..'12' used in the feed's `d` parameter.
        month_codes = {format(i): format(i + 1, '02') for i in range(0, 12)}
        params = {'isAggregate': 'false'}
        feed = SITE+'/tournamentsfeed/{0}/Fixtures/'.format(stage_id)

        for year in calendar:
            for month in calendar[year]:
                params['d'] = '{0}{1}'.format(year, month_codes[month])
                wait()

                resp = requests.get(feed, params=params, headers=headers, allow_redirects=False)
                print(resp.url, resp.status_code)

                if resp.status_code != 200:
                    wait()
                    return False

                _store_fixture_rows(_parse_fixture_rows(resp.text), stage, overwrite)
    else:
        payload = re.findall(r"calendarParameter\), ([^;]*)\);", r.text)
        # No embedded fixture data means there is nothing to store.
        rows = _parse_fixture_rows(payload[0]) if payload else []
        _store_fixture_rows(rows, stage, overwrite)
    wait()