Example #1
0
def team_setup():
    """
    Creates the team log-related folders (team pbp and team toi) for each season.

    Seasons run from 2005 through the current season, inclusive. All team pbp folders are handled first,
    then all team toi folders.

    :return: nothing
    """
    folder_getters = (organization.get_season_team_pbp_folder,
                      organization.get_season_team_toi_folder)
    for get_folder in folder_getters:
        last_season = schedules.get_current_season()
        for yr in range(2005, last_season + 1):
            organization.check_create_folder(get_folder(yr))
Example #2
0
def find_playoff_game(searchstr):
    """
    Finds playoff game id based on string specified.

    The string is split on spaces. Exactly two tokens consisting of three ASCII letters are required (the
    teams). A standalone 4-digit number is read as the season (defaults to the current season) and a
    standalone single digit is read as the game number; the game returned is the one among the teams' 7 most
    recent games that season whose id ends in that digit.

    :param searchstr: str, e.g. WSH PIT 2016 Game 5

    :return: (season, game), or None if the string does not contain exactly two 3-letter tokens
    :raises ValueError: if the string contains no standalone single-digit game number
    :raises IndexError: if none of the games found has an id ending in the requested game number
    """
    # Use [A-Za-z], not [A-z]: the latter also matches the punctuation between 'Z' and 'a' ([ \ ] ^ _ `)
    teams = [part.upper() for part in searchstr.split(' ')
             if re.match(r'^[A-Za-z]{3}$', part.strip())]
    if len(teams) != 2:
        return None

    team1, team2 = teams

    # Pad with a trailing space so a number at the very end of the string is still followed by whitespace
    searchstr += ' '

    # Get season with a 4-digit regex (int() tolerates the surrounding whitespace in the match)
    season_match = re.search(r'\s\d{4}\s', searchstr)
    if season_match is not None:
        season = int(season_match.group(0))
    else:
        season = schedules.get_current_season()

    # Get game number with a 1-digit regex
    gamenum_match = re.search(r'\s\d\s', searchstr)
    if gamenum_match is None:
        raise ValueError('No single-digit game number found in search string')
    gamenum = int(gamenum_match.group(0))

    games = find_recent_games(team1, team2, limit=7, season=season)
    # The last digit of the game id is compared against the requested game number
    game = games[games.Game % 10 == gamenum].Game.iloc[0]

    return season, game
Example #3
0
def scrape_season_toi(season, force_overwrite=False):
    """
    Scrapes and parses toi from the given season. Only games whose schedule status is "Final" are covered.

    A failure in one game is reported and that game is skipped; remaining games are still processed.
    Progress is printed at the checkpoints returned by helpers.intervals.

    :param season: int, the season. If None, the current season is used
    :param force_overwrite: bool. If true, rescrapes all games. If false, only previously unscraped ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            scrape_game_toi(season, game, force_overwrite)
            # NOTE(review): the next two calls are the *pbp* variants inside a toi routine; they look like
            # they were meant to be the toi equivalents -- confirm before changing
            manipulate_schedules.update_schedule_with_pbp_scrape(season, game)
            parse_toi.parse_game_pbp(season, game, True)
            # Parsed toi shorter than 3600 entries: fall back to scraping and parsing the html version
            if len(parse_toi.get_parsed_toi(season, game)) < 3600:
                scrape_game_toi_from_html(season, game, True)
                parse_toi.parse_game_toi_from_html(season, game, True)
        except Exception as e:
            # Best effort: one bad game should not stop the whole season, but the failure must be visible
            print('Error scraping toi for {0} {1}: {2}'.format(season, game, e))
        if interval_j < len(intervals):
            if i == intervals[interval_j][0]:
                print('Done scraping through {0:d} {1:d} ({2:d}%)'.format(
                    season, game,
                    round(intervals[interval_j][0] / len(games) * 100)))
                interval_j += 1
Example #4
0
def autoupdate(season=None):
    """
    Run this method to update local data. It reads the schedule file for given season and scrapes and parses
    previously unscraped games that have gone final or are in progress. Use this for 2010 or later.

    Steps, in order:
        1. Delete html charts for games the stored schedule lists as "In Progress"
        2. Remember games already "Final" with a result, regenerate the schedule file, and re-mark those
           games as pbp- and toi-scraped
        3. Read games now "In Progress", then games that are "Final" with Result "N/A"
        4. Update team logs (errors in this step are silently ignored)

    :param season: int, the season. If None (default), will do current season

    :return: nothing
    """
    # TODO: why does sometimes the schedule have the wrong game-team pairs, but when I regenerate, it's all ok?
    # TODO: this does not work quite right. Doesn't seem to know it needs to re-scrape TOI for previously scraped
    # TODO: in-progress games after they go final

    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)

    # First, for all games that were in progress during last scrape, delete html charts
    inprogress = sch.query('Status == "In Progress"')
    inprogressgames = inprogress.Game.values
    inprogressgames.sort()
    for game in inprogressgames:
        delete_game_html(season, game)

    # Now keep tabs on old final games (captured before the schedule file is regenerated below)
    old_final_games = set(
        sch.query('Status == "Final" & Result != "N/A"').Game.values)

    # Update schedule to get current status
    schedules.generate_season_schedule_file(season)

    # For games done previously, set pbp and toi status to scraped
    # NOTE(review): a set of game ids is passed here, whereas other callers pass a single game -- confirm
    # that the manipulate_schedules helpers accept both
    manipulate_schedules.update_schedule_with_pbp_scrape(
        season, old_final_games)
    manipulate_schedules.update_schedule_with_toi_scrape(
        season, old_final_games)
    # Re-read the schedule so the queries below see the regenerated statuses
    sch = schedules.get_season_schedule(season)

    # Now, for games currently in progress, scrape.
    # But no need to force-overwrite. We handled games previously in progress above.
    # Games newly in progress will be written to file here.

    inprogressgames = sch.query('Status == "In Progress"')
    inprogressgames = inprogressgames.Game.values
    inprogressgames.sort()
    print("Updating in-progress games")
    read_inprogress_games(inprogressgames, season)

    # Now, for any games that are final, scrape and parse if not previously done
    # (Result == "N/A" is what distinguishes them from old_final_games above)
    games = sch.query('Status == "Final" & Result == "N/A"')
    games = games.Game.values
    games.sort()
    print('Updating final games')
    read_final_games(games, season)

    # NOTE(review): any failure while updating team logs is swallowed without a trace; the logging call
    # that used to report it is commented out
    try:
        teams.update_team_logs(season, force_overwrite=False)
    except Exception as e:
        pass  # ed.print_and_log("Error with team logs in {0:d}: {1:s}".format(season, str(e)), 'warn')
Example #5
0
def parse_season_pbp(season, force_overwrite=False):
    """
    Parses pbp from the given season. Only games whose schedule status is "Final" are covered.

    A failure in one game is reported and that game is skipped; remaining games are still processed.
    Progress is printed at the checkpoints returned by helpers.intervals.

    :param season: int, the season. If None, the current season is used
    :param force_overwrite: bool. If true, parses all games. If false, only previously unparsed ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            parse_game_pbp(season, game, force_overwrite)
        except Exception as e:
            # Best effort: one bad game should not stop the whole season, but the failure must be visible
            print('Error parsing pbp for {0} {1}: {2}'.format(season, game, e))
        if interval_j < len(intervals):
            if i == intervals[interval_j][0]:
                print('Done parsing through {0:d} {1:d} ({2:d}%)'.format(
                    season, game,
                    round(intervals[interval_j][0] / len(games) * 100)))
                interval_j += 1
Example #6
0
def scrape_pbp_setup():
    """
    Creates the raw pbp folder for each season from 2005 through the current season, if need be

    :return: nothing
    """
    last_season = schedules.get_current_season()
    for yr in range(2005, last_season + 1):
        raw_pbp_folder = organization.get_season_raw_pbp_folder(yr)
        organization.check_create_folder(raw_pbp_folder)
Example #7
0
def parse_toi_setup():
    """
    Creates the parsed toi folder for each season from 2005 through the current season, if need be

    :return: nothing
    """
    first_season = 2005
    end_season = schedules.get_current_season() + 1
    for yr in range(first_season, end_season):
        parsed_toi_folder = organization.get_season_parsed_toi_folder(yr)
        organization.check_create_folder(parsed_toi_folder)
def get_enddate_from_kwargs(**kwargs):
    """
    Returns enddate if given; otherwise 6/21 of (end season + 1), capped at today's date.

    The end season is taken from the first of these keys present: endseason, startseason, season,
    startdate (season inferred from the date). If none is present, the current season is used.

    :param kwargs: may contain enddate, endseason, startseason, season, startdate

    :return: str, date in YYYY-MM-DD format (or whatever was passed as enddate)
    """
    if 'enddate' in kwargs:
        return kwargs['enddate']

    if 'endseason' in kwargs:
        final_season = kwargs['endseason']
    elif 'startseason' in kwargs:
        final_season = kwargs['startseason']
    elif 'season' in kwargs:
        final_season = kwargs['season']
    elif 'startdate' in kwargs:
        final_season = helper.infer_season_from_date(kwargs['startdate'])
    else:
        final_season = schedules.get_current_season()

    current_date = datetime.datetime.now().strftime('%Y-%m-%d')
    season_close = '{0:d}-06-21'.format(final_season + 1)
    # Both are YYYY-MM-DD strings, so the lexicographic min is the earlier date
    return min(season_close, current_date)
Example #9
0
def parse_season_toi(season, force_overwrite=False):
    """
    Parses toi for every game of the given season whose schedule status is "Final", in ascending game order.

    :param season: int, the season. If None, the current season is used
    :param force_overwrite: bool. If true, parses all games. If false, only previously unparsed ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    schedule = schedules.get_season_schedule(season)
    is_final = schedule.Status == "Final"
    final_games = schedule[is_final].Game.values
    final_games.sort()
    for gameid in final_games:
        parse_game_toi(season, gameid, force_overwrite)
Example #10
0
def find_recent_games(team1, team2=None, limit=1):
    """
    A convenience function that lists the latest games of the current season involving the specified team(s).
    Games whose schedule status is "Scheduled" are excluded.

    :param team1: str, a team
    :param team2: str, a team (optional). If given, only games involving both teams are kept
    :param limit: How many games to return

    :return: df with relevant rows, highest game id first
    """
    current_season = schedules.get_current_season()
    played = schedules.get_season_schedule(current_season)
    played = played[played.Status != "Scheduled"]

    team_ids = [team_info.team_as_id(team1)]
    if team2 is not None:
        team_ids.append(team_info.team_as_id(team2))

    # Apply one home-or-road filter per requested team
    for team_id in team_ids:
        involved = (played.Home == team_id) | (played.Road == team_id)
        played = played[involved]

    newest_first = played.sort_values('Game', ascending=False)
    return newest_first.iloc[:limit, :]
def add_players_to_file(filename,
                        focus_team,
                        season=None,
                        gamecol='Game',
                        periodcol='Period',
                        timecol='Time',
                        time_format='elapsed',
                        update_data=False,
                        player_output='names'):
    """
    Adds names of on-ice players to the end of each line, and writes to file in the same folder as input file.
    Specifically, adds 1 second to the time in the spreadsheet and adds players who were on the ice at that time.

    You cannot necessarily trust results when times coincide with stoppages--and it's worth checking faceoffs as well.

    :param filename: str, the file to read. Will save output as this filename but ending in "on-ice.csv"
    :param focus_team: str or int, e.g. 'WSH' or 'WPG'
    :param season: int. For 2007-08, use 2007. Defaults to current season.
    :param gamecol: str. The column holding game IDs (e.g. 20001). By default, looks for column called "Game"
    :param periodcol: str. The column holding period number/name (1, 2, 3, 4 or OT, etc). By default: "Period"
    :param timecol: str. The column holding time in period in M:SS format.
    :param time_format: str, how to interpret timecol. Use 'elapsed' or 'remaining'.
        E.g. the start of a period is 0:00 with elapsed and 20:00 in remaining.
    :param update_data: bool. If True, will autoupdate() data for given season. If not, will not update game data.
        Use when file includes data from games not already scraped.
    :param player_output: str, use 'names' or 'nums'. Currently only supports 'names'

    :return: nothing
    """
    # TODO handle date instead of season and game

    if season is None:
        season = schedules.get_current_season()
    if update_data:
        # Pass the season explicitly: with no argument autoupdate() refreshes the current season only,
        # which is not what the caller asked for when the file covers an earlier season
        autoupdate.autoupdate(season)

    df = _read_tracking_file(filename)
    df = add_times_to_file(df, periodcol, timecol, time_format)
    df = add_onice_players_to_df(df, focus_team, season, gamecol,
                                 player_output)
    _write_tracking_file(df, filename)
Example #12
0
def check_game_toi(season=None):
    """
    Rescrapes gone-final games if they do not pass the following checks:
        - The parsed toi can be read (no IOError)
        - The parsed toi has at least 3595 entries
        - (TODO: other checks)

    Only games that are "Final", have TOIStatus "Scraped" and have an id from 20001 through 30417 are checked.
    Games that fail are re-read via autoupdate.read_final_games and the team logs are updated for them.

    :param season: int, the season. If None (default), the current season is used

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    finals = sch.query(
        'Status == "Final" & TOIStatus == "Scraped" & Game >= 20001 & Game <= 30417'
    ).Game.values

    min_toi_length = 3595  # At least 3600 seconds in game, approx

    games_to_rescrape = []

    for game in finals:
        try:
            toi = parse_toi.get_parsed_toi(season, game)
        except IOError:
            games_to_rescrape.append(game)
            continue

        # Explicit comparison rather than `assert`: asserts are removed when Python runs with -O,
        # which would silently disable this check
        if len(toi) < min_toi_length:
            print('{0} {1}: parsed toi has only {2} entries, will rescrape'.format(season, game, len(toi)))
            games_to_rescrape.append(game)

        # TODO add other checks

    if games_to_rescrape:
        autoupdate.read_final_games(games_to_rescrape, season)
        teams.update_team_logs(season, force_games=games_to_rescrape)
Example #13
0
def find_recent_games(team1, team2=None, limit=1, season=None):
    """
    A convenience function that lists the latest games involving the specified team(s) in a season.
    Only games dated today or earlier are considered.

    :param team1: str, a team
    :param team2: str, a team (optional). If given, only games involving both teams are kept
    :param limit: How many games to return
    :param season: int, the season. If None (default), the current season is used

    :return: df with relevant rows, highest game id first
    """
    if season is None:
        season = schedules.get_current_season()
    today = datetime.datetime.now().strftime('%Y-%m-%d')

    candidates = schedules.get_season_schedule(season)
    # Filter on date rather than on Status != "Scheduled": status is stale if data hasn't been updated
    candidates = candidates[candidates.Date <= today]

    team_ids = [team_info.team_as_id(team1)]
    if team2 is not None:
        team_ids.append(team_info.team_as_id(team2))

    # Apply one home-or-road filter per requested team
    for team_id in team_ids:
        involved = (candidates.Home == team_id) | (candidates.Road == team_id)
        candidates = candidates[involved]

    newest_first = candidates.sort_values('Game', ascending=False)
    return newest_first.iloc[:limit, :]
Example #14
0
def get_season_dropdown_options():
    """
    Use for options in season dropdown.

    :return: list of dicts, one per season from 2010 through the current season. 'label' looks like
        2010-11 and 'value' is the starting year as an int
    """
    options = []
    for yr in range(2010, schedules.get_current_season()+1):
        next_yr_suffix = str(yr + 1)[2:]
        season_label = '{0:d}-{1:s}'.format(yr, next_yr_suffix)
        options.append({'label': season_label, 'value': yr})
    return options
Example #15
0
def get_game_graph_types():
    """
    Update this with more chart types for single games.

    :return: list of dicts with 'label' (text shown) and 'value' (short code) for each chart type
    """
    chart_types = (('Head-to-head', 'H2H'),
                   ('Game timeline', 'TL'))
    return [{'label': label, 'value': code} for label, code in chart_types]

#sch = reduced_schedule_dataframe(schedules.get_current_season())
# Runs at import time, before the app is built (clean_images_folder is defined elsewhere;
# presumably it clears previously generated chart images -- confirm)
clean_images_folder()

app = dash.Dash()

# Page layout: a title, a season dropdown, a game dropdown, a graph-type selector, and the chart image.
# The initial selections are the current season, game 20001, and the head-to-head chart.
app.layout = html.Div(children=[html.H1(children='Welcome to the app for scrapenhl2'),
                                html.Label('Select season'),
                                dcc.Dropdown(
                                    options=get_season_dropdown_options(),
                                    value=schedules.get_current_season(),
                                    id='season-dropdown'),
                                html.Label('Select game'),
                                # Game options are built once here for the current season
                                dcc.Dropdown(
                                    options=get_game_dropdown_options_for_season(schedules.get_current_season()),
                                    value=20001,
                                    id='game-dropdown'),
                                html.Label('Select graph type'),
                                dcc.RadioItems(
                                    id='game-graph-radio',
                                    options=get_game_graph_types(),
                                    value='H2H'),
                                # No src is set here for the chart image
                                html.Img(id='image', width=800)
                                ])

@app.callback(Output('game-dropdown', 'options'), [Input('season-dropdown', 'value')])
Example #16
0
                        game_h2h.game_h2h(season, gameid, save_file=h2hfile)
                        tweet_game_images(h2hfile, tlfile, hname, rname, status, data)
                        print('Success!')
                    except Exception as e:
                        print(data['text'], time.time(), e, e.args)
                        tweet_error("Sorry, there was an unknown error while making the charts (cc @muneebalamcu)",
                                    data)

            except Exception as e:
                print('Unexpected error')
                print(time.time(), data['text'], e, e.args)

# Start listening for tweets that mention @h2hbot.
# The try/except exists so that a sign-off message can be posted when the script is stopped:
# I'm using Pycharm, so pressing stop will create a KeyboardInterrupt
try:
    stream = MyStreamer(
        consumer_key,
        consumer_secret,
        access_token,
        access_token_secret
    )
    stream.statuses.filter(track='@h2hbot')
except KeyboardInterrupt:
    # Announce the shutdown unless running silently
    # NOTE(review): '%-H' (hour without zero padding) is a platform-specific strftime directive --
    # confirm it is supported wherever this runs
    if not SILENT:
        twitter.update_status(status="I'm turning off now ({0:s})".format(
            datetime.datetime.now().strftime('%Y-%m-%d %-H:%M ET')))
    # If any request triggered a scrape during this run, bring the current season's team logs up to date
    if SCRAPED_NEW:
        teams.update_team_logs(schedules.get_current_season())


Example #17
0
    def on_success(self, data):
        """
        Handles an incoming tweet and, if it requests a game, replies with that game's charts.

        Payloads are ignored if they have no 'text', contain a 'https://t.co/' link, or start with 'RT '.
        Otherwise:
            1. If player_cf_graphs(data) handles the tweet, stop.
            2. Work out season and game: first via games.find_playoff_game; failing that, a standalone
               4-digit number is the season (2015 through current season only; defaults to current) and a
               standalone 5-digit number is the game id; failing that, the most recent game between the two
               team abbreviations found in the text.
            3. Re-scrape the season if the schedule looks out of date for this game.
            4. Create and tweet the timeline and head-to-head charts if the game was in progress, its status
               changed, or no timeline image exists yet.

        Any unexpected error is printed and the tweet is dropped.

        :param data: dict, the tweet payload. Reads data['text']; data is also passed on to the reply helpers

        :return: nothing
        """
        global LAST_UPDATE, SCRAPED_NEW

        if 'text' in data:
            print(data['text'])

            if r'https://t.co/' in data['text']:
                print('This looks like an image')
                return
            if data['text'][:3] == 'RT ':
                print('This looks like a retweet')
                return

            try:
                if player_cf_graphs(data):
                    return

                # find_playoff_game raises ValueError when there is no game number, returns None (so the
                # unpacking raises TypeError) when it does not find exactly two teams, and raises IndexError
                # when no game matches. All of these just mean "not a playoff request": fall through
                try:
                    season, gameid = games.find_playoff_game(data['text'])
                except (ValueError, TypeError, IndexError):
                    season = None
                    gameid = None

                # Trailing space so a number at the very end of the tweet is still followed by whitespace
                text = data['text'] + ' '

                # Get season with a 4-digit regex
                if season is None:
                    season_match = re.search(r'\s\d{4}\s', text)
                    if season_match is not None:
                        season = int(season_match.group(0))
                        if season < 2015 or season > schedules.get_current_season():
                            tweet_error("Sorry, I don't have data for this season yet", data)
                            print('Invalid season')
                            return
                    else:
                        season = schedules.get_current_season()

                # Get game with a 5-digit regex
                if gameid is None:
                    gameid_match = re.search(r'\s\d{5}\s', text)
                    if gameid_match is not None:
                        gameid = int(gameid_match.group(0))
                        if not schedules.check_valid_game(season, gameid):
                            tweet_error("Sorry, this game ID doesn't look right", data)
                            print('Game ID not right')
                            return

                if gameid is None:
                    # Get team names
                    parts = data['text'].replace('@h2hbot', '').strip().split(' ')
                    team_abbrevs = []
                    for part in parts:
                        # [A-Za-z], not [A-z], which would also match the punctuation between 'Z' and 'a'
                        if re.match(r'[A-Za-z]{3}', part.strip()):
                            part = part.upper()
                            if team_info.team_as_id(part) is not None:
                                team_abbrevs.append(part)
                    if len(team_abbrevs) == 0:
                        print('Think this was a tagged discussion')
                        return
                    elif len(team_abbrevs) != 2:
                        tweet_error("Sorry, I need 2 teams. Found {0:d}. Make sure abbreviations are correct"
                                    .format(len(team_abbrevs)), data)
                        return

                    team1, team2 = team_abbrevs[:2]
                    gameid = games.most_recent_game_id(team1, team2)

                h2hfile = 'bot/{0:d}0{1:d}h2h.png'.format(season, gameid)
                tlfile = 'bot/{0:d}0{1:d}tl.png'.format(season, gameid)

                # Status before any re-scrape, to detect a change afterwards
                oldstatus = schedules.get_game_status(season, gameid)

                # Scrape only if:
                # Game is in current season AND
                # Game is today, and my schedule says it's "scheduled", OR
                # Game is today, and my schedule doesn't say it's final yet, and it's been at least
                #   5 min since last scrape, OR
                # Game was before today and my schedule doesn't say "final"
                # Update in these cases
                scrapeagain = False
                if season == schedules.get_current_season():
                    today = datetime.datetime.now().strftime('%Y-%m-%d')
                    gdata = schedules.get_game_data_from_schedule(season, gameid)
                    if gdata['Date'] == today:
                        if gdata['Status'] == 'Scheduled':
                            scrapeagain = True
                        elif gdata['Status'] != 'Final' and \
                                (LAST_UPDATE is None or time.time() - LAST_UPDATE >= 60 * 5):
                            scrapeagain = True
                    elif gdata['Date'] < today and gdata['Status'] != 'Final':
                        scrapeagain = True
                if scrapeagain:
                    autoupdate.autoupdate(season, update_team_logs=False)
                    LAST_UPDATE = time.time()
                    SCRAPED_NEW = True

                hname = schedules.get_home_team(season, gameid)
                rname = schedules.get_road_team(season, gameid)
                status = schedules.get_game_status(season, gameid)

                if 'In Progress' in oldstatus or status != oldstatus or not os.path.exists(tlfile):
                    try:
                        game_timeline.game_timeline(season, gameid, save_file=tlfile)
                        game_h2h.game_h2h(season, gameid, save_file=h2hfile)
                        tweet_game_images(h2hfile, tlfile, hname, rname, status, data)
                        print('Success!')
                    except Exception as e:
                        print(data['text'], time.time(), e, e.args)
                        tweet_error("Sorry, there was an unknown error while making the charts (cc @muneebalamcu)",
                                    data)

            except Exception as e:
                print('Unexpected error')
                print(time.time(), data['text'], e, e.args)