Esempio n. 1
0
def scrape_games(games, if_scrape_shifts, data_format='csv', rescrape=False, docs_dir=None):
    """
    Scrape play-by-play (and optionally shift) data for an explicit list of games.

    :param games: list of game_ids
    :param if_scrape_shifts: Boolean - True if shifts should be scraped as well
    :param data_format: how the data is handed back - 'csv' (default) or 'pandas'
    :param rescrape: Whether pages already scraped should be scraped again. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either holds previously scraped docs or is where you want them deposited
                     after scraping

    :return: None when data_format is 'csv' (the data goes to to_csv instead). Otherwise a dictionary
             with "pbp" and "errors", plus "shifts" when if_scrape_shifts is truthy.
    """
    # Validate the requested output format before doing any work
    check_data_format(data_format)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Pair every game_id with its date, then scrape them all
    dated_games = json_schedule.get_dates(games)
    plays, shifts = scrape_list_of_games(dated_games, if_scrape_shifts)

    if data_format.lower() == 'csv':
        # No natural label exists for an arbitrary list of games, so a random number names the output
        to_csv(str(random.randint(1, 101)), plays, shifts)
        return None

    scraped = {"pbp": plays}
    if if_scrape_shifts:
        scraped["shifts"] = shifts
    scraped["errors"] = errors
    return scraped
Esempio n. 2
0
def scrape_date_range(from_date, to_date, if_scrape_shifts, data_format='csv', preseason=False, rescrape=False, docs_dir=None):
    """
    Scrape every game scheduled between two dates.

    :param from_date: first date of the range to scrape
    :param to_date: last date of the range to scrape
    :param if_scrape_shifts: Boolean - True if shifts should be scraped as well
    :param data_format: how the data is handed back - 'csv' (default) or 'pandas'
    :param preseason: Boolean - True to include preseason games (default is False).
                      NOTE: preseason scraping is not guaranteed to work.
    :param rescrape: Whether pages already scraped should be scraped again. Only applies if you supply a docs dir.
                     (default is False)
    :param docs_dir: Directory that either holds previously scraped docs or is where you want them deposited
                     after scraping. (default is None)

    :return: None when data_format is 'csv' (the data goes to to_csv instead). Otherwise a dictionary
             with "pbp" and "errors", plus "shifts" when if_scrape_shifts is truthy.
    """
    # Validate the inputs before doing any work
    check_data_format(data_format)
    check_valid_dates(from_date, to_date)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Look up the schedule for the range and scrape each game in it
    schedule = json_schedule.scrape_schedule(from_date, to_date, preseason)
    plays, shifts = scrape_list_of_games(schedule, if_scrape_shifts)

    if data_format.lower() == 'csv':
        # The output is labelled with the date range itself
        to_csv('--'.join((from_date, to_date)), plays, shifts)
        return None

    scraped = {"pbp": plays}
    if if_scrape_shifts:
        scraped["shifts"] = shifts
    scraped["errors"] = errors
    return scraped
def scrape_games(games, data_format='csv', rescrape=False, docs_dir=None):
    """
    Scrape play-by-play data for an explicit list of games.

    :param games: list of game_ids
    :param data_format: how the data is handed back - 'csv' (default) or 'pandas'
    :param rescrape: Whether pages already scraped should be scraped again. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either holds previously scraped docs or is where you want them deposited
                     after scraping

    :return: None when data_format is 'csv' (the data goes to shared.to_csv instead). Otherwise the
             play-by-play result produced by scrape_list_of_games.
    """
    # Validate the requested output format before doing any work
    shared.check_data_format(data_format)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    plays = scrape_list_of_games(games)
    print_errors()

    if data_format.lower() != 'csv':
        return plays

    # No natural label exists for an arbitrary list of games, so a random number names the output
    shared.to_csv(str(random.randint(1, 101)), plays, None, "nwhl")
    return None
def scrape_seasons(seasons,
                   if_scrape_shifts,
                   data_format='csv',
                   preseason=False,
                   rescrape=False,
                   docs_dir=None):
    """
    Scrape every game of each season in the given list.

    Each season is scraped over the date window <season>-9-1 through <season + 1>-7-1.

    :param seasons: list of seasons, each given as its starting year (must support ``season + 1``)
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param preseason: Boolean indicating whether to include preseason games (default is False).
                      NOTE: preseason scraping is not guaranteed to work.
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping

    :return: None when data_format is 'csv' (one output per season goes to shared.to_csv). When it is
             'pandas', a dictionary with "pbp" and "errors", plus "shifts" when if_scrape_shifts is truthy.
             If nothing was scraped (e.g. ``seasons`` is empty) the DataFrames in it are empty.
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Normalise once instead of on every loop iteration
    fmt = data_format.lower()

    # Holds all seasons scraped (if not csv)
    master_pbps, master_shifts = [], []

    for season in seasons:
        from_date = '-'.join([str(season), '9', '1'])
        to_date = '-'.join([str(season + 1), '7', '1'])

        games = json_schedule.scrape_schedule(from_date, to_date, preseason)
        pbp_df, shifts_df = scrape_list_of_games(games, if_scrape_shifts)

        if fmt == 'csv':
            # Output is labelled with both years of the season, e.g. "20162017"
            shared.to_csv(
                str(season) + str(season + 1), pbp_df, shifts_df, "nhl")
        else:
            master_pbps.append(pbp_df)
            master_shifts.append(shifts_df)

    if fmt != 'pandas':
        return None

    # pd.concat silently drops None entries but raises ValueError when nothing is left to
    # concatenate (empty list or all None), so fall back to an empty DataFrame in that case.
    pbp_frames = [frame for frame in master_pbps if frame is not None]
    scraped = {"pbp": pd.concat(pbp_frames) if pbp_frames else pd.DataFrame()}

    if if_scrape_shifts:
        shift_frames = [frame for frame in master_shifts if frame is not None]
        scraped["shifts"] = pd.concat(shift_frames) if shift_frames else pd.DataFrame()

    scraped["errors"] = errors
    return scraped
Esempio n. 5
0
def set_docs_dir(user_dir):
    """
    Register the directory used for storing scraped files.

    Rescraping is switched on as part of this call.

    :param user_dir: User specified directory for storing scraped files

    :return: None
    """
    # The source files are updated constantly, so stored copies are never trusted - always rescrape
    shared.if_rescrape(True)

    shared.add_dir(user_dir)
def scrape_seasons(seasons, data_format='csv', rescrape=False, docs_dir=None):
    """
    Scrape every game of each season in the given list.

    Each season is scraped over the date window <season>-9-1 through <season + 1>-8-31, and the
    schedule's 'sub_type' column is merged into the play-by-play data on 'game_id'.

    :param seasons: list of seasons, each given as its starting year (must support ``season + 1``)
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping

    :return: None when data_format is 'csv' (one output per season goes to shared.to_csv). When it is
             'pandas', a single DataFrame with all seasons concatenated; it is empty if no season
             was scraped (e.g. ``seasons`` is empty).
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Normalise once instead of on every loop iteration
    fmt = data_format.lower()

    # Holds all seasons scraped (if not csv)
    master_pbps = []

    for season in seasons:
        from_date = '-'.join([str(season), '9', '1'])
        to_date = '-'.join([str(season + 1), '8', '31'])

        # Get dates and convert to just a list of game ids
        games = html_schedule.scrape_dates(from_date, to_date)
        game_ids = [game['game_id'] for game in games]

        # Scrape all PBP
        pbp_df = scrape_list_of_games(game_ids)

        # Merge in subtype; a left join keeps every play even if its game has no schedule row
        pbp_df = pd.merge(pbp_df,
                          pd.DataFrame(games, columns=['game_id', 'sub_type']),
                          on="game_id",
                          how="left")

        if fmt == 'csv':
            # Output is labelled with both years of the season, e.g. "20162017"
            shared.to_csv(str(season) + str(season + 1), pbp_df, None, "nwhl")
        else:
            master_pbps.append(pbp_df)

    print_errors()
    if fmt == 'pandas':
        # pd.concat raises ValueError on an empty list, so return an empty DataFrame instead
        if not master_pbps:
            return pd.DataFrame()
        return pd.concat(master_pbps, sort=True)
    return None
def scrape_date_range(from_date,
                      to_date,
                      data_format='csv',
                      rescrape=False,
                      docs_dir=None):
    """
    Scrape every game scheduled between two dates.

    The schedule's 'sub_type' column is merged into the play-by-play data on 'game_id'.

    :param from_date: first date of the range to scrape
    :param to_date: last date of the range to scrape
    :param data_format: how the data is handed back - 'csv' (default) or 'pandas'
    :param rescrape: Whether pages already scraped should be scraped again. Only applies if you supply a docs dir.
                     (default is False)
    :param docs_dir: Directory that either holds previously scraped docs or is where you want them deposited
                     after scraping. (default is None)

    :return: None when data_format is 'csv' (the data goes to shared.to_csv instead). Otherwise the
             merged play-by-play DataFrame.
    """
    # Validate the inputs before doing any work
    shared.check_data_format(data_format)
    shared.check_valid_dates(from_date, to_date)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Schedule entries for the range; only the ids are needed for scraping
    schedule = html_schedule.scrape_dates(from_date, to_date)
    ids = [entry['game_id'] for entry in schedule]

    plays = scrape_list_of_games(ids)

    # Attach each game's sub_type; a left join keeps every play even without a schedule match
    sub_types = pd.DataFrame(schedule, columns=['game_id', 'sub_type'])
    plays = pd.merge(plays, sub_types, on="game_id", how="left")

    print_errors()

    if data_format.lower() != 'csv':
        return plays

    # The output is labelled with the date range itself
    shared.to_csv('--'.join((from_date, to_date)), plays, None, "nwhl")
    return None