def scrape_games(games, data_format='csv', rescrape=False, docs_dir=None):
    """
    Scrape the play-by-play for an explicit list of games.

    :param games: list of game_ids
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping

    :return: The play-by-play DataFrame when data_format is not csv; None when the data was written to csv
    """
    # Validate the requested output format before doing any work
    shared.check_data_format(data_format)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    plays = scrape_list_of_games(games)
    print_errors()

    # Anything other than csv hands the frame straight back to the caller
    if data_format.lower() != 'csv':
        return plays

    # NOTE: the file label is a random integer in 1..101, so labels from separate runs can collide
    file_label = str(random.randint(1, 101))
    shared.to_csv(file_label, plays, None, "nwhl")
    return None
def scrape_seasons(seasons, if_scrape_shifts, data_format='csv', preseason=False, rescrape=False,
                   docs_dir=None):
    """
    Scrape every game of each of the given seasons.

    Each season is scraped from September 1 of its starting year through July 1 of the following year.

    :param seasons: list of seasons (each one the starting year; it must support ``season + 1``)
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param preseason: Boolean indicating whether to include preseason games (default is False).
                      Preseason scraping is not guaranteed to work.
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping

    :return: For pandas - a dictionary with "pbp", "errors" and (only if if_scrape_shifts) "shifts".
             Otherwise None (for csv one file set is written per season).
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # The format never changes while scraping, so normalize it once
    chosen_format = data_format.lower()

    # Holds all seasons scraped (if not csv)
    master_pbps, master_shifts = [], []

    for season in seasons:
        from_date = '-'.join([str(season), '9', '1'])
        to_date = '-'.join([str(season + 1), '7', '1'])

        games = json_schedule.scrape_schedule(from_date, to_date, preseason)
        pbp_df, shifts_df = scrape_list_of_games(games, if_scrape_shifts)

        if chosen_format == 'csv':
            shared.to_csv(str(season) + str(season + 1), pbp_df, shifts_df, "nhl")
        else:
            master_pbps.append(pbp_df)
            master_shifts.append(shifts_df)

    if chosen_format != 'pandas':
        return None

    # pd.concat raises ValueError on an empty list, so an empty `seasons` yields empty frames instead
    all_pbp = pd.concat(master_pbps) if master_pbps else pd.DataFrame()

    if if_scrape_shifts:
        all_shifts = pd.concat(master_shifts) if master_shifts else pd.DataFrame()
        return {"pbp": all_pbp, "shifts": all_shifts, "errors": errors}

    return {"pbp": all_pbp, "errors": errors}
def scrape_seasons(seasons, data_format='csv', rescrape=False, docs_dir=None):
    """
    Scrape the play-by-play for every game of each of the given seasons.

    Each season is scraped from September 1 of its starting year through August 31 of the following year.

    :param seasons: list of seasons (each one the starting year; it must support ``season + 1``)
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping

    :return: For pandas - one DataFrame with the play-by-play of all seasons (empty if no seasons were given).
             Otherwise None (for csv one file is written per season).
    """
    # First check if the inputs are good
    shared.check_data_format(data_format)

    # Check on the docs_dir and re_scrape
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # The format never changes while scraping, so normalize it once
    chosen_format = data_format.lower()

    # Holds all seasons scraped (if not csv)
    master_pbps = []

    for season in seasons:
        from_date = '-'.join([str(season), '9', '1'])
        to_date = '-'.join([str(season + 1), '8', '31'])

        # Get dates and convert to just a list of game ids
        games = html_schedule.scrape_dates(from_date, to_date)
        game_ids = [game['game_id'] for game in games]

        # Scrape all PBP
        pbp_df = scrape_list_of_games(game_ids)

        # Merge in subtype (left join keeps every play even when a game has no schedule entry)
        sub_types = pd.DataFrame(games, columns=['game_id', 'sub_type'])
        pbp_df = pd.merge(pbp_df, sub_types, on="game_id", how="left")

        if chosen_format == 'csv':
            shared.to_csv(str(season) + str(season + 1), pbp_df, None, "nwhl")
        else:
            master_pbps.append(pbp_df)

    print_errors()

    if chosen_format != 'pandas':
        return None

    # pd.concat raises ValueError on an empty list, so an empty `seasons` yields an empty frame instead
    if not master_pbps:
        return pd.DataFrame()

    return pd.concat(master_pbps, sort=True)
def scrape_date_range(from_date, to_date, if_scrape_shifts, data_format='csv', preseason=False,
                      rescrape=False, docs_dir=None):
    """
    Scrape all the games scheduled within a date range.

    :param from_date: date you want to scrape from
    :param to_date: date you want to scrape to
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param preseason: Boolean indicating whether to include preseason games (default is False).
                      Preseason scraping is not guaranteed to work.
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
                     (default is False)
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping. (default is None)

    :return: None for csv. Otherwise a dictionary with "pbp", "errors" and (only if if_scrape_shifts) "shifts".
    """
    # Validate the inputs before doing any work
    shared.check_data_format(data_format)
    shared.check_valid_dates(from_date, to_date)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    schedule = json_schedule.scrape_schedule(from_date, to_date, preseason)
    plays, shifts = scrape_list_of_games(schedule, if_scrape_shifts)

    if data_format.lower() == 'csv':
        shared.to_csv(from_date + '--' + to_date, plays, shifts, "nhl")
        return None

    # Shifts are only reported back when they were asked for
    if if_scrape_shifts:
        return {"pbp": plays, "shifts": shifts, "errors": errors}

    return {"pbp": plays, "errors": errors}
def scrape_date_range(from_date, to_date, data_format='csv', rescrape=False, docs_dir=None):
    """
    Scrape the play-by-play for all the games scheduled within a date range.

    :param from_date: date you want to scrape from
    :param to_date: date you want to scrape to
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
                     (default is False)
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping. (default is None)

    :return: The play-by-play DataFrame when data_format is not csv; None when the data was written to csv
    """
    # Validate the inputs before doing any work
    shared.check_data_format(data_format)
    shared.check_valid_dates(from_date, to_date)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # The schedule entries carry both the game id and the sub type
    schedule = html_schedule.scrape_dates(from_date, to_date)
    ids = [entry['game_id'] for entry in schedule]

    plays = scrape_list_of_games(ids)

    # Attach each game's sub type; the left join keeps plays whose game has no schedule match
    sub_types = pd.DataFrame(schedule, columns=['game_id', 'sub_type'])
    plays = pd.merge(plays, sub_types, on="game_id", how="left")

    print_errors()

    if data_format.lower() != 'csv':
        return plays

    shared.to_csv(from_date + '--' + to_date, plays, None, "nwhl")
    return None
def scrape_games(games, if_scrape_shifts, data_format='csv', rescrape=False, docs_dir=None):
    """
    Scrape an explicit list of games.

    :param games: list of game_ids
    :param if_scrape_shifts: Boolean indicating whether to also scrape shifts
    :param data_format: format you want data in - csv or pandas (csv is default)
    :param rescrape: If you want to rescrape pages already scraped. Only applies if you supply a docs dir.
    :param docs_dir: Directory that either contains previously scraped docs or one that you want them to be
                     deposited in after scraping

    :return: None for csv. Otherwise a dictionary with "pbp", "errors" and (only if if_scrape_shifts) "shifts".
    """
    # Validate the requested output format before doing any work
    shared.check_data_format(data_format)

    # Register the docs directory and the rescrape preference
    shared.add_dir(docs_dir)
    shared.if_rescrape(rescrape)

    # Look up the dates for the given game ids
    dated_games = json_schedule.get_dates(games)

    # Scrape pbp and shifts
    plays, shifts = scrape_list_of_games(dated_games, if_scrape_shifts)

    if data_format.lower() == 'csv':
        # NOTE: the file label is a random integer in 1..101, so labels from separate runs can collide
        file_label = str(random.randint(1, 101))
        shared.to_csv(file_label, plays, shifts, "nhl")
        return None

    # Shifts are only reported back when they were asked for
    if if_scrape_shifts:
        return {"pbp": plays, "shifts": shifts, "errors": errors}

    return {"pbp": plays, "errors": errors}
def test_check_data_format():
    """ Test that only the allowed data formats are accepted """
    # Mixed-case spellings of the allowed formats must pass without raising
    for allowed in ("Csv", "pandaS"):
        shared.check_data_format(allowed)

    # An unsupported format should raise an exception
    with pytest.raises(shared.HaltException):
        shared.check_data_format("txt")