from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as soup


def webpage(team):
    # NBA is assumed to be a dict mapping full team names to the
    # three-letter abbreviations used in basketball-reference URLs.
    team_games = "https://www.basketball-reference.com/teams/" + NBA[team] + "/2020_games.html"
    u_client = u_req(team_games)
    team_page = u_client.read()
    u_client.close()
    team_soup = soup(team_page, "html.parser")
    season = team_soup.findAll("tr")  # one <tr> per row of the schedule table
    return season
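# A minimal usage sketch. The NBA mapping below is a hypothetical stand-in
# for the project's real team-name-to-abbreviation dict; only two entries
# are shown, following basketball-reference's URL abbreviations.
NBA = {"Boston Celtics": "BOS", "Los Angeles Lakers": "LAL"}

rows = webpage("Boston Celtics")
print(len(rows), "schedule rows scraped")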
def work():
    from bs4 import BeautifulSoup as soup
    from urllib.request import urlopen as u_req

    my_url = "https://thehackernews.com"
    u_client = u_req(my_url)
    page_html = u_client.read()
    u_client.close()
    page_soup = soup(page_html, "html.parser")
    # Each front-page headline sits in a div with these classes.
    container = page_soup.findAll("div", {"class": "clear home-right"})
    print("thehackernews.com---\n\n")
    for getter in container:
        print(getter.h2.text)
        print()
        print()
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as u_req
import json

week = 1
data = {}
while week < 18:
    data['week_' + str(week)] = []  # Eventually add in week data
    url = 'https://www.pro-football-reference.com/years/2019/week_' + str(week) + '.htm'
    # print(url)
    u_client = u_req(url)
    page_html = u_client.read()
    u_client.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "game_summary expanded nohover"})
    print(len(containers))
    for x in range(len(containers)):
        contain = containers[x]
        # contain = contain.findAll("table", {"class": "teams"})
        # print(contain)
    week += 1  # advance to the next week; without this the loop never terminates
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as u_req
import csv
from datetime import datetime

filename = "daily_gainer_and_losers_%s.csv" % datetime.now().date()
print(filename)
client = u_req("https://www.moneycontrol.com/")
page = client.read()
client.close()
page_soup = soup(page, 'html.parser')
# div ids on the page:
#   tgNifty:  Top Gainers Nifty
#   tgSensex: Top Gainers Sensex
#   tlNifty:  Top Losers Nifty
#   tlSensex: Top Losers Sensex
div_ids = ['tgNifty', 'tgSensex', 'tlNifty', 'tlSensex']
for i in div_ids:
    div = page_soup.findAll('div', {'id': i})
    tbody = div[0].findAll('tbody')
    tr = tbody[0].findAll('tr')
    for row in tr:
        td = row.findAll('td')
        t = [d.text for d in td]
        with open(filename, 'a+', newline='') as csvFile:
            # The original snippet was cut off here; appending the row with
            # csv.writer is the natural completion given the csv import above.
            writer = csv.writer(csvFile)
            writer.writerow(t)
import math
from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as soup


def get_clues_per_game(game_number):
    print(game_number)
    # JSON to return
    game_JSON = {}
    game_JSON["categories_sj"] = []
    game_JSON["categories_dj"] = []
    game_JSON["categories_fj"] = []
    game_JSON["clues_sj"] = {}
    game_JSON["clues_dj"] = {}
    game_JSON["clues_fj"] = {}

    # J! Archive URL
    url = "http://j-archive.com/showgame.php?game_id=" + str(game_number)

    # Open connection
    u_client = u_req(url)

    # Get source html and parse with soup
    page_html = u_client.read()
    u_client.close()
    page_soup = soup(page_html, "html.parser")

    # Get map of jeopardy round to category list
    categories = page_soup.findAll("td", {"class": "category_name"})
    categories_list = [cat.getText() for cat in categories]
    # 13 categories per game: 6 single jeopardy, 6 double jeopardy, 1 final
    category_counter = 0
    while category_counter < 13:
        category_counter_str = str(category_counter % 6)
        category_to_append = {}
        category_to_append["title"] = categories_list[category_counter]
        category_to_append["clues"] = [category_counter_str + "-0",
                                       category_counter_str + "-1",
                                       category_counter_str + "-2",
                                       category_counter_str + "-3",
                                       category_counter_str + "-4"]
        if category_counter < 6:
            game_JSON["categories_sj"].append(category_to_append)
        elif category_counter < 12:
            game_JSON["categories_dj"].append(category_to_append)
        else:
            category_to_append["clues"] = ["0-0"]
            game_JSON["categories_fj"].append(category_to_append)
        category_counter += 1

    # Get clue attrs
    clues = page_soup.findAll("td", {"class": "clue"})

    # Extract text, id, value, and answer from the clue
    clue_questions = [clue.findAll("td", {"class": "clue_text"})[0].getText()
                      for clue in clues if clue.div is not None]
    clue_ids = [clue.div.findAll("td", {"class": "clue_unstuck"})[0]['id']
                for clue in clues
                if clue.div is not None
                and len(clue.div.findAll("td", {"class": "clue_unstuck"})) > 0]
    clue_answers = [clue.div['onmouseover'].split("correct_response\">")[1].split("</em>")[0]
                    for clue in clues if clue.div is not None]
    clean_clue_answers = []
    for answer in clue_answers:
        clean_answer = answer.replace("<i>", "").replace("</i>", "").replace("\\", "")
        clean_clue_answers.append(clean_answer)

    # Exclude clues that they didn't get to during the game.
    # Board ids follow the pattern clue_<round>_<column>_<row>_stuck, with the
    # column varying fastest across each row of the board.
    all_clue_ids = ['clue_{}_{}_{}_stuck'.format(rnd, col, row)
                    for rnd in ('J', 'DJ')
                    for row in range(1, 6)
                    for col in range(1, 7)]
    excluded_clues = list(set(all_clue_ids).difference(clue_ids))
    for ex_clue in excluded_clues:
        all_clue_ids[all_clue_ids.index(ex_clue)] = "unused"

    # Add clues to JSON
    for i in range(len(clue_ids)):
        row_str = str(i % 6)
        if i < 30:
            # Single jeopardy: values run $200-$1000 down the board
            col_str = str(math.floor(i / 6))
            add_clue_sj = {}
            if all_clue_ids[i] == "unused":
                add_clue_sj["question"] = "Unused question"
                add_clue_sj["answer"] = "Unused answer"
            else:
                add_clue_sj["question"] = clue_questions[i]
                add_clue_sj["answer"] = clean_clue_answers[i]
            add_clue_sj["value"] = (math.floor(i / 6) + 1) * 200
            add_clue_sj["is_dd"] = False
            game_JSON["clues_sj"][row_str + "-" + col_str] = add_clue_sj
        else:
            # Double jeopardy: values run $400-$2000 down the board
            col_str = str(math.floor(i / 6) - 5)
            add_clue_dj = {}
            if all_clue_ids[i] == "unused":
                add_clue_dj["question"] = "Unused question"
                add_clue_dj["answer"] = "Unused answer"
            else:
                add_clue_dj["question"] = clue_questions[i]
                add_clue_dj["answer"] = clean_clue_answers[i]
            add_clue_dj["value"] = (math.floor((i - 30) / 6) + 1) * 400
            add_clue_dj["is_dd"] = False
            game_JSON["clues_dj"][row_str + "-" + col_str] = add_clue_dj

    # Handle final jeopardy separately
    final_jeopardy = page_soup.findAll("table", {"class": "final_round"})[0]
    add_clue_fj = {}
    add_clue_fj["question"] = page_soup.findAll("td", {"id": "clue_FJ"})[0].getText()
    add_clue_fj["answer"] = final_jeopardy.div['onmouseover'].split("correct_response")[1].split("</em>")[0][3:]
    add_clue_fj["value"] = 10000
    add_clue_fj["is_dd"] = False
    game_JSON["clues_fj"]["0-0"] = add_clue_fj
    # pp.pprint(game_JSON)
    return game_JSON
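# A minimal usage sketch: game_id values are J! Archive's internal ids, so
# 6000 here is just an illustrative number, and the site's markup may have
# changed since this scraper was written.
game = get_clues_per_game(6000)
print(game["categories_sj"][0]["title"])
print(game["clues_sj"]["0-0"]["question"])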
import numpy as np
import pandas as pd
from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as Soup


def get_week_moneylines(week, season=2017):
    """Collect the historical moneylines for a given week/season.

    Keyword arguments:
    week -- week to query
    season -- season to query, defaults to 2017

    Output arguments:
    df -- frame indexed by game, with columns (season, Week, home, away,
          favorite, homeML, awayML, favorite_odds)
    """
    # Collect page data
    casino_id = '2&wjb'
    my_url = 'http://m.espn.com/nfl/dailyline?week={}&season={}&seasonType={}'.format(
        week, season, casino_id)

    # open connection
    u_client = u_req(my_url)

    # download page
    page_html = u_client.read()
    u_client.close()

    # let beautiful soup parse it
    page_soup = Soup(page_html, "html.parser")
    table_data = page_soup.table.find_all("td")

    # Parse data: each game occupies four consecutive table cells
    n_games = int(np.floor(len(table_data) / 4))
    teams = np.empty(shape=[n_games, 2], dtype='U4')
    mline = np.empty(shape=[n_games, 2])
    favorite_odds = np.empty(shape=[n_games, 1])
    favorite = np.empty(n_games, dtype='U4')
    count = 0
    for i in range(0, len(table_data), 4):
        # team_name2abbrv and probability_favorite_moneyline are project
        # helpers assumed to be defined elsewhere in the module.
        teams[count, :] = np.asarray(
            team_name2abbrv([table_data[i].contents[idx] for idx in [0, 2]]))
        mline[count, :] = [
            int(table_data[i + 1].contents[idx]) for idx in [0, 2]
        ]
        favorite_odds[count] = probability_favorite_moneyline(
            mline[count, 0], mline[count, 1])
        if mline[count, 1] < 0:
            favorite[count] = teams[count, 1]
        else:
            favorite[count] = teams[count, 0]
        count = count + 1

    # Assign to data frame
    game_colname = []
    for game in range(teams.shape[0]):
        game_colname.append(teams[game, 0] + '_' + teams[game, 1])
    df = pd.DataFrame(columns=[
        'season', 'Week', 'home', 'away', 'favorite', 'homeML', 'awayML',
        'favorite_odds'
    ], index=game_colname)
    df['season'] = season
    df['Week'] = week
    df['favorite'] = favorite
    df[['home', 'away']] = teams
    df['favorite_odds'] = favorite_odds
    df[['homeML', 'awayML']] = mline
    return df
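# probability_favorite_moneyline is referenced above but not defined in this
# snippet. A plausible sketch, assuming it applies the standard American-odds
# implied-probability formula to each side and normalizes away the vig; the
# project's real helper may differ.
def probability_favorite_moneyline(ml_home, ml_away):
    def implied(ml):
        # Negative moneylines mark the favorite: -150 implies 150/250 = 0.6
        return -ml / (-ml + 100.0) if ml < 0 else 100.0 / (ml + 100.0)

    p_home, p_away = implied(ml_home), implied(ml_away)
    total = p_home + p_away  # exceeds 1 because of the bookmaker's margin
    return max(p_home, p_away) / total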
import re
import urllib.error
import numpy as np
import pandas as pd
from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as Soup


def get_odds_current_week(week_number=None):
    """Get odds for the current week.

    Output parameters:
    df -- DataFrame indexed by game, with columns (season, Week, home, away,
          favorite, favorite_odds)
    """
    # team_name2abbrv and probability_favorite_spread are project helpers
    # assumed to be defined elsewhere in the module.

    # Define functions for cleaner code
    def get_favorite_team_names(game_names, n_games, favorite):
        favorite_team_name_full = []
        home = []
        away = []
        for g in range(n_games):
            # Game names look like "Away Team at Home Team - ..."
            s = game_names[g].split()
            idx_at = s.index('at')
            idx_dash = s.index('-')
            if favorite[g] == 'Home':
                favored_team = " ".join(s[(idx_at + 1):idx_dash])
                # under_dog_team = " ".join(s[0:idx_at])
            else:
                favored_team = " ".join(s[0:idx_at])
                # under_dog_team = " ".join(s[(idx_at + 1):idx_dash])
            favorite_team_name_full.append(team_name2abbrv(favored_team)[0])
            home_current_game = team_name2abbrv(' '.join(s[(idx_at + 1):idx_dash]))[0]
            away_current_game = team_name2abbrv(' '.join(s[0:idx_at]))[0]
            home.append(home_current_game)
            away.append(away_current_game)
        return favorite_team_name_full, home, away

    def add_game_results(spread, favorite, spread_p, moneyline_p):
        # A positive mean spread means the home team is favored
        if np.mean(spread) > 0:
            favorite.append('Home')
        else:
            favorite.append('Away')
        spread_p = np.append(spread_p, np.mean(spreads_p_current_game))
        moneyline_p = np.append(moneyline_p, np.mean(moneylines_p_current_game))
        return spread_p, moneyline_p, favorite

    def reset_one_game_variables():
        spread = np.array([])
        temp_spreads_p = np.array([])
        temp_moneylines_p = np.array([])
        return spread, temp_spreads_p, temp_moneylines_p

    def parse_current_source(current_row):
        row_data = current_row.find_all("td")
        if row_data[1].td is not None:
            spread_current_source = float(row_data[1].td.contents[0])
        else:
            spread_current_source = None
        if len(row_data) < 5:
            moneyline_current_source = [None, None]
        else:
            moneyline_current_source = \
                [re.findall(r'-?\d+', line)
                 for line in [row_data[7].td.contents[i] for i in [0, 2]]]
            moneyline_current_source = [
                int(moneyline_current_source[0][0]),
                int(moneyline_current_source[1][0])
            ]
        return spread_current_source, moneyline_current_source

    if week_number is None:
        week_number = int(input('What is the current week?\n'))

    # ## Get the page
    my_url = 'http://www.espn.com/nfl/lines'

    # open connection
    class NoInternet(Exception):
        pass

    try:
        u_client = u_req(my_url)
    except urllib.error.URLError:
        raise NoInternet("\n\nNot connected to the internet")

    # download page
    page_html = u_client.read()
    u_client.close()

    # let beautiful soup parse it
    page_soup = Soup(page_html, "html.parser")

    # Extract the table
    page_table = page_soup.table

    # Get the games
    games = page_table.find_all('tr', {'class': 'stathead'})
    n_games = len(games)
    game_names = [g.string[0:-5] for g in games]

    # ## Calculate probabilities from moneyline and spread
    # 1) Get moneyline and spread
    # 2) Convert each valid one to a probability
    # 3) Average the probabilities
    moneylines_p_current_game = np.array([])  # moneyline probability per source
    spreads_p_current_game = np.array([])     # spread probability per source
    spreads_current_game = np.array([])       # spread from each source for a game
    moneyline_p = np.array([])                # moneyline probability per game
    spread_p = np.array([])                   # spread probability per game

    # Home/away favorite?
    favorite = []

    # Iterate through all rows in the table; includes spreads, moneylines,
    # team names, footers, etc.
    for row in games[0].next_siblings:
        is_new_game = row["class"] == ['stathead']  # contains info on the game (e.g. teams)
        is_new_source = ((row["class"] == ['oddrow']) or
                         (row["class"] == ['evenrow'])) and (row.p is None)
        if is_new_game:
            # process information from just finished game
            spread_p, moneyline_p, favorite = add_game_results(
                spreads_current_game, favorite, spread_p, moneyline_p)
            spreads_current_game, spreads_p_current_game, moneylines_p_current_game = \
                reset_one_game_variables()
        if is_new_source:
            spread_current_source, moneyline_current_source = parse_current_source(row)
            if spread_current_source is not None:
                # add this source to the array holding spreads
                spreads_current_game = np.append(spreads_current_game,
                                                 spread_current_source)
            else:
                continue
            # Get probabilities from spread and moneyline (if available)
            spreads_p_current_game = np.append(
                spreads_p_current_game,
                probability_favorite_spread(spread_current_source))
            # Skip sources with no moneyline (None) or a zero line
            if any(line is None or line == 0 for line in moneyline_current_source):
                continue
            moneylines_p_current_game = np.append(
                moneylines_p_current_game,
                probability_favorite_moneyline(
                    moneyline_current_source[0],
                    moneyline_current_source[1]))

    # Close out the final game, which has no trailing 'stathead' row
    spread_p, moneyline_p, favorite = add_game_results(
        spreads_current_game, favorite, spread_p, moneyline_p)

    favorite_team_name, home, away = get_favorite_team_names(
        game_names, n_games, favorite)

    game_colname = []
    for game in range(len(home)):
        game_colname.append(away[game] + '_' + home[game])
    df = pd.DataFrame(columns=[
        'season', 'Week', 'home', 'away', 'favorite', 'favorite_odds'
    ], index=game_colname)
    df['home'] = home
    df['away'] = away
    df['favorite'] = favorite_team_name
    # df['favorite home or away'] = favorite
    df['favorite_odds'] = moneyline_p
    df['season'] = 2017
    df['Week'] = week_number
    return df
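# A minimal usage sketch. The scrape depends on espn.com's 2017-era markup,
# so this is illustrative rather than guaranteed to run against the live site.
if __name__ == '__main__':
    odds_df = get_odds_current_week(week_number=5)
    print(odds_df[['favorite', 'favorite_odds']])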