Example #1
from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as soup


def webpage(team):
    # NBA is a dict (defined elsewhere in the source project) mapping
    # team names to basketball-reference.com team abbreviations
    team_games = ("https://www.basketball-reference.com/teams/"
                  + NBA[team] + "/2020_games.html")

    u_client = u_req(team_games)
    team_page = u_client.read()
    u_client.close()

    team_soup = soup(team_page, "html.parser")
    season = team_soup.findAll("tr")
    return season
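The NBA lookup is not part of the snippet; a minimal sketch of the mapping it assumes, with a hypothetical call (the dict keys shown are examples, not the project's actual keys):

# hypothetical subset of the team-to-abbreviation mapping webpage() expects
NBA = {
    "Celtics": "BOS",
    "Lakers": "LAL",
    "Raptors": "TOR",
}

rows = webpage("Celtics")  # every <tr> on the team's 2020 schedule page
print(len(rows))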
Example #2
def work():
    from bs4 import BeautifulSoup as soup
    from urllib.request import urlopen as u_req
    my_url = "https://thehackernews.com"
    u_client = u_req(my_url)
    page_html = u_client.read()
    u_client.close()
    page_soup = soup(page_html, "html.parser")
    # headline containers on the home page
    container = page_soup.findAll("div", {"class": "clear home-right"})
    print("thehackernews.com---\n\n")
    for getter in container:
        print(getter.h2.text)
        print()
    print()
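Since work() opens a live network connection, a caller may want to guard against connectivity failures; a small usage sketch (the try/except guard is an addition, not part of the original):

from urllib.error import URLError

try:
    work()  # prints the current headlines
except URLError as err:
    print("could not reach thehackernews.com:", err)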
Example #3
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as u_req
import json

week = 1
data = {}

while week < 18:
    data['week_' + str(week)] = []

    # build the schedule URL for this week
    url = ('https://www.pro-football-reference.com/years/2019/week_'
           + str(week) + '.htm')

    #print(url)

    u_client = u_req(url)
    page_html = u_client.read()
    u_client.close()

    page_soup = soup(page_html, "html.parser")

    containers = page_soup.findAll("div",
                                   {"class": "game_summary expanded nohover"})

    print(len(containers))

    for x in range(len(containers)):
        contain = containers[x]
        # contain = contain.findAll("table"), {"class": "teams"}
        # print(contain)

    week += 1  # advance to the next week; without this the loop never ends
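The parsing body is commented out, and both data and the json import go unused. A hedged sketch of one way the intent might be finished: the "teams" table class comes from the commented-out line, but the anchor-text extraction and output filename are assumptions:

    # inside the while loop, once `containers` is built:
    for contain in containers:
        teams_table = contain.findAll("table", {"class": "teams"})
        if teams_table:
            # team names appear as links inside the teams table (assumed)
            names = [a.text for a in teams_table[0].findAll("a")]
            data['week_' + str(week)].append(names)

# after the loop, persist everything collected
with open('2019_results.json', 'w') as f:  # hypothetical filename
    json.dump(data, f, indent=2)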
Example #4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as u_req
import csv
from datetime import datetime

filename = "daily_gainer_and_losers_%s.csv" % datetime.date(datetime.now())
print(filename)
client = u_req("https://www.moneycontrol.com/")
page = client.read()
client.close()
page_soup = soup(page, 'html.parser')

div_ids = ['tgNifty', 'tgSensex', 'tlNifty', 'tlSensex']
'''
id for each div such that

tgNifty: Top Gainers Nifty
tgSensex: Top Gainers Sensex
tlNifty: Top Losers Nifty
tlSensex: Top Losers Sensex
'''

for div_id in div_ids:

    div = page_soup.findAll('div', {'id': div_id})
    tbody = div[0].findAll('tbody')
    tr = tbody[0].findAll('tr')
    for row in tr:
        td = row.findAll('td')
        t = [d.text for d in td]
        # the original snippet is truncated here; appending the row with
        # csv (already imported) is the evident intent
        with open(filename, 'a+', newline='') as csvFile:
            csv.writer(csvFile).writerow(t)
Example #5
import math
from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as soup


def get_clues_per_game(game_number):
    print(game_number)
    # JSON to return
    game_JSON = {}
    game_JSON["categories_sj"] = []
    game_JSON["categories_dj"] = []
    game_JSON["categories_fj"] = []
    game_JSON["clues_sj"] = {}
    game_JSON["clues_dj"] = {}
    game_JSON["clues_fj"] = {}

    # J! Archive URL
    url = "http://j-archive.com/showgame.php?game_id=" + str(game_number)

    # Open connection
    u_client = u_req(url)

    # Get source html and parse with soup
    page_html = u_client.read()
    u_client.close()
    page_soup = soup(page_html, "html.parser")

    # Get map of jeopardy round to category list
    categories = page_soup.findAll("td", {"class": "category_name"})
    categories_list = [cat.getText() for cat in categories]

    category_counter = 0
    while category_counter < 13:
        category_counter_str = str(category_counter % 6)
        category_to_append = {}
        category_to_append["title"] = categories_list[category_counter]
        category_to_append["clues"] = [category_counter_str + "-0", category_counter_str + "-1",
            category_counter_str + "-2", category_counter_str + "-3", category_counter_str + "-4"]
        if category_counter < 6:
            game_JSON["categories_sj"].append(category_to_append)
        elif category_counter < 12:
            game_JSON["categories_dj"].append(category_to_append)
        else:
            category_to_append["clues"] = ["0-0"]
            game_JSON["categories_fj"].append(category_to_append)
        category_counter += 1

    # Get clue attrs
    clues = page_soup.findAll("td", {"class": "clue"})

    # Extract text, id, value, and answer from the clue
    clue_questions = [clue.findAll("td", {"class": "clue_text"})[0].getText() for clue in clues if clue.div is not None]
    clue_ids = [clue.div.findAll("td", {"class": "clue_unstuck"})[0]['id'] for clue in clues
        if clue.div is not None and len(clue.div.findAll("td", {"class": "clue_unstuck"})) > 0]
    clue_answers = [clue.div['onmouseover'].split("correct_response\">")[1].split("</em>")[0] for clue in clues
        if clue.div is not None]

    clean_clue_answers = []
    for answer in clue_answers:
        clean_answer = answer.replace("<i>", "").replace("</i>", "").replace("\\", "")
        clean_clue_answers.append(clean_answer)

    # Exclude clues that they didn't get to during the game
    all_clue_ids = ['clue_J_1_1_stuck', 'clue_J_2_1_stuck', 'clue_J_3_1_stuck', 'clue_J_4_1_stuck',
        'clue_J_5_1_stuck', 'clue_J_6_1_stuck', 'clue_J_1_2_stuck', 'clue_J_2_2_stuck', 'clue_J_3_2_stuck',
        'clue_J_4_2_stuck', 'clue_J_5_2_stuck', 'clue_J_6_2_stuck', 'clue_J_1_3_stuck', 'clue_J_2_3_stuck',
        'clue_J_3_3_stuck', 'clue_J_4_3_stuck', 'clue_J_5_3_stuck', 'clue_J_6_3_stuck', 'clue_J_1_4_stuck',
        'clue_J_2_4_stuck', 'clue_J_3_4_stuck', 'clue_J_4_4_stuck', 'clue_J_5_4_stuck', 'clue_J_6_4_stuck',
        'clue_J_1_5_stuck', 'clue_J_2_5_stuck', 'clue_J_3_5_stuck', 'clue_J_4_5_stuck', 'clue_J_5_5_stuck',
        'clue_J_6_5_stuck', 'clue_DJ_1_1_stuck', 'clue_DJ_2_1_stuck', 'clue_DJ_3_1_stuck', 'clue_DJ_4_1_stuck',
        'clue_DJ_5_1_stuck', 'clue_DJ_6_1_stuck', 'clue_DJ_1_2_stuck', 'clue_DJ_2_2_stuck', 'clue_DJ_3_2_stuck',
        'clue_DJ_4_2_stuck', 'clue_DJ_5_2_stuck', 'clue_DJ_6_2_stuck', 'clue_DJ_1_3_stuck', 'clue_DJ_2_3_stuck',
        'clue_DJ_3_3_stuck', 'clue_DJ_4_3_stuck', 'clue_DJ_5_3_stuck', 'clue_DJ_6_3_stuck', 'clue_DJ_1_4_stuck',
        'clue_DJ_2_4_stuck', 'clue_DJ_3_4_stuck', 'clue_DJ_4_4_stuck', 'clue_DJ_5_4_stuck', 'clue_DJ_6_4_stuck',
        'clue_DJ_1_5_stuck', 'clue_DJ_2_5_stuck', 'clue_DJ_3_5_stuck', 'clue_DJ_4_5_stuck', 'clue_DJ_5_5_stuck',
        'clue_DJ_6_5_stuck']
    excluded_clues = list(set(all_clue_ids).difference(clue_ids))
    for ex_clue in excluded_clues:
        all_clue_ids[all_clue_ids.index(ex_clue)] = "unused"

    # Add clues to JSON
    for i in range(len(clue_ids)):
        row_str = str((i % 6))
        if i < 30:
            col_str = str(math.floor(i / 6))
            add_clue_sj = {}
            if all_clue_ids[i] == "unused":
                add_clue_sj["question"] = "Unused question"
                add_clue_sj["answer"] = "Unused answer"
            else:
                add_clue_sj["question"] = clue_questions[i]
                add_clue_sj["answer"] = clean_clue_answers[i]
            add_clue_sj["value"] = (math.floor(i / 6) + 1) * 200
            add_clue_sj["is_dd"] = False
            game_JSON["clues_sj"][row_str + "-" + col_str] = add_clue_sj
        else:
            col_str = str((math.floor(i / 6) - 5))
            add_clue_dj = {}
            if all_clue_ids[i] == "unused":
                add_clue_dj["question"] = "Unused question"
                add_clue_dj["answer"] = "Unused answer"
            else:
                add_clue_dj["question"] = clue_questions[i]
                add_clue_dj["answer"] = clean_clue_answers[i]
            add_clue_dj["value"] = (math.floor((i - 30) / 6) + 1) * 400
            add_clue_dj["is_dd"] = False
            game_JSON["clues_dj"][row_str + "-" + col_str] = add_clue_dj

    # Handle final jeopardy separately
    final_jeopardy = page_soup.findAll("table", {"class": "final_round"})[0]
    add_clue_fj = {}
    add_clue_fj["question"] = page_soup.findAll("td", {"id": "clue_FJ"})[0].getText()
    add_clue_fj["answer"] = final_jeopardy.div['onmouseover'].split("correct_response")[1].split("</em>")[0][3:]
    add_clue_fj["value"] = 10000;
    add_clue_fj["is_dd"] = False;
    game_JSON["clues_fj"]["0-0"] = add_clue_fj;

    #pp.pprint(game_JSON)
    return game_JSON
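A usage sketch for the function above, saving one game's board to disk; the game id and filename are hypothetical:

import json

board = get_clues_per_game(6000)  # example game id
with open("game_6000.json", "w") as f:
    json.dump(board, f, indent=2)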
Example #6
import numpy as np
import pandas as pd
from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as Soup

# team_name2abbrv and probability_favorite_moneyline are helpers defined
# elsewhere in the source project


def get_week_moneylines(week, season=2017):
    """Collect the historical moneylines for a given week/season

    Keyword arguments:
    week -- week to query
    season -- season to query, defaults to 2017

    Output arguments:
    df -- frame with columns (game #, season, week, favorite, home, away, home moneyline, away moneyline, favorite odds)

    """
    # Collect page data
    casino_id = '2&wjb'
    my_url = 'http://m.espn.com/nfl/dailyline?week={}&season={}&seasonType={}'.format(
        week, season, casino_id)

    # open connection
    u_client = u_req(my_url)
    # download page
    page_html = u_client.read()
    u_client.close()

    # let beautiful soup parse it
    page_soup = Soup(page_html, "html.parser")
    table_data = page_soup.table.find_all("td")

    # Parse Data

    n_games = int(np.floor(len(table_data) / 4))
    teams = np.empty(shape=[n_games, 2], dtype='U4')
    mline = np.empty(shape=[n_games, 2])
    favorite_odds = np.empty(shape=[n_games, 1])
    favorite = np.empty(n_games, dtype='U4')
    count = 0
    for i in range(0, len(table_data), 4):
        # a = [table_data[i].contents[idx] for idx in [0, 2]]
        teams[count, :] = np.asarray(
            team_name2abbrv([table_data[i].contents[idx] for idx in [0, 2]]))
        mline[count, :] = [
            int(table_data[i + 1].contents[idx]) for idx in [0, 2]
        ]
        favorite_odds[count] = probability_favorite_moneyline(
            mline[count, 0], mline[count, 1])
        # a negative away moneyline marks the away team as the favorite
        if mline[count, 1] < 0:
            favorite[count] = teams[count, 1]
        else:
            favorite[count] = teams[count, 0]
        count = count + 1
    # Assign to data frame
    game_colname = []
    for game in range(teams.shape[0]):
        game_colname.append(teams[game, 0] + '_' + teams[game, 1])
    df = pd.DataFrame(columns=[
        'season', 'Week', 'home', 'away', 'favorite', 'homeML', 'awayML',
        'favorite_odds'
    ],
                      index=game_colname)
    df['season'] = season
    df['Week'] = week
    df['favorite'] = favorite
    df[['home', 'away']] = teams
    df['favorite_odds'] = favorite_odds
    df[['homeML', 'awayML']] = mline

    return df
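probability_favorite_moneyline is one of the project helpers not shown here. A plausible sketch using the standard American-odds conversion (implied probability is -ML/(-ML + 100) for a negative line, 100/(ML + 100) for a positive one); normalizing the two implied probabilities to remove the bookmaker's overround is an assumption about how the project de-vigs:

def probability_favorite_moneyline(home_ml, away_ml):
    """Sketch: implied win probability of the favorite from two moneylines."""
    def implied(ml):
        # standard American-odds to implied-probability conversion
        return -ml / (-ml + 100) if ml < 0 else 100 / (ml + 100)

    p_home, p_away = implied(home_ml), implied(away_ml)
    # normalize away the overround (assumed de-vig method)
    return max(p_home, p_away) / (p_home + p_away)

Under this sketch, probability_favorite_moneyline(-150, 130) comes out to about 0.58.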
Example #7
import re
import urllib.error
import numpy as np
import pandas as pd
from urllib.request import urlopen as u_req
from bs4 import BeautifulSoup as Soup

# team_name2abbrv, probability_favorite_spread, and
# probability_favorite_moneyline are helpers defined elsewhere in the project


def get_odds_current_week(week_number=None):
    """Get odds for the current week

    Output parameters:
    df -- dataFrame with columns (home/away is favorite, odds in favor of favorite, favorite team name)
    """

    # Define functions for cleaner code
    def get_favorite_team_names(game_names, n_games, favorite):
        favorite_team_name_full = []
        home = []
        away = []
        for g in range(n_games):
            s = game_names[g].split()
            idx_at = s.index('at')
            idx_dash = s.index('-')
            if favorite[g] == 'Home':
                favored_team = " ".join(s[(idx_at + 1):idx_dash])
                # under_dog_team = " ".join(s[0:idx_at])
            else:
                favored_team = " ".join(s[0:idx_at])
                # under_dog_team = " ".join(s[(idx_at + 1):idx_dash])
            favorite_team_name_full.append(team_name2abbrv(favored_team)[0])
            home_current_game = team_name2abbrv(' '.join(s[(idx_at +
                                                            1):idx_dash]))[0]
            away_current_game = team_name2abbrv(' '.join(s[0:idx_at]))[0]
            home.append(home_current_game)
            away.append(away_current_game)

        return favorite_team_name_full, home, away

    def add_game_results(spread, favorite, spread_p, moneyline_p):
        # note: this reads spreads_p_current_game and moneylines_p_current_game
        # from the enclosing scope rather than from its parameters
        if np.mean(spread) > 0:
            favorite.append('Home')
        else:
            favorite.append('Away')
        spread_p = np.append(spread_p, np.mean(spreads_p_current_game))
        moneyline_p = np.append(moneyline_p,
                                np.mean(moneylines_p_current_game))
        return spread_p, moneyline_p, favorite

    def reset_one_game_variables():
        spread = np.array([])
        temp_spreads_p = np.array([])
        temp_moneylines_p = np.array([])
        return spread, temp_spreads_p, temp_moneylines_p

    def parse_current_source(current_row):
        row_data = current_row.find_all("td")
        if row_data[1].td is not None:
            spread_current_source = float(row_data[1].td.contents[0])
        else:
            spread_current_source = None
        if len(row_data) < 5:
            moneyline_current_source = [None, None]
        else:
            moneyline_current_source = \
                [re.findall(r'-?\d+', line)
                 for line in [row_data[7].td.contents[i] for i in [0, 2]]]
            moneyline_current_source = [
                int(moneyline_current_source[0][0]),
                int(moneyline_current_source[1][0])
            ]
        return spread_current_source, moneyline_current_source

    if week_number is None:
        week_number = int(input('What is the current week?\n'))
    # ## Get the page
    my_url = 'http://www.espn.com/nfl/lines'

    # open connection
    class NoInternet(Exception):
        pass

    try:
        u_client = u_req(my_url)
    except urllib.error.URLError:
        raise NoInternet("\n\nNot connected to the internet")

    # download page
    page_html = u_client.read()
    u_client.close()

    # let beautiful soup parse it
    page_soup = Soup(page_html, "html.parser")

    # Extract the table
    page_table = page_soup.table

    # Get the games
    games = page_table.find_all('tr', {'class': 'stathead'})
    n_games = len(games)
    game_names = [g.string[0:-5] for g in games]

    # ## Calculate probabilities from moneyline and spread

    # ### Calculate probabilities
    # #### 1) Get moneyline and spread
    # #### 2) convert each valid one to a probability
    # #### 3) Average probabilities

    # per-source accumulators for the game currently being parsed
    moneylines_p_current_game = np.array([])  # moneyline-based probability per source
    spreads_p_current_game = np.array([])  # spread-based probability per source
    spreads_current_game = np.array([])  # raw spread per source

    # per-game accumulators
    moneyline_p = np.array([])  # moneyline-based probability per game
    spread_p = np.array([])  # spread-based probability per game

    # Home/away favorite?
    favorite = []
    # Iterate through every row in the table: spreads, moneylines,
    # team names, footers, etc.
    for row in games[0].next_siblings:
        # 'stathead' rows introduce a new game (e.g. team names)
        is_new_game = row["class"] == ['stathead']
        is_new_source = ((row["class"] == ['oddrow']) or
                         (row["class"] == ['evenrow'])) and (row.p is None)
        if is_new_game:  # process information from just finished game
            spread_p, moneyline_p, favorite = add_game_results(
                spreads_current_game, favorite, spread_p, moneyline_p)
            spreads_current_game, spreads_p_current_game, \
                moneylines_p_current_game = reset_one_game_variables()
        if is_new_source:
            spread_current_source, moneyline_current_source = parse_current_source(
                row)
            if spread_current_source is not None:
                # add this source to array holding spreads
                spreads_current_game = np.append(spreads_current_game,
                                                 spread_current_source)
            else:
                continue
            # Get probabilities from spread and moneyline (if available)
            spreads_p_current_game = np.append(
                spreads_p_current_game,
                probability_favorite_spread(spread_current_source))
            # skip sources whose moneylines are missing or zero
            if any(line is None or line == 0
                   for line in moneyline_current_source):
                continue
            else:
                moneylines_p_current_game = np.append(
                    moneylines_p_current_game,
                    probability_favorite_moneyline(
                        moneyline_current_source[0],
                        moneyline_current_source[1]))
    spread_p, moneyline_p, favorite = add_game_results(spreads_current_game,
                                                       favorite, spread_p,
                                                       moneyline_p)
    favorite_team_name, home, away = get_favorite_team_names(
        game_names, n_games, favorite)
    game_colname = []
    for game in range(len(home)):
        game_colname.append(away[game] + '_' + home[game])
    df = pd.DataFrame(columns=[
        'season', 'Week', 'home', 'away', 'favorite', 'favorite_odds'
    ],
                      index=game_colname)
    team_name2abbrv(favorite_team_name)  # return value unused in the original
    df['home'] = home
    df['away'] = away
    df['favorite'] = favorite_team_name
    # df['favorite home or away'] = favorite
    df['favorite_odds'] = moneyline_p
    df['season'] = 2017
    df['Week'] = week_number
    return df
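probability_favorite_spread is likewise a project helper that is not shown. A hedged sketch of the usual normal-approximation approach: treat the final margin as roughly normal around the spread, with a standard deviation near 13.5 points (a commonly cited NFL figure, not something taken from this project):

import math


def probability_favorite_spread(spread, sigma=13.5):
    """Sketch: win probability for a team favored by `spread` points."""
    # P(margin > 0) when margin ~ Normal(|spread|, sigma);
    # sigma defaults to an assumed historical NFL value
    return 0.5 * (1.0 + math.erf(abs(spread) / (sigma * math.sqrt(2.0))))

Under this sketch a 3-point favorite wins about 59% of the time.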