def roster_helper(sport, team, parser_func):
    """
    Delegate function which helps query the roster for a provided team of
    a provided sport.

    :param sport: The name of the sport
    :type sport: str
    :param team: The name of the team
    :type team: str
    :param parser_func: The parsing function to be applied to the scraped
        roster table
    :type parser_func: callable
    :returns: A JSON response
    :rtype: flask.Response
    """
    team_id = get_team_id(sport, team)

    if team_id is None:
        abort(404)

    rv = fetch_cached_data(args=sport + str(team_id))
    if rv is not None:
        return rv

    soup = help_fetch_soup(
        url=ROSTER_URL.replace(SPORT_TOKEN, sport),
        request_params={
            PARAM_TEAM: team_id,
            PARAM_RESOURCE_TYPE: ARG_RESOURCE_TYPE
        }
    )

    out = prepare_json_output(help_parse_soup(soup, parser_func))
    del soup

    # Cache for 24 hours
    cache_data(data=out, args=sport + str(team_id), timeout=60 * 60 * 24)

    return out
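# A minimal usage sketch for roster_helper. The route rule and the
# "help_parse_mlb_roster_soup" parser name are hypothetical, for
# illustration only; the app's real routes and parsers live elsewhere:
#
#   @app.route("/mlb/roster/<team>")
#   def mlb_roster(team):
#       return roster_helper("mlb", team, help_parse_mlb_roster_soup)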
def rankings_helper(url, parser_func):
    """
    Returns all rankings for all matches.

    :param url: URL of the ranking
    :type url: str
    :param parser_func: The parsing function to be applied to the scraped
        rankings table
    :type parser_func: callable
    :returns: A formatted dictionary ready for display
    :rtype: dict
    """
    rv = fetch_cached_data()
    if rv is not None:
        return rv

    tour = help_get_list_from_dropdown(url, attr_name="tour")
    stack = {}

    for the_round in tour:
        soup = help_fetch_soup(
            url=url,
            request_params={PARAM_TOUR: the_round}
        )
        stack[the_round] = help_parse_soup(soup, parser_func)

    out = prepare_json_output(stack)
    del stack

    # Cache for 12 hours
    cache_data(data=out, timeout=60 * 60 * 12)

    return out
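# Sketch of the shape rankings_helper builds before caching, assuming
# help_get_list_from_dropdown returns round identifiers such as "1" and
# "2" (illustrative values only):
#
#   {
#       "1": [...],  # rows parsed from round 1 by parser_func
#       "2": [...],
#       ...
#   }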
def stats_helper(sport, team, parser_func):
    """
    Delegate function which helps query the statistics for a provided
    team of a provided sport.

    :param sport: The name of the sport
    :type sport: str
    :param team: The team identifier sent to STATS
    :type team: str
    :param parser_func: The parsing function to be applied to the scraped
        statistics table
    :type parser_func: callable
    :returns: A formatted dictionary ready for display
    :rtype: dict
    """
    rv = fetch_cached_data()
    if rv is not None:
        return rv

    soup = help_fetch_soup(
        url=STATS_URL.replace(SPORT_TOKEN, sport),
        request_params={
            PARAM_TEAM: team,  # debugging for now
            PARAM_RESOURCE_TYPE: ARG_RESOURCE_TYPE
        },
        class_attrs="sortable shsTable shsBorderTable"
    )

    out = prepare_json_output(help_parse_soup(soup, parser_func))
    del soup

    # Cache for 24 hours
    cache_data(data=out, timeout=60 * 60 * 24)

    return out
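# Hypothetical caller for stats_helper, for illustration only (the route
# rule and the "help_parse_nhl_stats_soup" parser name are assumptions):
#
#   @app.route("/nhl/stats/<team>")
#   def nhl_stats(team):
#       return stats_helper("nhl", team, help_parse_nhl_stats_soup)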
def standings_helper(url, league, skip_conference_row=False):
    """
    Fetches and parses standings data. Supports layouts with multiple
    tables or with a single table. At the time of writing, the MLB
    standings are split into 2 tables, while the NHL's fit in 1.

    :param url: URL of the standings page
    :type url: str
    :param league: The league of the desired scoreboard
    :type league: str
    :param skip_conference_row: Whether to skip conference title rows
    :type skip_conference_row: bool
    :returns: A formatted dictionary ready for display
    :rtype: dict
    """
    rv = fetch_cached_data()
    if rv is not None:
        return rv

    soup = help_fetch_soup(url)

    column_list = []
    row_list = []
    stack = {}

    # Iterate over each conference/league
    for table in soup("table"):
        conference = None
        division = None

        # Iterate over each division
        for row in table("tr"):
            if row.get("class") is None:
                continue
            elif "shsTableTtlRow" in row.get("class"):
                if skip_conference_row:
                    continue

                # Single table layout support.
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # If the string conference evaluates to true, then we've
                # encountered a new conference. Save the data that exists
                # in the lists column_list and row_list.
                if conference:
                    # Does the list column_list have any data?
                    if column_list:
                        # Is this layout split into divisions?
                        if division:
                            row_list.append({division: column_list})
                        else:
                            row_list = column_list
                        column_list = []

                    stack[conference] = row_list
                    row_list = []

                conference = row.extract().text.strip().lower().encode("utf-8")
                conference = slugify(text=unicode(conference, "utf-8"),
                                     delimiter=u'_')
            elif "shsTableSubttlRow" in row.get("class"):
                # If the string division evaluates to true, then we've
                # encountered a new division. Save the data that exists
                # in the list column_list.
                if division and column_list:
                    row_list.append({division: column_list})
                    column_list = []

                division = row.extract().text.strip().lower().encode("utf-8")
                division = division.replace("division", '')
                division = slugify(text=unicode(division, "utf-8"),
                                   delimiter=u'_')
            elif any(css_class.startswith("shsRow")
                     for css_class in row.get("class")):
                cells = row("td")
                value_dict = None

                if "mlb" == league:
                    value_dict = help_parse_mlb_soup(cells)
                elif "nhl" == league:
                    value_dict = help_parse_nhl_soup(cells)
                elif "nfl" == league:
                    value_dict = help_parse_nfl_soup(cells)
                elif "nba" == league:
                    value_dict = help_parse_nba_soup(cells)
                elif "mls" == league:
                    value_dict = help_parse_mls_soup(cells)
                elif "epl" == league:
                    value_dict = help_parse_epl_soup(cells)

                if value_dict is not None:
                    column_list.append(value_dict)
        #end for row in table("tr")

        # Get the last division in the table
        if division:
            row_list.append({division: column_list})
        # If there is no division, then attach the columns directly to
        # the conference
        else:
            row_list = column_list
        column_list = []

        # We must evaluate conference because EPL and MLS do not have
        # conferences
        if conference:
            stack[conference] = row_list
        # If a conference is nonexistent, then check for the division's
        # existence. If a division exists, then treat it as if it were a
        # conference (i.e. place the division at the highest level).
        # Currently, this only occurs with MLS.
        elif division:
            if row_list[0][division]:
                stack[division] = row_list[0][division]
        # Otherwise, both conference and division are nonexistent.
        # Convert stack into a list so the teams stay in table order.
        # Currently, this only occurs with EPL.
        else:
            # stack is a dictionary; change it to a list
            del stack
            stack = row_list

        row_list = []
    #end for table in soup("table")

    out = prepare_json_output(stack)
    del row_list, stack

    # Cache for 2 hours
    cache_data(data=out, timeout=60 * 60 * 2)

    return out
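# The three output shapes standings_helper can produce, reconstructed from
# the branches above (keys are illustrative slugs, not guaranteed values):
#
#   Conferences with divisions (e.g. MLB, NHL, NFL, NBA):
#       {"american_league": [{"east": [...]}, {"central": [...]}, ...], ...}
#
#   Divisions promoted to the top level (e.g. MLS):
#       {"eastern": [...], "western": [...]}
#
#   Neither conferences nor divisions (e.g. EPL), where stack becomes a
#   list so the table order is preserved:
#       [{...}, {...}, ...]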
def posts():
    """
    Queries posts from WordPress.com's public API implementation for
    NESN.com.

    This does a few things. First, it simplifies WordPress.com's complex
    JSON response from their public API. It gathers the Facebook like and
    comment counts along with the Tweet count for each URL. It also
    interfaces with WordPress.com's Photon service to crop images for
    display on mobile devices.

    :returns: JSON
    :rtype: flask.Response
    """
    rv = fetch_cached_data()
    if rv is not None:
        return jsonify(rv)

    args = {
        PARAM_WORDPRESS_POST_CATEGORY: request.args.get(PARAM_NESN_POST_CATEGORY),
        PARAM_WORDPRESS_POST_COUNT: request.args.get(PARAM_NESN_POST_COUNT)
    }
    r = requests.get(url=POSTS_URL, params=args)
    posts = r.json()

    # Were any posts found?
    if 0 == posts["found"]:
        return jsonify(message="No posts found.", status=200)

    urls = []

    # Build a list of URLs to pass to Facebook's Graph tool
    for post in posts["posts"]:
        urls.append(post["URL"])

    #-- Facebook Request
    urls_str = ','.join('\'' + url + '\'' for url in urls)
    url = ARG_FQL.replace(FQL_TOKEN, urls_str)
    args = {PARAM_FACEBOOK_QUERY: url}
    r = requests.get(url=FACEBOOK_GRAPH_URL, params=args)
    fb_response = r.json()

    vals = {}
    stack = []
    # stripper = HTMLStripper()

    for post in posts["posts"]:
        categories = []

        vals["id"] = post["ID"]
        vals["author"] = post["author"]["name"]
        vals["title"] = post["title"]
        vals["published"] = int(timestamp_from_string(post["date"]))
        vals["modified"] = int(timestamp_from_string(post["modified"]))
        vals["image"] = post["featured_image"]
        # stripper.feed(post["content"])
        # vals["content"] = stripper.get_text()
        vals["content"] = post["content"]
        vals["url"] = post["URL"]

        #-- Twitter Request
        args = {PARAM_TWITTER_URL: vals["url"]}
        r = requests.get(url=TWITTER_URLS_URL, params=args)
        vals["tweets"] = int(r.json()["count"])

        # Match up Facebook data with the URL
        for link_stat in fb_response["data"]:
            if link_stat["url"] == vals["url"]:
                vals["facebook"] = {"likes": link_stat["like_count"],
                                    "comments": link_stat["comment_count"]}

        for category in post["categories"]:
            categories.append(category)
        vals["categories"] = categories

        stack.append(vals.copy())
        vals = {}

    del args, categories, fb_response, r, vals

    out = prepare_json_output(stack)
    del stack

    # Automatically cached for 15 minutes
    cache_data(out)

    return jsonify(out)
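# Assumption about the Facebook request above: ARG_FQL presumably wraps an
# FQL query over the (since retired) link_stat table, with FQL_TOKEN
# standing in for the quoted URL list, along the lines of:
#
#   SELECT url, like_count, comment_count FROM link_stat
#   WHERE url IN (<FQL_TOKEN>)
#
# which would explain the link_stat["url"], link_stat["like_count"], and
# link_stat["comment_count"] lookups in the loop above.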
def scores_helper(year=None, month=None, day=None, sport=None, league=None):
    """
    Generic helper function to scrape scoring data from STATS's
    JavaScript file.

    :param year: The year of the desired scoreboard
    :type year: int
    :param month: The month of the desired scoreboard
    :type month: int
    :param day: The day of the desired scoreboard
    :type day: int
    :param sport: The sport of the desired scoreboard
    :type sport: str
    :param league: The league of the desired scoreboard
    :type league: str
    :returns: A formatted dictionary ready for display
    :rtype: dict
    """
    try:
        date_string = stats_date_string(date(year, month, day))
    except (ValueError, TypeError):
        date_string = stats_date_string()

    rv = fetch_cached_data()
    if rv is not None:
        return rv

    args = {
        PARAM_SPORT: sport,
        PARAM_DATE: date_string,
        PARAM_LEAGUE: league
    }
    soup = help_fetch_soup(
        SCORES_URL,
        request_params=args,
        source_file_type="JavaScript",
        class_attrs="shsTable"
    )

    # If there are two or fewer rows in the document, then there are
    # probably no scores listed.
    if len(soup("tr")) <= 2:
        # del soup
        # Cache for a day to be safe
        out = {"message": "No games scheduled for "}

        if not month and not day and not year:
            out["message"] += "today"
        else:
            out["message"] += "{month}/{day}/{year}".format(month=month,
                                                            day=day,
                                                            year=year)

        cache_data(data=out, timeout=60 * 60 * 24)
        return out

    stack = {}
    vals = []
    section = ''
    team = None
    has_the_status_cell = False
    # logcat(str(soup))

    for row in soup("tr"):
        # Rows which have team names do not have a class attribute.
        # This test must be first.
        if row.get("class") is None:
            cells = row("td")
            has_the_status_cell = False

            # Does this row have a status cell?
            if any("shsMiniStatus" in (cell.get("class") or [])
                   for cell in cells):
                has_the_status_cell = True

            if len(cells) >= 2:
                # Alternate between "away" and "home"; the away team is
                # listed first.
                team = "home" if team == "away" else "away"

                # If the list of values is empty, then initialize it
                if not vals:
                    vals.append({"away": None, "home": None})
                # If the last item is complete, then append a new item
                # indicating a new game.
                elif vals[-1]["away"] and vals[-1]["home"]:
                    vals.append({"away": None, "home": None})

                # Add scoring information for the game.
                vals[-1][team] = {
                    "team": cells[0].find('a').extract().text.strip().encode("utf-8"),
                    "score": cells[1].extract().text.strip().encode("utf-8")
                }

                try:
                    # Try to convert the string to an int.
                    vals[-1][team]["score"] = int(vals[-1][team]["score"])
                except (ValueError, TypeError):
                    # If it fails, assign null
                    vals[-1][team]["score"] = None

            if has_the_status_cell:
                status = cells[2].find('a')
                # Arbitrary game information, such as "OT" for overtime
                extra = cells[2].find('br')
                time = cells[2].find(name="span",
                                     attrs={"class": "shsTimezone shsGMTZone"})

                # Set the status only if not null
                if status:
                    vals[-1]["status"] = status.extract().text.strip().encode("utf-8")

                    if 2 == len(vals[-1]["status"].split('-')):
                        # Save the string to the right of '-' in extra
                        if not extra:
                            extra = vals[-1]["status"].split('-')[1].strip()
                        vals[-1]["status"] = vals[-1]["status"].split('-')[0].strip()

                    vals[-1]["status"] = vals[-1]["status"].lower()

                if time:
                    vals[-1]["time"] = time.extract().text.strip().encode("utf-8")

                if extra:
                    # Sometimes, extra contains a NavigableString
                    try:
                        vals[-1]["extra"] = extra.extract().text.strip().encode("utf-8")
                    # While other times, it's just a str
                    except AttributeError:
                        vals[-1]["extra"] = extra
                    vals[-1]["extra"] = vals[-1]["extra"].lower()
        # Skip over the first row; it's the title
        elif "shsTableTtlRow" in row.get("class"):
            continue
        elif any(css_class in ("shsTableSubttlRow", "shsSubSectionRow",
                               "shsMiniRowSpacer")
                 for css_class in row.get("class")):
            cell = row("td")
            section = cell[0].extract().text.strip().encode("utf-8")

            # Are the scores separated into sections? If so, find the
            # separator
            if section:
                section = slugify(text=unicode(section, "utf-8"),
                                  delimiter=u'_')

                if vals:
                    stack[section] = vals
                    vals = []

                # return section
                stack[section] = None

    # Save the last value
    if section:
        stack[section] = vals
    else:
        stack = vals

    del vals

    out = prepare_json_output(stack)

    # Cache for 1 minute
    cache_data(data=out, timeout=60)

    return out
def teams_helper(sport=None):
    """
    Generic helper function to scrape the list of teams for the provided
    sport from STATS.
    """
    flat_list = query_string_arg_to_bool(PARAM_FLAT_LIST)

    rv = fetch_cached_data(args=PARAM_FLAT_LIST if flat_list else None)
    if rv is not None:
        return rv

    # STATS does not order NFL teams
    nfl_teams = [
        "Atlanta Falcons", "Buffalo Bills", "Chicago Bears",
        "Cincinnati Bengals", "Cleveland Browns", "Dallas Cowboys",
        "Denver Broncos", "Detroit Lions", "Green Bay Packers",
        "Tennessee Titans", "Indianapolis Colts", "Kansas City Chiefs",
        "Oakland Raiders", "St. Louis Rams", "Miami Dolphins",
        "Minnesota Vikings", "New England Patriots", "New Orleans Saints",
        "New York Giants", "New York Jets", "Philadelphia Eagles",
        "Arizona Cardinals", "Pittsburgh Steelers", "San Diego Chargers",
        "San Francisco 49ers", "Seattle Seahawks", "Tampa Bay Buccaneers",
        "Washington Redskins", "Carolina Panthers", "Jacksonville Jaguars",
        '', '', "Baltimore Ravens", "Houston Texans"
    ]

    soup = help_fetch_soup(url=TEAMS_URL.replace(URL_TOKEN, sport))

    stack = []
    redis_stack = []
    league_stack = []
    division_stack = []
    league = None
    division = None

    # Iterate over each conference
    for table in soup("table"):
        for row in table("tr"):
            if row.get("class") is None:
                continue

            cells = row("td")

            # Conference Row
            if "shsTableTtlRow" in row.get("class"):
                if flat_list:
                    continue

                if division_stack and division:
                    league_stack.append({division: division_stack})
                    division_stack = []

                if league_stack and league:
                    stack.append({league: league_stack})
                    league_stack = []

                league = format_division(row)
            # Division Row
            elif "shsTableSubttlRow" in row.get("class"):
                if flat_list:
                    continue

                if division_stack and division:
                    league_stack.append({division: division_stack})
                    division_stack = []

                division = format_division(row)
            # Team Row
            else:
                team = cells[0].extract().text.strip().encode("utf-8")

                # Save the team in a flat list for persistent storage
                redis_stack.append(team)

                if flat_list:
                    stack.append(team)
                else:
                    division_stack.append(team)

    # Flush the last division and conference
    if division_stack and division:
        league_stack.append({division: division_stack})
        division_stack = []

    if league_stack and league:
        stack.append({league: league_stack})
        league_stack = []

    out = prepare_json_output(stack)
    del soup, division_stack, league_stack, stack

    redis_key = app.config["REDIS_KEY_TEAMS"].replace(
        app.config["REDIS_KEY_TOKEN_SPORT"],
        "nfl" if "fb" == sport else sport)

    if not redis.exists(redis_key):
        if "fb" == sport:
            redis_stack = nfl_teams

        # Convert the object to a JSON string
        redis.set(name=redis_key,
                  value=dumps(prepare_json_output(redis_stack)))

    del redis_key, redis_stack

    cache_data(
        data=out,
        args=PARAM_FLAT_LIST if flat_list else None,
        timeout=60 * 60 * 24 * 300  # Cache for 300 days
    )

    return out
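# Sketch of the nested output when the flat_list query string argument is
# absent (names illustrative):
#
#   [{"american_league": [{"east": ["Baltimore Orioles", ...]}, ...]}, ...]
#
# With flat_list set, stack is simply ["Baltimore Orioles", ...]. The
# Redis copy is always the flat list; the two blank entries in nfl_teams
# above presumably keep STATS's team IDs aligned with list indices.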
def schedule_helper(sport, team, from_month=None, to_month=None,
                    parser_func=None):
    """
    Returns the schedule for the provided team of the provided sport.

    TODO: Support filtering operations passed via query string.

    :param sport: The name of the sport
    :type sport: str
    :param team: The name of the team
    :type team: str
    :param from_month: The month number from which the schedule begins
    :type from_month: int
    :param to_month: The month number at which the schedule terminates
    :type to_month: int
    :param parser_func: The parsing function to be applied to the scraped
        schedule table
    :type parser_func: callable
    :returns: A formatted dictionary ready for display
    :rtype: dict
    """
    team_id = get_team_id(sport, team)

    if team_id is None:
        abort(404)

    rv = fetch_cached_data(args=sport + str(team_id))
    if rv is not None:
        return rv

    url = SCHEDULE_URL.replace(SPORT_TOKEN, sport)
    stack = []

    # At this time, the NFL schedule is not listed by month.
    if from_month is None and to_month is None:
        args = {
            PARAM_TEAM: format_int_for_stats(team_id),
            PARAM_RESOURCE_TYPE: ARG_RESOURCE_TYPE
        }
        soup = help_fetch_soup(url, request_params=args)

        # Only use the first table
        stack = help_parse_soup(soup("table")[0], parser_func)
    # Iterate through schedules which have a separate URL for each month
    else:
        # To increase readability, we allow the caller to define
        # from_month and to_month in a familiar format. However, if the
        # values of from_month and to_month are 9 and 6 respectively,
        # then it becomes impossible to build an xrange. To correct this,
        # we ensure the value of to_month is always greater than the
        # value of from_month by increasing it by 12 and then taking the
        # result mod 12 later on down the line.
        to_month = to_month + 12 if to_month < from_month else to_month

        for month in xrange(from_month, to_month):
            # Build the argument list for STATS.
            args = {
                PARAM_TEAM: format_int_for_stats(team_id),
                PARAM_RESOURCE_TYPE: ARG_RESOURCE_TYPE,
                PARAM_MONTH: format_month_number_for_stats(month,
                                                           pad_with_zero=True)
            }

            # http://stackoverflow.com/questions/15871769/using-beautiful-soup-grabbing-stuff-between-li-and-li
            soup = help_fetch_soup(url, request_params=args)

            # Must use += to keep this a flat list
            stack += help_parse_soup(soup, parser_func,
                                     format_month_number_for_stats(month))
            # stack[month] = help_parse_soup(soup, parser_func, month)

    out = prepare_json_output(stack)
    cache_data(data=out, args=sport + str(team_id), timeout=CACHE_TIMEOUT)

    return out
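# Worked example of the month arithmetic above: a September-to-June season
# calls schedule_helper with from_month=9 and to_month=6. Since 6 < 9,
# to_month becomes 18, and xrange(9, 18) yields 9, 10, ..., 17. The
# assumption is that format_month_number_for_stats maps each value back
# into the 1..12 range, e.g. with something like:
#
#   ((month - 1) % 12) + 1    # 13 -> 1, 14 -> 2, ..., 17 -> 5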
def mlb():
    """
    Queries the recent MLB injuries from STATS.

    :returns: A JSON response
    :rtype: flask.Response
    """
    # Because this endpoint does not take any arguments, always cache
    # under the same key
    rv = fetch_cached_data(cache_key="mlb_injuries")
    if rv is not None:
        return jsonify(rv)

    r = requests.get("http://stats.nesn.com/mlb/stats.asp?file=recentinj")
    raw_string = re.sub(r"\s+", ' ', r.text)

    # http://stackoverflow.com/questions/15871769/using-beautiful-soup-grabbing-stuff-between-li-and-li
    soup = BeautifulSoup(raw_string,
                         from_encoding="UTF-8",
                         parse_only=SoupStrainer(
                             name="div",
                             attrs={"id": "shsMLBrecentinj"}))
    del r, raw_string

    # for e in soup.findAll('br'):
    #     e.extract()

    team = None
    vals = {}
    stack = []
    team_stack = []

    # Remove the title elements
    iter_soup = soup(["h2", "table"])
    iter_soup.pop(0)
    iter_soup.pop(0)
    # return str(iter_soup)

    for item in iter_soup:
        if item is None:
            continue
        # The team name
        elif "shsTableTitle" in (item.get("class") or []):
            team = item.extract().text.encode("utf-8").lower().replace(
                ' ', '_')
        # The important data
        else:
            for row in item("tr"):
                # The title row... Date, Player, Status
                if "shsTableTtlRow" in (row.get("class") or []):
                    continue

                cells = row("td")
                vals["ts"] = int(
                    timestamp_from_string(
                        cells[0].extract().text.encode("utf-8")))
                vals["player"] = cells[1].extract().text.encode("utf-8")
                vals["status"] = cells[2].extract().text.encode("utf-8")
                team_stack.append(vals.copy())

            if team_stack:
                stack.append({team: team_stack})
                team_stack = []

    out = prepare_json_output(stack)

    # Cache for 12 hours
    cache_data(data=out, timeout=60 * 60 * 12)

    return jsonify(out)
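# Sketch of the injuries output assembled above (values illustrative):
#
#   [
#       {"boston_red_sox": [
#           {"ts": 1368072000, "player": "...", "status": "..."},
#           ...
#       ]},
#       ...
#   ]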