def purge_database():
    """
        Run `purge_old_documents` on the database, then terminate the
        process with exit status 0.
    """
    with mongo.Mongo() as db:
        db.purge_old_documents()

    sys.exit(0)
def get_reputation_events_for_source(addr, source, start_date):
    """
        Get reputation events with full data (raw data included) for a given ip
        and a given source.

        If the first non-empty `data` payload is reported as base64 by
        `utils.is_base64_encoded`, every non-empty payload is decoded in place.

        :param str addr: Ip the reputation must be computed with
        :param str source: Source short name to get events of
        :param int start_date: Timestamp the events must be retrieved from
        :rtype: array
        :return: Array of events
    """
    # Resolve the short name once instead of once per event.
    source_name = _map_source_from_shortname(source)

    with mongo.Mongo() as database:
        events = database.find_all_event_data_for_ip(addr, start_date, True)
        result = [event for event in events if event['source'] == source_name]

    # The first non-empty payload decides whether payloads are b64 encoded.
    first_data = next((event['data'] for event in result if event['data']), None)
    if first_data is None or not utils.is_base64_encoded(first_data):
        return result

    # Data are encoded: decode every non-empty payload, leave empty ones as is.
    for event in result:
        if event['data']:
            event['data'] = base64.b64decode(event['data']).decode()

    return result
def player_box_score(game_id):
    """
    Scrape all player stats from a specific game and store in Mongo

    The first matching box score table is stored as the away players and
    every following one as the home players.

    :param game_id: MongoDB and Basketball Reference game id
    :raises requests.HTTPError: if the box score page returns an error status
    """
    # HTML Content
    response = requests.get('https://www.basketball-reference.com/boxscores/' + game_id + '.html')
    # Fail on an error page rather than overwriting the stored players with
    # empty lists.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # MongoDB Collection
    mongo_wrapper = mongo.Mongo()

    # The ids of the tables have team names in them
    table_id = re.compile(r'^box_[a-z]{3}_basic$')

    home_players = []
    away_players = []

    home = False
    for table in soup.find_all(id=table_id):
        sub_table = table.find('tbody')

        for player in sub_table.find_all('tr', {'class': None}):
            # Player ID
            player_stats = {'player': player.find('th')['data-append-csv']}

            # Loop through each stat
            for stat in player.find_all('td'):
                stat_name = stat['data-stat']
                player_stats[stat_name] = scrape_utils.stat_parse(stat_name, stat.string)

            # If this key exists it means the player did not play
            if 'reason' in player_stats:
                continue

            if home:
                home_players.append(player_stats)
            else:
                away_players.append(player_stats)

        # Every table after the first one is treated as the home team
        home = True

    # Insert into database
    mongo_wrapper.update(
        'game_log',
        {'_id': game_id},
        {'$set': {
            'hplayers': home_players,
            'aplayers': away_players
        }})
def send_reports(self):
    """
        Public entry point of the email sending process: builds and sends
        one mail for each entry returned by `find_highest_scores`.
    """
    with mongo.Mongo() as db:
        for item in db.find_highest_scores():
            key, value = item['_id'], item['value']

            subject = self._prepare_subject(key, value)
            raw = self._prepare_raw(db, key)
            body = self._prepare_body(key, value, raw)

            self._send_mail(subject, body)
def betting_df(season=None, sportsbooks=None):
    """
    Build a Pandas DataFrame of betting odds, one row per game/sportsbook.

    Args:
        season: NBA season, or list of seasons, to keep (None keeps all)
        sportsbooks: Sportsbook name, or list of names, to keep (None keeps all)

    Returns:
        A Pandas DataFrame with the sportsbook, home odds and away odds
    """
    # A single value is accepted in place of a one-element list
    if isinstance(season, int):
        season = [season]
    if isinstance(sportsbooks, str):
        sportsbooks = [sportsbooks]

    # An empty match document keeps every record
    season_filter = {} if season is None else {'season': {'$in': season}}
    sportsbook_filter = {} if sportsbooks is None else {'sportsbook': {'$in': sportsbooks}}

    # Mongo Aggregation: one document per entry of odds.sportsbooks
    stages = [
        {'$match': season_filter},
        {'$project': {'odd': '$odds.sportsbooks'}},
        {'$unwind': '$odd'},
        {'$project': {
            'sportsbook': '$odd.sportsbook',
            'home_odds': '$odd.home_odds',
            'away_odds': '$odd.away_odds'
        }},
        {'$match': sportsbook_filter},
    ]

    mongo_wrapper = mongo.Mongo()
    records = mongo_wrapper.aggregate(mongo_wrapper.GAME_LOG, stages)

    return pd.DataFrame(list(records))
def team_abilities(decay, att_constraint, def_constraint, day_span):
    """
    Load the stored team abilities matching the given model parameters.

    Args:
        decay: Time decay parameter (queried as 'mw')
        att_constraint: Mean Attack Constraint of the model
        def_constraint: Mean Defence Constraint of the model
        day_span: Day span the model was stored with

    Returns:
        Pandas DataFrame with one row per date/team holding the attack,
        defence and home_adv values
    """
    mongo_wrapper = mongo.Mongo()
    cursor = mongo_wrapper.find(
        mongo_wrapper.DIXON_TEAM,
        {
            'mw': decay,
            'att_constraint': att_constraint,
            'def_constraint': def_constraint,
            'day_span': day_span
        },
        {
            '_id': 0,
            'model': 0,
            'mw': 0,
            'att_constraint': 0,
            'def_constraint': 0
        })
    stored = pd.DataFrame(list(cursor))

    def _to_long(column, value_name):
        # Each cell is expanded into one column per key, then melted so
        # that each row is a single date/team pair.
        wide = pd.DataFrame(column.values.tolist())
        wide['date'] = stored['date']
        return wide.melt('date', var_name='team', value_name=value_name)

    attack = _to_long(stored.att, 'attack')
    defence = _to_long(stored['def'], 'defence')
    home_adv = _to_long(stored['home_adv'], 'home_adv')

    return attack.merge(defence).merge(home_adv)
def player_per_game(player):
    """
    Scrape a player's yearly per game stats and insert them into Mongo.

    :param player: mapping providing 'url' (path appended to the
        Basketball Reference host) and 'name'
    :raises requests.HTTPError: if the player page returns an error status
    """
    # Mongo
    mongo_wrapper = mongo.Mongo()

    # Request
    url = "http://www.basketball-reference.com" + player['url']
    response = requests.get(url)
    # Surface HTTP errors here instead of failing later on a missing table
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Player's statistics
    per_game = soup.find(id="per_game").find('tbody')

    # When there are missing years, there is no id or data-stat for the year
    regex = re.compile('.*')

    # Player dictionary; the id is the page file name without its extension
    player_stats = {
        '_id': url.rsplit('/', 1)[-1].rsplit('.', 1)[0],
        'name': player['name'],
        'seasons': []
    }

    # Iterate through the years
    for year in per_game.find_all('tr', {'id': regex}):
        # Season stats; characters 9-12 of the row id are parsed as the year
        season = {'season': int(year['id'][9:13])}

        # Each stat in a season (Per Game)
        for stat in year.find_all('td', {'data-stat': regex}):
            stat_name = stat['data-stat']
            season[stat_name] = scrape_utils.stat_parse(stat_name, stat.string)

        player_stats['seasons'].append(season)

    mongo_wrapper.insert('player_season', player_stats)
def aggregate_reputation_per_source(addr, start_date):
    """
        Aggregate ip reputation per source: one entry per source carrying
        its aggregated weight, with 0 for sources without any event.

        :param str addr: Ip the reputation must be computed with
        :param int start_date: Timestamp the events must be retrieved from
        :rtype: list
        :return: list of dicts with keys `short_name`, `full_name` and `result`
    """
    with mongo.Mongo() as db:
        events = db.find_all_events_for_ip(addr, start_date, True)

        # Reduce by source
        scores = _compute_score_by_source(events)

    # Sources with no attached events still get an entry
    for parser in parsers:
        scores.setdefault(parser, 0)

    # Format final dto; a source without a shortened name keeps its full name
    return [
        {
            'short_name': shortened_names.get(full_name, full_name),
            'full_name': full_name,
            'result': weight,
        }
        for full_name, weight in scores.items()
    ]
def player_abilities(decay, day_span):
    """
    Load the stored player abilities matching the given parameters.

    Args:
        decay: Decay parameter (queried as 'mw')
        day_span: Day span the abilities were stored with

    Returns:
        Pandas DataFrame where the 'player' column has been expanded into
        separate columns and a 'mean' column is computed from the 'a' and
        'b' columns with `beta.mean`
    """
    mongo_wrapper = mongo.Mongo()
    cursor = mongo_wrapper.find(
        mongo_wrapper.PLAYERS_BETA,
        {'mw': decay, 'day_span': day_span},
        {'_id': 0, 'mw': 0, 'day_span': 0})
    stored = pd.DataFrame(list(cursor))

    # Replace the 'player' column by the columns obtained from expanding it
    other_columns = stored.drop(['player'], axis=1)
    player_columns = stored['player'].apply(pd.Series)
    result = pd.concat([other_columns, player_columns], axis=1)

    result['mean'] = beta.mean(result.a, result.b)

    return result
def init_source_ips():
    """
    Initialise the source ip collection.

    Runs the download of APNIC_URL, passes the source collection handle to
    `_gen_source_ip`, then calls `init_index` for that collection.
    """
    source_collection = mongo.Mongo().get_conn(MONGO_COLLECTION_SOURCE)

    Downloader(APNIC_URL).download()
    _gen_source_ip(source_collection)

    mongo.Mongo().init_index(MONGO_COLLECTION_SOURCE)
def get_starting_lineups(team, year):
    """
    Scrape a team's starting lineup for every game in a season and store
    each lineup on the matching 'game_log' document.

    :param team: NBA Team (Team abbreviation)
    :param year: NBA Season
    """
    # MongoDB
    mongo_wrapper = mongo.Mongo()

    # The page URL uses the team name as renamed for that year
    season_team = scrape_utils.rename_team(team, year)
    url = "http://www.basketball-reference.com/teams/%s/%s_start.html" % (season_team, year)

    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # The database query uses the name renamed again, without a year
    team = scrape_utils.rename_team(season_team)

    # Line up table
    lineup_table = soup.find(id='starting_lineups').find('tbody')

    # Iterate through each game
    for row in lineup_table.find_all('tr', {'class': None}):
        # Information to query mongodb to update collection
        date_text = row.find('td', {'data-stat': 'date_game'}).text
        game_date = datetime.strptime(date_text, '%a, %b %d, %Y')

        opponent_cell = row.find('td', {'data-stat': 'opp_name'})
        opponent = opponent_cell.find('a')['href'][7:10]

        # '@' in the location cell marks a game played at the opponent's
        played_away = row.find('td', {'data-stat': 'game_location'}).text == '@'
        if played_away:
            home, away, key = opponent, team, 'starters.away'
        else:
            home, away, key = team, opponent, 'starters.home'

        # Starting lineup: file name of each player link, extension removed
        starters = row.find('td', {'data-stat': 'game_starters'})
        lineup = [
            link['href'].rsplit('/', 1)[-1].rsplit('.', 1)[0]
            for link in starters.find_all('a')
        ]

        # Update document
        mongo_wrapper.update(
            'game_log',
            {'date': game_date, 'home.team': home, 'away.team': away},
            {'$set': {key: lineup}})
def __init__(self):
    """
    Set up the instance from module-level settings: the source ips path,
    the source collection handle, the port string and the scan options
    string built from SCAN_TIMEOUT.
    """
    source_collection = mongo.Mongo().get_conn(MONGO_COLLECTION_SOURCE)
    scan_options = "-sV --host-timeout {}".format(SCAN_TIMEOUT)

    self.source_ips_path = SOURCE_IPS_PATH
    self.collection = source_collection
    self.port_str = PORT_STR
    self.scan_options = scan_options
def game_results(season=None, teams=None, date=None):
    """
    Build a Pandas DataFrame of game results.

    Args:
        season: A season number or a list of season numbers (None keeps all)
        teams: List of team names; when given, the home_team / away_team
            columns hold each team's position in this list instead of
            its name
        date: When given, only games with a date strictly before it are kept

    Returns:
        A Pandas DataFrame with the teams, points, season and date of each game
    """
    mongo_wrapper = mongo.Mongo()

    # A single season is accepted in place of a one-element list
    if isinstance(season, int):
        season = [season]

    # An empty match document keeps every record
    season_match = {'season': {'$in': season}} if season is not None else {}
    date_match = {'date': {'$lt': date}} if date is not None else {}

    pipeline = [
        {'$match': season_match},
        {'$project': {
            'home_team': '$home.team',
            'away_team': '$away.team',
            'home_pts': '$home.pts',
            'away_pts': '$away.pts',
            'season': 1,
            'date': 1
        }},
        {'$match': date_match},
    ]

    cursor = mongo_wrapper.aggregate(mongo_wrapper.GAME_LOG, pipeline)
    games_df = pd.DataFrame(list(cursor))

    if teams is None:
        return games_df

    # Replace team names by their position in `teams`
    n_games = len(games_df)
    home_index = np.zeros(n_games, dtype=int)
    away_index = np.zeros(n_games, dtype=int)

    for game in games_df.itertuples():
        home_index[game.Index] = teams.index(game.home_team)
        away_index[game.Index] = teams.index(game.away_team)

    games_df['home_team'] = pd.Series(home_index, index=games_df.index)
    games_df['away_team'] = pd.Series(away_index, index=games_df.index)

    return games_df
def player_results(season=None, date=None):
    """
    Build a Pandas DataFrame of points scored per player and game.

    The 'game_log' collection is aggregated twice, once for the home side
    and once for the away side, and both results are concatenated.

    Args:
        season: A season number or a list of season numbers (None keeps all)
        date: When given, only games with a date strictly before it are kept

    Returns:
        A Pandas DataFrame with one row per player/game: date, season, team,
        team points, player and player points
    """
    # MongoDB
    m = mongo.Mongo()

    # A single season is accepted in place of a one-element list
    if isinstance(season, int):
        season = [season]

    # An empty match document keeps every record
    season_match = {'season': {'$in': season}} if season is not None else {}
    date_match = {'date': {'$lt': date}} if date is not None else {}

    # (player, points, team, team points) field paths for each side
    sides = [
        ('$hplayers.player', '$hplayers.pts', '$home.team', '$home.pts'),
        ('$aplayers.player', '$aplayers.pts', '$away.team', '$away.pts'),
    ]

    frames = []
    for player_path, pts_path, team_path, team_pts_path in sides:
        pipeline = [
            {'$match': season_match},
            {'$match': date_match},
            {'$project': {
                'player': player_path,
                'pts': pts_path,
                'team': team_path,
                'team_pts': team_pts_path,
                'date': 1,
                'season': 1
            }},
            # Both arrays are unwound with their positions recorded ...
            {'$unwind': {
                'path': '$player',
                'includeArrayIndex': 'player_index'
            }},
            {'$unwind': {
                'path': '$pts',
                'includeArrayIndex': 'pts_index'
            }},
            # ... and only pairs sharing the same position are kept
            {'$project': {
                'date': 1,
                'team': 1,
                'season': 1,
                'player': 1,
                'pts': 1,
                'team_pts': 1,
                'compare': {
                    '$cmp': ['$player_index', '$pts_index']
                }
            }},
            {'$match': {'compare': 0}},
        ]

        games = m.aggregate('game_log', pipeline)
        frames.append(pd.DataFrame(list(games)))

    return pd.concat(frames)
def __init__(self):
    """
    Fetch the source, http and https collection handles and store the
    HTTP / HTTPS check URLs from the module-level settings.
    """
    def _collection(name):
        # Each handle is obtained from its own Mongo() wrapper
        return mongo.Mongo().get_conn(name)

    self.collection_source = _collection(MONGO_COLLECTION_SOURCE)
    self.collection_http = _collection(MONGO_COLLECTION_HTTP)
    self.collection_https = _collection(MONGO_COLLECTION_HTTPS)

    self.http_check_url = HTTP_CHECK_URL
    self.https_check_url = HTTPS_CHECK_URL
def get_conn():
    """
    Return the Mongo wrapper stored on `g`, creating and storing it on
    first use.
    """
    if hasattr(g, 'mongodb'):
        return g.mongodb

    g.mongodb = mongo.Mongo()
    return g.mongodb
def __init__(self, mw, att_constraint, def_constraint, day_span=7):
    """
    Set up the model and make sure team and player abilities are stored
    for the given parameters up to today.

    For both teams and players: when nothing is stored for the parameters,
    everything is trained; otherwise, when today's entry is missing, the
    missing dates and today are trained. The stored abilities are then
    loaded into `self.abilities` and `self.player_abilities`.

    Args:
        mw: Time decay parameter of the team model
        att_constraint: Mean Attack Constraint of the model
        def_constraint: Mean Defence Constraint of the model
        day_span: Day span of the model
    """
    # Team Information
    self.nteams = 30
    self.teams = process_utils.name_teams(False, 30)

    # MongoDB
    self.mongo = mongo.Mongo()

    # Model parameters
    self.mw = mw
    self.att_constraint = att_constraint
    self.def_constraint = def_constraint
    self.day_span = day_span

    # Today at midnight
    midnight = datetime.datetime.now().replace(
        hour=0, minute=0, second=0, microsecond=0)
    self.today = pd.Timestamp(midnight)

    # ---- Team abilities ----
    team_query = {
        'mw': self.mw,
        'att_constraint': self.att_constraint,
        'def_constraint': self.def_constraint,
        'day_span': self.day_span
    }
    team_today_query = dict(team_query, date=self.today)

    if self.mongo.count(self.mongo.DIXON_TEAM, team_query) == 0:
        # Train new abilities if they don't exist in the database
        print('Training Team Abilities')
        self.train_all(teams=True, players=False)
    elif self.mongo.count(self.mongo.DIXON_TEAM, team_today_query) == 0:
        # Abilities exist but not for today: train the missing days
        print('Scraping Missing Games')
        for team in scrape_utils.team_names():
            team_scraper.season_game_logs(team, 2019)

        print('Training Missing Days (Including Today)')
        stored = datasets.team_abilities(mw, att_constraint, def_constraint,
                                         day_span)
        games = datasets.game_results([2017, 2018, 2019])
        missing_ab = stored.merge(games, on='date', how='right')

        # Train for the missing dates
        missing_dates = missing_ab.loc[missing_ab.team.isnull(), 'date']
        for date in missing_dates.unique():
            self.train(pd.Timestamp(date))

        # Need to add today as this won't include that
        self.train(self.today)

    # ---- Player abilities ----
    # Player abilities are always stored and queried with this mw value
    player_mw = 0.044
    player_query = {'mw': player_mw, 'day_span': self.day_span}

    if self.mongo.count(self.mongo.PLAYERS_BETA, player_query) == 0:
        # Train new abilities if they don't exist in the database
        print('Training Player Abilities')
        self.train_all(teams=False, players=True)
    elif self.mongo.count(self.mongo.PLAYERS_BETA,
                          dict(player_query, date=self.today)) == 0:
        stored = datasets.player_abilities(player_mw, day_span)
        games = datasets.game_results([2017, 2018, 2019])

        # Determine which games need to be scraped
        missing_ab = stored.merge(games, on='date', how='right')
        missing_ids = missing_ab[missing_ab['mean'].isnull()]['_id'].unique()

        # Scrape the missing game logs
        print('Scraping Player Box Scpres')
        for game_id in missing_ids:
            player_scraper.player_box_score(game_id)

        # Train for the missing dates
        print('Train Missing Days')
        missing_dates = missing_ab.loc[missing_ab.team.isnull(), 'date']
        for date in missing_dates.unique():
            self.train_players(pd.Timestamp(date))

        # Need to add today as this won't include that
        self.train_players(self.today)

    # Get all abilities in DF
    self.abilities = datasets.team_abilities(mw, att_constraint,
                                             def_constraint, day_span)
    self.player_abilities = datasets.player_abilities(player_mw, day_span)