Example 1
def parseWeek(year, week):
    """
    parses a specific week on http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1
    which contains a csv of the fan duel player prices
    stores this info in fanduel_prices collection
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html.parser',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(
        week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0]
        header = header.split(';')
        lines = lines[1:]
        for line in lines:
            doc = {}
            if not line:
                continue
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)

    try:
        logger.debug('Bulk Creating docs')
        col_fanduel_prices.insert_many(docs)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
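
These examples lean on shared helpers (makeLogger, closeLogger, get_user_agent, open_or_follow_link, convertToNumber, cleanKey) defined elsewhere in the repo. A minimal sketch of the two conversion helpers, with behavior guessed from the call sites above rather than taken from the actual source:

def convertToNumber(value):
    #assumed helper, not the repo's actual implementation
    #returns an int or float when the text parses as a number, else the text unchanged
    try:
        return int(value)
    except (TypeError, ValueError):
        try:
            return float(value)
        except (TypeError, ValueError):
            return value

def cleanKey(key):
    #assumed helper: normalize a header cell into a Mongo-safe field name (no dots, no '$')
    return key.strip().lower().replace(' ', '_').replace('.', '').replace('$', '')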
Example 2
def run(wait):
    """Starts the scrapping proccess.
    Opens a teamstats page and gathers all the form inputs
    Then sends these inputs to parseSeason which opens a new page for every possible option in the form
    If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes
    """

    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count()/2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)
    
    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    
    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print "unknown role"
            continue

        for category in availableCategories:
            if category.text == "Category...":
                continue

            for season in seasons:
                if season.text == "Season..." or convertToNumber(removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (role, category, season, seasonTypes,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
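
Since run() spawns a multiprocessing Pool, the module entry point should be guarded. The original __main__ block is not shown on this page, so this is a hypothetical sketch:

#hypothetical entry point (not part of the original example); the guard
#matters because multiprocessing re-imports the module in worker processes
if __name__ == '__main__':
    run(wait=0)  #pass a delay in seconds to stagger multiple scrapers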
Example 3
def parseWeek(year, week):
    """
    parses a specific week on http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1
    which contains a csv of the fan duel player prices
    stores this info in fanduel_prices collection
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html.parser', user_agent=get_user_agent(logger), timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0]
        header = header.split(';')
        lines = lines[1:]
        for line in lines:
            doc = {}
            if not line:
                continue
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)
    
    try:
        logger.debug('Bulk Creating docs')
        col_fanduel_prices.insert_many(docs)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))

Example 4
def parseTable(logger, pfr_player_bio_id, table, season_type):
    game_stat_dicts = []
    career_total_dict = {
        'pfr_player_bio_id': pfr_player_bio_id,
        'season_type': season_type
    }

    logger.debug('Parsing ' + season_type)
    games = table.find('tbody').find_all('tr')
    for game in games:
        game_stats = {}
        #skips over class='thead' rows since they only contain th's
        values = game.find_all('td')
        for column_index, value_object in enumerate(values):
            key = cleanKey(value_object['data-stat'])
            if value_object.text == '':
                value = 0
            elif key == 'game_date':
                value = datetime.strptime(value_object.text, '%Y-%m-%d')
            else:
                value = convertToNumber(value_object.text)
            
            #key 'scoring' not accurate when kicker didn't attempt any FG but had XP
            game_stats[key] = value
        if not game_stats:
            continue

        game_stats['pfr_player_bio_id'] = pfr_player_bio_id
        game_stats['season_type'] = season_type
        game_stat_dicts.append(game_stats)


    logger.debug('Parsing totals ' + season_type)
    career_totals = table.find('tfoot').find('tr').find_all('td')
    for career_total in career_totals:
        key = cleanKey(career_total['data-stat'])
        value = convertToNumber(career_total.text)
        if not value:
            continue
        career_total_dict[key] = value

    return game_stat_dicts, career_total_dict
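
A self-contained sketch of how parseTable might be driven; the HTML fragment and the bio id are made up, while the real tables come from pro-football-reference.com gamelog pages:

import logging
from bs4 import BeautifulSoup

html = '''
<table>
  <tbody>
    <tr><td data-stat="game_date">2015-09-13</td><td data-stat="pass_yds">312</td></tr>
  </tbody>
  <tfoot>
    <tr><td data-stat="game_date"></td><td data-stat="pass_yds">4870</td></tr>
  </tfoot>
</table>
'''
table = BeautifulSoup(html, 'html.parser').find('table')
#one dict per game row, plus a career-totals dict built from the tfoot
games, totals = parseTable(logging.getLogger(__name__), 'hypothetical_bio_id', table, 'Regular Season')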
Example 6
def parseTeam(team_url, team_name):
    """
    parses a teams page returns a list of year urls
    there is some data on this page that would be usefull to scrape in the future
    """
    logger = makeLogger(cleanKey(team_name), r'./logs_pfrTeamStats/')

    startTime = datetime.now()

    logger.debug('Starting %s', team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', team_url)
    table = browser.find(id='team_index').find('tbody')
    year_columns = table.find_all('th')

    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug('Row %d of %d', index, len(year_columns))
        try:
            year_link = year_column.find('a')
            if year_link:
                year_url = 'http://www.pro-football-reference.com' + year_link[
                    'href']
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug('parseTeam time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)

    return year_url_tups
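
open_or_follow_link is used by every example but never shown; a plausible minimal sketch, assuming it is a retry wrapper around RoboBrowser (the real version likely also rotates proxies, given the get_proxy_count calls elsewhere):

import time, random

def open_or_follow_link(logger, browser, action, target, retries=3):
    #assumed helper, not the repo's actual implementation
    #action is 'open' or 'follow_link'; returns the browser either way
    for attempt in range(retries):
        try:
            getattr(browser, action)(target)
            return browser
        except Exception:
            logger.exception('%s failed (attempt %d): %s', action, attempt + 1, target)
            time.sleep(random.uniform(2, 5))
    return browser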
Example 7
def parseTeam(team_url, team_name):
    """
    parses a teams page returns a list of year urls
    there is some data on this page that would be usefull to scrape in the future
    """
    logger = makeLogger(cleanKey(team_name), r"./logs_pfrTeamStats/")

    startTime = datetime.now()

    logger.debug("Starting %s", team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug("Waiting %f", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", team_url)
    table = browser.find(id="team_index").find("tbody")
    year_columns = table.find_all("th")

    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug("Row %d of %d", index, len(year_columns))
        try:
            year_link = year_column.find("a")
            if year_link:
                year_url = "http://www.pro-football-reference.com" + year_link["href"]
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug("parseTeam time elapsed: " + str(datetime.now() - startTime))

    closeLogger(logger)

    return year_url_tups
Example 8
def parseYear(year):
    """
    parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info)
    stores schedule info in nfl_data.schedule
    stores game_info in nfl_data.game_info with schedule ids
    """
    logger = makeLogger(year, r'./logs_pfrSchedule/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_schedule = db['schedule']
    col_game_info = db['game_info']
    col_failed_game_info = db['failed_game_info']

    if col_schedule.find({'year': year}).count():
        logger.debug('Already parsed %s', year)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(
        logger, browser, 'open',
        "http://www.pro-football-reference.com/years/{}/games.htm".format(
            year))
    table = browser.find(id='games')
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            schedule_dict = {}
            gameInfo_dict = {}
            columns = row.find_all('td')
            if columns:
                schedule_dict['week'] = convertToNumber(columns[0].text)
                schedule_dict['day'] = columns[1].text
                schedule_dict['date'] = columns[2].text
                schedule_dict['year'] = convertToNumber(year)
                homeIndicator = columns[5].text
                if homeIndicator == '@':
                    schedule_dict['homeTeam'] = columns[6].text
                    schedule_dict['awayTeam'] = columns[4].text
                    schedule_dict['homeTeamScore'] = convertToNumber(
                        columns[8].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(
                        columns[7].text)
                else:
                    schedule_dict['homeTeam'] = columns[4].text
                    schedule_dict['awayTeam'] = columns[6].text
                    schedule_dict['homeTeamScore'] = convertToNumber(
                        columns[7].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(
                        columns[8].text)
                gameInfo_dict['week'] = convertToNumber(columns[0].text)
                gameInfo_dict['year'] = convertToNumber(year)
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                url = columns[3].find('a')
                if url:
                    url = 'http://www.pro-football-reference.com' + url['href']
                    failed_game_info = True
                    browser = open_or_follow_link(logger, browser, 'open', url)
                    game_info = browser.find(id="game_info")
                    if game_info:
                        for each in game_info.find_all('tr'):
                            pair = each.find_all('td')
                            if pair:
                                failed_game_info = False
                                key = pair[0].text
                                value = convertToNumber(pair[1].text)
                                gameInfo_dict[cleanKey(key)] = value
                    if failed_game_info:
                        failed_dict = schedule_dict
                        failed_dict['row'] = index
                        failed_dict['href'] = url['href']
                        col_failed_game_info.insert_one(failed_dict)
                        gameInfo_dict['FAIL'] = True

                schedule_list.append(schedule_dict)
                gameInfo_list.append(gameInfo_dict)
        except:
            logger.exception(row)

    logger.debug('nfl_schedule.insert_many')

    schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids

    logger.debug('mapping nfl_schedule.id to gameInfo_list')

    for index, schedule_id in enumerate(schedule_ids):
        if len(gameInfo_list[index].keys()) <= 2:
            logger.debug('Empty game_info: %s', schedule_id)
        gameInfo_list[index]['schedule_id'] = schedule_id

    logger.debug('game_info.insert_many')
    col_game_info.insert_many(gameInfo_list)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(year)
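
The "Already parsed" guard above is a read-then-write check, so two concurrent workers could still double-insert a year. A unique compound index would make reruns safe; this is optional hardening, not in the original:

from pymongo import MongoClient, ASCENDING

client = MongoClient('localhost', 27017)
#reject duplicate schedule rows at the database level
client['nfl_data']['schedule'].create_index(
    [('year', ASCENDING), ('week', ASCENDING),
     ('homeTeam', ASCENDING), ('awayTeam', ASCENDING)],
    unique=True)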
Example 9
def parseSeason(role, category, season, seasonTypes):
    """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats for a given role/category/season
    doesnt follow any links
    some years dont have any info, but still return a page.
    These are loged with Exception('No teams found %s' % url)
    All data is stored in team_stats
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text, r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }

        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5,3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role['value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category['value'] + '&defensiveStatisticCategory=null'
                
            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category['value']
            else:
                raise Exception('Unsupported role: %s' % role.text)
            
            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType['value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")
            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(role.text + '_' + category.text + '_' + season.text)
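
The two-row header handling above repeats each top-level label once per spanned column so composite keys line up with the second header row; a standalone illustration of that expansion:

#each tuple mimics a <th colspan="N">label</th> cell from the first header row
top_row = [('', 1), ('Passing', 2), ('Rushing', 2)]
topTableKeys = []
for label, colspan in top_row:
    for _ in range(colspan):
        topTableKeys.append(label)
#topTableKeys == ['', 'Passing', 'Passing', 'Rushing', 'Rushing']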
Example 10
def parseSplits(logger, splits, year, splitType, player_profile_id):
    """
    splitType: game or situational 
    Parses 1 year of games splits or situational stats for given player.
    Stores each row in its own doc
    """
    startTime = datetime.now()

    logger.debug('Starting %s splits', splitType)

    try:
        tabs = splits.find(class_="player-tabs")
        tabs = tabs.find_all('li')
    except:
        logger.exception('failed parsing player tabs')
        return

    splits_list = []
    for index, tab in enumerate(tabs):
        logger.debug('tab %d of %d', index, len(tabs))
        try:
            currentTabText = tab.text.strip()
            currentTab = splits.find(id='game_split_tabs_' + str(index))
            tables = currentTab.find_all('table')
        except:
            logger.exception('failed parsing player tables for tab %d of %d',
                             index, len(tabs))
            continue

        for tableIndex, table in enumerate(tables):
            logger.debug('table %d of %d', tableIndex, len(tables))
            try:
                tableKey = table.find(class_="player-table-key")
                tableKey = tableKey.find_all('td')
                tableName = tableKey[0].text.strip()

                tableItems = table.find('tbody').find_all('td')
                rowDict = {
                    'currentTabText': currentTabText,
                    'category': tableName,
                    'player_profile_id': player_profile_id,
                    'year': int(year),
                    'splitType': splitType
                }
                tableColumn = 0
            except:
                logger.exception('failed parsing player table %d of %d',
                                 tableIndex, len(tables))
                continue

            for rowIndex, item in enumerate(tableItems):
                try:
                    if 'class' in item.attrs:
                        if item.attrs['class'][0] == 'border-td':
                            continue

                    if tableColumn == 0:
                        logger.debug('Row %d of %d', rowIndex, len(tableItems))
                        rowName = item.text.strip()
                        rowDict['rowName'] = rowName
                        tableColumn += 1
                        continue

                    rowDict[cleanKey(
                        tableKey[tableColumn].text)] = convertToNumber(
                            item.text.strip())

                    tableColumn += 1
                    if tableColumn >= len(tableKey):
                        splits_list.append(rowDict)
                        tableColumn = 0
                        rowDict = {
                            'currentTabText': currentTabText,
                            'category': tableName,
                            'player_profile_id': player_profile_id,
                            'year': int(year),
                            'splitType': splitType
                        }
                except:
                    logger.exception('failed parsing row %d of %s', rowIndex,
                                     tableName)
                    while (tableColumn < len(tableKey)):
                        tableColumn += 1
                    rowDict = {
                        'currentTabText': currentTabText,
                        'category': tableName,
                        'player_profile_id': player_profile_id,
                        'year': int(year),
                        'splitType': splitType
                    }

    try:
        logger.debug('Bulk Creating splits_list')
        if splits_list:
            col_player_splits.insert_many(splits_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseSplits time elapsed: ' +
                 str(datetime.now() - startTime))
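
makeLogger and closeLogger appear in every example but come from elsewhere in the repo; a guessed minimal sketch that matches both call styles seen above (closeLogger is passed a logger object in some examples and a name string in others):

import logging, os

def makeLogger(name, log_dir):
    #assumed helper, not the repo's actual implementation
    #one file handler per scrape unit (year, week, team, ...)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    logger = logging.getLogger(str(name))
    logger.setLevel(logging.DEBUG)
    handler = logging.FileHandler(os.path.join(log_dir, str(name) + '.log'))
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger

def closeLogger(logger_or_name):
    #assumed helper: accepts either a Logger or the name it was created with
    logger = logger_or_name if isinstance(logger_or_name, logging.Logger) \
        else logging.getLogger(str(logger_or_name))
    for handler in list(logger.handlers):
        handler.close()
        logger.removeHandler(handler)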
Example 11
def parseYear(team_name, year_url, year):
    """
    parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info)
    stores schedule info in nfl_data.schedule
    stores game_info in nfl_data.game_info with schedule ids
    """
    logger = makeLogger(
        cleanKey(team_name) + '_' + str(year), r'./logs_pfrTeamStats/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_team_stats_weekly = db['team_stats_weekly']

    #need to fix this to actually detect duplicates
    # if col_team_stats_weekly.find({'year': year}).count():
    #     logger.debug('Already parsed %s', year)
    #     closeLogger(logger)
    #     return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', year_url)
    table = browser.find(id='games')
    rows = table.find_all('tr')
    header = [
        cleanKey(each.attrs['data-stat']) for each in rows[0].find_all('th')
    ]
    rows = rows[1:]

    row_dicts = []
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            week_number = convertToNumber(row.find('th').text)
            row_values = [
                convertToNumber(value.text) for value in row.find_all('td')
            ]
            row_values.insert(0, week_number)
            row_dict = dict(zip(header, row_values))
            row_dict['year'] = year
            row_dict['team_name'] = team_name
            row_dict['year_url'] = year_url

            if row_dict['game_date'].lower() == 'playoffs':
                continue

            row_dicts.append(row_dict)
        except:
            logger.exception(row)

    logger.debug('team_stats_weekly.insert_many')

    col_team_stats_weekly.insert_many(row_dicts)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
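
The header/value pairing above is just a zip into a dict, with the row's th cell (the week number) prepended so it lines up with the first header key; a compact illustration with made-up keys:

header = ['week_num', 'game_date', 'pts_off', 'pts_def']
row_values = ['2015-09-13', 24, 20]
row_values.insert(0, 1)  #week number comes from the row's th, not its td's
row_dict = dict(zip(header, row_values))
#row_dict == {'week_num': 1, 'game_date': '2015-09-13', 'pts_off': 24, 'pts_def': 20}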
Example 12
def parseYear(year):
    """
    parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info)
    stores schedule info in nfl_data.schedule
    stores game_info in nfl_data.game_info with schedule ids
    """
    logger = makeLogger(year, r'./logs_pfrSchedule/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_schedule = db['schedule']
    col_game_info = db['game_info']
    col_failed_game_info = db['failed_game_info']

    if col_schedule.find({'year': year}).count():
        logger.debug('Already parsed %s', year)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/years/{}/games.htm".format(year))
    table = browser.find(id='games')
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            schedule_dict = {}
            gameInfo_dict = {}
            columns = row.find_all('td')
            if columns:
                schedule_dict['week'] = convertToNumber(columns[0].text)
                schedule_dict['day'] = columns[1].text
                schedule_dict['date'] = columns[2].text
                schedule_dict['year'] = convertToNumber(year)
                homeIndicator = columns[5].text
                if homeIndicator == '@':
                    schedule_dict['homeTeam'] = columns[6].text
                    schedule_dict['awayTeam'] = columns[4].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[8].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[7].text)
                else:
                    schedule_dict['homeTeam'] = columns[4].text
                    schedule_dict['awayTeam'] = columns[6].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[7].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[8].text)
                gameInfo_dict['week'] = convertToNumber(columns[0].text)
                gameInfo_dict['year'] = convertToNumber(year)
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                url = columns[3].find('a')
                if url:
                    url = 'http://www.pro-football-reference.com' + url['href']
                    failed_game_info = True
                    browser = open_or_follow_link(logger, browser, 'open', url)
                    game_info = browser.find(id="game_info")
                    if game_info:
                        for each in game_info.find_all('tr'):
                            pair = each.find_all('td')
                            if pair:
                                failed_game_info = False
                                key = pair[0].text
                                value = convertToNumber(pair[1].text)
                                gameInfo_dict[cleanKey(key)] = value
                    if failed_game_info:
                        failed_dict = schedule_dict
                        failed_dict['row'] = index
                        failed_dict['href'] = url['href']
                        col_failed_game_info.insert_one(failed_dict)
                        gameInfo_dict['FAIL'] = True

                schedule_list.append(schedule_dict)
                gameInfo_list.append(gameInfo_dict)
        except:
            logger.exception(row)

    logger.debug('nfl_schedule.insert_many')

    schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids
    
    logger.debug('mapping nfl_schedule.id to gameInfo_list')

    for index, schedule_id in enumerate(schedule_ids):
        if len(gameInfo_list[index].keys()) <= 2:
            logger.debug('Empty game_info: %s', schedule_id)
        gameInfo_list[index]['schedule_id'] = schedule_id

    logger.debug('game_info.insert_many')
    col_game_info.insert_many(gameInfo_list)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(year)
Example 13
def parseWeek(year, week):
    """Parsing a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detial links, which is where must of the data is scraped.
    Scrapes weather, and stadium enough per week, and stores them in their respective collections
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}

        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detailsLink = 'http://nflweather.com' + details.find('a')['href']
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open', detailsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip()
                awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace('  ', ' ').strip()
                homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace('  ', ' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('unexpected number of spans')

                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')

                tempIndex = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:tempIndex].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())
                
                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[index-1].text.strip()
                        stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                #find the nfl_schedule doc, update its gameTime, grab its id, insert the id into both info dicts, append to the _lists
                schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam}
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_scedule doc not found " + error_docs)
                result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')
    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
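
Once the weather and stadium docs carry schedule_id, they can be joined back to the schedule on the read side; a hypothetical sketch using an aggregation $lookup (requires MongoDB 3.2+, names assumed from the collections above):

from pymongo import MongoClient

db = MongoClient('localhost', 27017)['nfl_data']
pipeline = [
    {'$match': {'year': 2015, 'week': 1}},
    {'$lookup': {'from': 'weather_info', 'localField': '_id',
                 'foreignField': 'schedule_id', 'as': 'weather'}},
]
#each schedule doc now carries its weather docs inline
games_with_weather = list(db['schedule'].aggregate(pipeline))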
Example 14
def parseGameLogs(logger, gameLogs, year, player_profile_id):
    """
    Parses one year of game logs for the given player.
    Stores each row in its own doc
    """
    startTime = datetime.now()

    logger.debug('Starting gameLogs')

    gameLogs_list = []
    #messy because of bye weeks: one less column is present
    for tableNumber, gameLog in enumerate(gameLogs):
        logger.debug('Table %d of %d', tableNumber, len(gameLogs))
        try:
            topTableColumns = gameLog.find(
                class_="player-table-header").find_all('td')
            topTableKey = []
            if len(topTableColumns) > 1:
                for index, topTableColumn in enumerate(topTableColumns):
                    for _ in range(int(topTableColumn['colspan'])):
                        if index == 0:
                            topTableKey.append('')
                        else:
                            topTableKey.append(topTableColumn.text)
            tableName = topTableColumns[0].text.strip()

            tableKey = gameLog.find(class_="player-table-key")
            tableKey = tableKey.find_all('td')

            if topTableKey:
                for index, key in enumerate(tableKey):
                    if topTableKey[index]:
                        tableKey[index] = cleanKey(topTableKey[index] + '_' +
                                                   key.text)
                    else:
                        tableKey[index] = cleanKey(key.text)

            tableItems = gameLog.find("tbody").find_all("td")

            rowDict = {
                'category': tableName,
                'player_profile_id': player_profile_id,
                'year': int(year)
            }
            tableColumn = 0
            byeWeek = False
            columnsSkip = 0
            rowWeek = None
        except:
            logger.exception('failed parsing table')
            continue

        for index, item in enumerate(tableItems):
            try:
                if byeWeek:
                    if columnsSkip >= len(tableKey) - 3:
                        byeWeek = False
                        columnsSkip = 0
                        tableColumn = 0
                    else:
                        columnsSkip += 1
                    continue

                #skip borders
                if 'class' in item.attrs:
                    if item.attrs['class'][0] == 'border-td':
                        continue
                #detect Total row and break
                if 'colspan' in item.attrs:
                    if item.attrs['colspan'] == "3":
                        if 'class' in tableItems[index + 1].attrs:
                            if tableItems[
                                    index +
                                    1].attrs["class"][0] == "player-totals":
                                break

                if tableColumn == 0:
                    logger.debug('Row %d of %d', index, len(tableItems))
                    rowDict['week'] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    if item.text.strip() == "Bye":
                        byeWeek = True
                        gameDate = "Bye"
                        tableColumn += 1
                        while (tableColumn < len(tableKey)):
                            rowDict[tableKey[tableColumn]] = None
                            tableColumn += 1
                        #store Nones for the remaining columns

                if not byeWeek:
                    if tableColumn == 2:
                        opp = None
                        linksFound = len(item.find_all('a'))
                        if linksFound == 2:
                            opp = item.find_all('a')[1].text.strip()
                        elif linksFound == 1:
                            opp = item.find_all('a')[0].text.strip()
                        else:
                            opp = item.text.strip()
                        rowDict[tableKey[tableColumn]] = opp.replace(
                            '\t', '').replace('\n', '')
                        tableColumn += 1
                        continue

                    if tableColumn == 3:
                        outCome = item.find("span")
                        if not outCome:
                            outCome = 'T'
                        else:
                            outCome = outCome.text.strip()
                        score = None
                        linksFound = len(item.find_all("a"))
                        if linksFound == 1:
                            score = item.find("a").text.strip()
                        elif linksFound == 0:
                            score = re.findall('[0-9]+-[0-9]+', item.text)[0]
                        result = outCome + score
                        rowDict[tableKey[tableColumn]] = result
                        tableColumn += 1
                        continue

                    rowDict[tableKey[tableColumn]] = convertToNumber(
                        item.text.strip())

                    tableColumn += 1
                    if tableColumn >= len(tableKey):
                        gameLogs_list.append(rowDict)
                        rowDict = {
                            'category': tableName,
                            'player_profile_id': player_profile_id,
                            'year': int(year)
                        }
                        tableColumn = 0
                        byeWeek = False
            except:
                logger.exception(
                    'failed parsing row %d of %s. Skipping the row', index,
                    tableName)
                while (tableColumn < len(tableKey)):
                    tableColumn += 1
                rowDict = {
                    'category': tableName,
                    'player_profile_id': player_profile_id,
                    'year': int(year)
                }

    try:
        logger.debug('Bulk Creating gameLogs_list')
        if gameLogs_list:
            col_player_game_logs.insert_many(gameLogs_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseGameLogs time elapsed: ' +
                 str(datetime.now() - startTime))
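
The result assembly at tableColumn == 3 combines a W/L/T outcome span with a score pulled from a link or, failing that, a regex over the cell text; a tiny standalone check of that fallback:

import re

#when the cell has no link, the score is pulled straight from the text
assert re.findall('[0-9]+-[0-9]+', 'at Patriots 27-24 (OT)')[0] == '27-24'
#combined with the outcome span, the stored value looks like 'W27-24'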
Example 15
def run(wait):
    """Starts the scrapping proccess.
    Opens a teamstats page and gathers all the form inputs
    Then sends these inputs to parseSeason which opens a new page for every possible option in the form
    If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes
    """

    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print "unknown role"
            continue

        for category in availableCategories:
            if category.text == "Category...":
                continue

            for season in seasons:
                if season.text == "Season..." or convertToNumber(
                        removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (
                    role,
                    category,
                    season,
                    seasonTypes,
                ))

    pool.close()  #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join()  #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))

    closeLogger('main')
Example 16
def parseCareerStats(logger, careerStats, player_profile_id):
    """
    Parses career stats page for given player
    Stores each row in its own doc
    """
    startTime = datetime.now()

    logger.debug('Starting careerStats')

    careerStats_list = []
    for tableNumber, careerStat in enumerate(careerStats):
        logger.debug('Table %d of %d', tableNumber, len(careerStats))
        try:
            tableName = careerStat.find("div").text.strip()

            tableKey = careerStat.find_all(class_="player-table-key")[-1]
            tableKey = tableKey.find_all('td')
            
            tableItems = careerStat.find("tbody").find_all("td")

            rowDict = {'category': tableName, 'player_profile_id': player_profile_id}
            rowYear = None
            tableColumn = 0
        except:
            logger.exception('failed parsing table')
            continue

        for index, item in enumerate(tableItems, start=1):
            try:
                if 'class' in item.attrs:
                    if item.attrs['class'][0] == 'border-td':
                        continue
                    if item.attrs['class'][0] == 'player-totals':
                        break
                if tableColumn == 0:
                    logger.debug('Row %d of %d', index, len(tableItems))
                    rowDict['year'] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    continue

                rowDict[cleanKey(tableKey[tableColumn].text)] = convertToNumber(item.text.strip())

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    careerStats_list.append(rowDict)
                    rowDict = {'category': tableName, 'player_profile_id': player_profile_id}
                    tableColumn = 0
            except:
                logger.exception('failed parsing row %d of %s', index, tableName)
                while(tableColumn < len(tableKey)):
                    tableColumn += 1
                rowDict = {'category': tableName, 'player_profile_id': player_profile_id}


    try:
        logger.debug('Bulk Creating careerStats_list')
        if careerStats_list:
            col_player_career_stats.insert_many(careerStats_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseCareerStats time elapsed: ' + str(datetime.now() - startTime))
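
A hypothetical read-back of what parseCareerStats stores, one doc per table row keyed by category and player_profile_id (the collection name behind col_player_career_stats is assumed here):

from pymongo import MongoClient

col = MongoClient('localhost', 27017)['nfl_data']['player_career_stats']
#each doc looks like {'year': ..., 'category': 'Passing', <stat columns>...}
passing_rows = list(col.find({'player_profile_id': 'hypothetical_profile_id',
                              'category': 'Passing'}).sort('year', 1))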
Example 17
def parseSplits(logger, splits, year, splitType, player_profile_id):
    """
    splitType: game or situational 
    Parses 1 year of games splits or situational stats for given player.
    Stores each row in its own doc
    """
    startTime = datetime.now()

    logger.debug('Starting %s splits', splitType)
    
    try:
        tabs = splits.find(class_="player-tabs")
        tabs = tabs.find_all('li')
    except:
        logger.exception('failed parsing player tabs')
        return

    splits_list = []
    for index, tab in enumerate(tabs):
        logger.debug('tab %d of %d', index, len(tabs))
        try:
            currentTabText = tab.text.strip()
            currentTab = splits.find(id='game_split_tabs_' + str(index))
            tables = currentTab.find_all('table')
        except:
            logger.exception('failed parsing player tables for tab %d of %d', index, len(tabs))
            continue

        for tableIndex, table in enumerate(tables):
            logger.debug('table %d of %d', tableIndex, len(tables))
            try:
                tableKey = table.find(class_="player-table-key")
                tableKey = tableKey.find_all('td')
                tableName = tableKey[0].text.strip()

                tableItems = table.find('tbody').find_all('td')
                rowDict = {'currentTabText': currentTabText, 'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year), 'splitType': splitType}
                tableColumn = 0
            except:
                logger.exception('failed parsing player table %d of %d', tableIndex, len(tables))
                continue

            for rowIndex, item in enumerate(tableItems):
                try:
                    if 'class' in item.attrs:
                        if item.attrs['class'][0] == 'border-td':
                            continue

                    if tableColumn == 0:
                        logger.debug('Row %d of %d', rowIndex, len(tableItems))
                        rowName = item.text.strip()
                        rowDict['rowName'] = rowName
                        tableColumn += 1
                        continue

                    rowDict[cleanKey(tableKey[tableColumn].text)] = convertToNumber(item.text.strip())

                    tableColumn += 1
                    if tableColumn >= len(tableKey):
                        splits_list.append(rowDict)
                        tableColumn = 0
                        rowDict = {'currentTabText': currentTabText, 'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year), 'splitType': splitType}
                except:
                    logger.exception('failed parsing row %d of %s', rowIndex, tableName)
                    while(tableColumn < len(tableKey)):
                        tableColumn += 1
                    rowDict = {'currentTabText': currentTabText, 'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year), 'splitType': splitType}

    try:
        logger.debug('Bulk Creating splits_list')
        if splits_list:
            col_player_splits.insert_many(splits_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseSplits time elapsed: ' + str(datetime.now() - startTime))
Example 18
def parseGameLogs(logger, gameLogs, year, player_profile_id):
    """
    Parses one year of game logs for the given player.
    Stores each row in its own doc
    """
    startTime = datetime.now()

    logger.debug('Starting gameLogs')

    gameLogs_list = []
    #messy because of bye weeks: one less column is present
    for tableNumber, gameLog in enumerate(gameLogs):
        logger.debug('Table %d of %d', tableNumber, len(gameLogs))
        try:
            topTableColumns = gameLog.find(class_="player-table-header").find_all('td')
            topTableKey = []
            if len(topTableColumns) > 1:
                for index, topTableColumn in enumerate(topTableColumns):
                    for _ in range(int(topTableColumn['colspan'])):
                        if index == 0:
                            topTableKey.append('')
                        else:
                            topTableKey.append(topTableColumn.text)
            tableName = topTableColumns[0].text.strip()

            tableKey = gameLog.find(class_="player-table-key")
            tableKey = tableKey.find_all('td')

            if topTableKey:
                for index, key in enumerate(tableKey):
                    if topTableKey[index]:
                        tableKey[index] = cleanKey(topTableKey[index] + '_' + key.text)
                    else:
                        tableKey[index] = cleanKey(key.text)

            tableItems = gameLog.find("tbody").find_all("td")

            rowDict = {'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year)}
            tableColumn = 0
            byeWeek = False
            columnsSkip = 0
            rowWeek = None
        except:
            logger.exception('failed parsing table')
            continue

        for index, item in enumerate(tableItems):
            try:
                if byeWeek:
                    if columnsSkip >= len(tableKey)-3:
                        byeWeek = False
                        columnsSkip = 0
                        tableColumn = 0
                    else:
                        columnsSkip += 1
                    continue

                #skip borders
                if 'class' in item.attrs:
                    if item.attrs['class'][0] == 'border-td':
                        continue
                #detect Total row and break
                if 'colspan' in item.attrs:
                    if item.attrs['colspan'] == "3":
                        if 'class' in tableItems[index+1].attrs:
                            if tableItems[index+1].attrs["class"][0] == "player-totals":
                                break

                if tableColumn == 0:
                    logger.debug('Row %d of %d', index, len(tableItems))
                    rowDict['week'] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    if item.text.strip() == "Bye":
                        byeWeek = True
                        gameDate = "Bye"
                        tableColumn += 1
                        #store Nones for the remaining stat columns
                        while tableColumn < len(tableKey):
                            rowDict[tableKey[tableColumn]] = None
                            tableColumn += 1

                if not byeWeek:
                    if tableColumn == 2:
                        opp = None
                        linksFound = len(item.find_all('a'))
                        if linksFound == 2:
                            opp = item.find_all('a')[1].text.strip()
                        elif linksFound == 1:
                            opp = item.find_all('a')[0].text.strip()
                        else:
                            opp = item.text.strip()
                        rowDict[tableKey[tableColumn]] = opp.replace('\t', '').replace('\n', '')
                        tableColumn += 1
                        continue

                    if tableColumn == 3:
                        outCome = item.find("span")
                        if not outCome:
                            outCome = 'T'
                        else:
                            outCome = outCome.text.strip()
                        score = None
                        linksFound = len(item.find_all("a"))
                        if linksFound == 1:
                            score = item.find("a").text.strip()
                        elif linksFound == 0:
                            score = re.findall('[0-9]+-[0-9]+', item.text)[0]
                        result = outCome + score
                        rowDict[tableKey[tableColumn]] = result
                        tableColumn += 1
                        continue

                    rowDict[tableKey[tableColumn]] = convertToNumber(item.text.strip())

                    tableColumn += 1
                    if tableColumn >= len(tableKey):
                        gameLogs_list.append(rowDict)
                        rowDict = {'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year)}
                        tableColumn = 0
                        byeWeek = False
            except:
                logger.exception('failed parsing row %d of %s. Skipping the row', index, tableName)
                tableColumn = len(tableKey)  # jump the column counter past the rest of this row
                rowDict = {'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year)}

    try:
        logger.debug('Bulk Creating gameLogs_list')
        if gameLogs_list:
            col_player_game_logs.insert_many(gameLogs_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseGameLogs time elapsed: ' + str(datetime.now() - startTime))
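
A minimal calling sketch for the snippet above; this is an assumption, not part of the original code. The profile URL, the table selector, and the player id are made up for illustration, makeLogger, get_user_agent, and open_or_follow_link are the same helpers used throughout these examples, and col_player_game_logs is assumed to be configured at module level, as the function body expects.

# hypothetical driver for parseGameLogs -- all concrete values are assumptions
logger = makeLogger('gamelogs_demo', r'./logs_playerStats/')
browser = RoboBrowser(history=False, parser='html5lib',
                      user_agent=get_user_agent(logger), timeout=10)
profile_url = 'http://www.nfl.com/player/tombrady/2504211/gamelogs?season=2015'  # assumed URL shape
browser = open_or_follow_link(logger, browser, 'open', profile_url)
gameLogs = browser.find_all(class_='data-table1')  # assumed table class
parseGameLogs(logger, gameLogs, 2015, 2504211)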
Esempio n. 19
0
def parseYear(team_name, year_url, year):
    """
    parses the games table for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    stores one doc per game row (tagged with team_name, year, and year_url) in nfl_data.team_stats_weekly
    """
    logger = makeLogger(cleanKey(team_name) + "_" + str(year), r"./logs_pfrTeamStats/")

    startTime = datetime.now()

    logger.debug("Starting %d", year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient("localhost", 27017)
    db = client["nfl_data"]
    col_team_stats_weekly = db["team_stats_weekly"]

    # need to fix this to actually detect duplicate
    # if col_team_stats_weekly.find({'year': year}).count():
    #     logger.debug('Already parsed %s', year)
    #     closeLogger(logger)
    #     return None

    wait = random.uniform(1.5, 3.5)
    logger.debug("Waiting %f", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", year_url)
    table = browser.find(id="games")
    rows = table.find_all("tr")
    header = [cleanKey(each.attrs["data-stat"]) for each in rows[0].find_all("th")]
    rows = rows[1:]

    row_dicts = []
    for index, row in enumerate(rows):
        logger.debug("Row %d of %d", index, len(rows))
        try:
            week_number = convertToNumber(row.find("th").text)
            row_values = [convertToNumber(value.text) for value in row.find_all("td")]
            row_values.insert(0, week_number)
            row_dict = dict(zip(header, row_values))
            row_dict["year"] = year
            row_dict["team_name"] = team_name
            row_dict["year_url"] = year_url

            if row_dict["game_date"].lower() == "playoffs":
                continue

            row_dicts.append(row_dict)
        except:
            logger.exception(row)

    logger.debug("team_stats_weekly.insert_many")

    if row_dicts:
        col_team_stats_weekly.insert_many(row_dicts)
    else:
        logger.debug("Nothing to insert")

    logger.debug("parseYear time elapsed: " + str(datetime.now() - startTime))

    closeLogger(logger)
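
A hedged usage sketch: parseYear builds its own logger and Mongo client, so a caller only supplies a team name, the pro-football-reference games URL, and the year. The team name here is a sample value.

year = 2015
year_url = 'http://www.pro-football-reference.com/years/{}/games.htm'.format(year)
parseYear('New England Patriots', year_url, year)  # sample team name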
Esempio n. 20
0
def parseCareerStats(logger, careerStats, player_profile_id):
    """
    Parses career stats page for given player
    Stores each row in its own doc
    """
    startTime = datetime.now()

    logger.debug('Starting careerStats')

    careerStats_list = []
    for tableNumber, careerStat in enumerate(careerStats):
        logger.debug('Table %d of %d', tableNumber, len(careerStats))
        try:
            tableName = careerStat.find("div").text.strip()

            tableKey = careerStat.find_all(class_="player-table-key")[-1]
            tableKey = tableKey.find_all('td')

            tableItems = careerStat.find("tbody").find_all("td")

            rowDict = {
                'category': tableName,
                'player_profile_id': player_profile_id
            }
            rowYear = None
            tableColumn = 0
        except:
            logger.exception('failed parsing table')
            continue

        for index, item in enumerate(tableItems, start=1):
            try:
                if 'class' in item.attrs:
                    if item.attrs['class'][0] == 'border-td':
                        continue
                    if item.attrs['class'][0] == 'player-totals':
                        break
                if tableColumn == 0:
                    logger.debug('Row %d of %d', index, len(tableItems))
                    rowDict['year'] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    continue

                rowDict[cleanKey(
                    tableKey[tableColumn].text)] = convertToNumber(
                        item.text.strip())

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    careerStats_list.append(rowDict)
                    rowDict = {
                        'category': tableName,
                        'player_profile_id': player_profile_id
                    }
                    tableColumn = 0
            except:
                logger.exception('failed parsing row %d of %s', index,
                                 tableName)
                tableColumn = len(tableKey)  # jump the column counter past the rest of this row
                rowDict = {
                    'category': tableName,
                    'player_profile_id': player_profile_id
                }

    try:
        logger.debug('Bulk Creating careerStats_list')
        if careerStats_list:
            col_player_career_stats.insert_many(careerStats_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseCareerStats time elapsed: ' +
                 str(datetime.now() - startTime))
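
A minimal calling sketch with the same caveats as the game-log example: the URL shape, table selector, and player id are assumptions, and col_player_career_stats is assumed to be configured at module level.

# hypothetical driver for parseCareerStats -- concrete values are assumptions
logger = makeLogger('careerstats_demo', r'./logs_playerStats/')
browser = RoboBrowser(history=False, parser='html5lib',
                      user_agent=get_user_agent(logger), timeout=10)
browser = open_or_follow_link(logger, browser, 'open',
                              'http://www.nfl.com/player/tombrady/2504211/careerstats')
careerStats = browser.find_all(class_='data-table1')  # assumed table class
parseCareerStats(logger, careerStats, 2504211)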
Esempio n. 21
0
def parseSeason(role, category, season, seasonTypes):
    """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats for a given role/category/season
    doesn't follow any links
    some years don't have any info, but they still return a page.
    These are logged with Exception('No teams found %s' % url)
    All data is stored in team_stats
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text,
                        r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {
            'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }

        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5, 3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role[
            'value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category[
                    'value'] + '&defensiveStatisticCategory=null'

            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category[
                    'value']
            else:
                raise Exception('Unsupported role: %s' % role.text)

            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType[
                'value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False,
                                  parser='html5lib',
                                  user_agent=get_user_agent(logger),
                                  timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False,
                                      parser='html5lib',
                                      user_agent=get_user_agent(logger),
                                      timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")
            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
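                # two header rows: expand the grouped top row by colspan so its
                # labels align index-for-index with the bottom row's columns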
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[
                        tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' +
                 str(datetime.now() - startTime))

    closeLogger(role.text + '_' + category.text)
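
A hedged sketch of what parseSeason expects as input: it reads both .text and ['value'] from role, category, season, and each seasonType, so all four arguments must be <option> tags. The tags below are fabricated with BeautifulSoup purely for illustration (the real ones come from the categorystats form), and the module-level col_team_stats collection is assumed to be configured.

from bs4 import BeautifulSoup

# hypothetical option tags -- the values are assumptions, not real form values
soup = BeautifulSoup(
    '<option value="OPP">Defense</option>'
    '<option value="INTERCEPTIONS">Interceptions</option>'
    '<option value="2015">2015</option>'
    '<option value="">Season Type...</option>'
    '<option value="REG">Regular Season</option>', 'html.parser')
role, category, season, placeholder, regular = soup.find_all('option')
parseSeason(role, category, season, [placeholder, regular])  # placeholder is skipped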
Esempio n. 22
0
def parseWeek(year, week):
    """Parsing a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detail links, which is where most of the data is scraped.
    Scrapes weather and stadium info for each game of the week, and stores them in their respective collections
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html5lib',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    browser = open_or_follow_link(
        logger, browser, 'open',
        "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}

        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detailsLink = 'http://nflweather.com' + details.find(
                    'a')['href']
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open',
                                              detailsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(
                    ':', 1)[1].strip()
                awayTeam = browser.find_all(
                    class_='g-away')[1].find('a').text.replace('  ',
                                                               ' ').strip()
                homeTeam = browser.find_all(
                    class_='g-home')[1].find('a').text.replace('  ',
                                                               ' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')

                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')

                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(
                            split[0].strip())] = convertToNumber(
                                split[1].strip())

                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
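                        # the stadium name is not a 'key: value' pair; it sits in
                        # the <p> directly above the Surface line, so grab it there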
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[
                                index - 1].text.strip()
                        stadiumInfo[cleanKey(
                            split[0].strip())] = convertToNumber(
                                split[1].strip())

                #find the matching nfl_schedule doc, update its gameTime, then insert its _id into both info dicts and append them to the lists
                schedule_query = {
                    'year': year,
                    'week': week,
                    'homeTeam': homeTeam,
                    'awayTeam': awayTeam
                }
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(
                        weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_schedule doc not found " + error_docs)
                result = col_schedule.update_one(
                    schedule_query, {'$set': {
                        'dateTime': gameTime
                    }})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')
    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
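
A minimal driver sketch: parseWeek does its own duplicate check, waiting, and logging, so a caller just loops over weeks. The 17-week range is an assumption for a regular season, and col_weather_info, col_stadium_info, and col_schedule are assumed to be module-level, as the function body expects.

for week in range(1, 18):  # 17 regular-season weeks (assumed range)
    parseWeek(2015, week)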