Example #1
import json
import logging
import re
import time
import urllib.parse
import uuid
from operator import itemgetter

import dateutil.parser
import requests

import saltytools  # project-local helpers: make_soup, normalise_caseless, ...

# module-level constants assumed by this excerpt (values are guesses)
USER_AGENT = 'saltystats scraper'
SLEEP_TIME = 1  # seconds to wait between requests


def main(args):
    """Use this function to generate a JSON file of player profile data, for
    seeding our database.
    """

    # init logging
    logging.basicConfig(filename='scrape_players_json.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.info('Starting player scrape...')

    # seasons to process, descending order
    years = [2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007]

    all_players = []

    for year in years:

        logging.info('Begin scraping page for year {}'.format(year))

        # make a urllib url object for the year's player list page
        list_url = urllib.parse.urlparse('http://www.rugbyleagueproject.org/'
                                         'seasons/nrl-{}/players.html'.format(
                                             year))
        logging.info('Requesting {}'.format(list_url))

        try:
            list_soup = saltytools.make_soup(list_url, USER_AGENT)
        except requests.exceptions.HTTPError as err:
            logging.exception(err)
            break

        # find all links to individual player detail pages
        player_links = list_soup.find_all(href=re.compile(r'players/\d+'))

        # load individual player detail pages
        for link in player_links:

            # dict to fill with player data
            player = {}
            player['uuid'] = str(uuid.uuid4())

            # make url and parse
            player_url = urllib.parse.urlparse(
                'http://www.rugbyleagueproject.org' + link['href'])
            logging.info('Requesting {}'.format(player_url))
            try:
                player_soup = saltytools.make_soup(player_url, USER_AGENT)
            except requests.exceptions.HTTPError as err:
                logging.exception(err)
                break

            # process player name
            name = player_soup.h1.get_text()
            name = saltytools.normalise_caseless(name)
            player['name'] = name
            print(player['name'])
            logging.info('Player name: {}'.format(player['name']))
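
            # normalise_caseless isn't shown in this excerpt; it's
            # assumed to do Unicode-aware caseless normalisation, along
            # the lines of the well-known idiom:
            #
            #   import unicodedata
            #   def normalise_caseless(text):
            #       return unicodedata.normalize('NFKD', text.casefold())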

            # process player DOB
            born_dt = player_soup.find('dt', text='Born')

            if born_dt is None:
                player['dob'] = None
            else:
                dob = born_dt.find_next_sibling('dd').get_text()
                player['dob'] = dateutil.parser.parse(dob).date().isoformat()

            print(player['dob'])
            logging.info('Player DOB: {}'.format(player['dob']))

            # process player career
            nrl_year_td = player_soup.find_all('td',
                                               text=re.compile(r'NRL\s\d+'))

            # use a fresh name here so we don't shadow the outer `years`
            # list that the enclosing loop iterates over
            career_years = [int(td.get_text().split()[1])
                            for td in nrl_year_td]

            teams = [td.find_previous_sibling('td').get_text()
                     for td in nrl_year_td]

            career = dict(zip(career_years, teams))

            # drop {year: team} pairs prior to 2007 as we don't care about
            # these matches; normalise the teams from years we do care about
            for career_year in list(career):
                if career_year < 2007:
                    career.pop(career_year, None)
                else:
                    career[career_year] = saltytools.process_team(
                        career[career_year])

            player['career'] = career
            logging.info('Processed career')

            # Duplicate handling:
            # Because we process years in descending order, we don't
            # need to update the career information when we find the
            # player a second time.
            # The first occurrence of the player will always be the
            # newest, and hence have the most up-to-date career info.
            is_duplicate = False

            for ex in all_players:
                if ex['name'] == player['name'] and ex['dob'] == player['dob']:
                    is_duplicate = True
                    break

            logging.info('Duplicate: {}'.format(is_duplicate))

            if not is_duplicate:
                all_players.append(player)
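
            # A hypothetical alternative to the linear scan above: track
            # (name, dob) keys in a set for an O(1) duplicate check, e.g.
            #
            #   key = (player['name'], player['dob'])
            #   if key not in seen:   # with seen = set() initialised
            #       seen.add(key)     # before the year loop
            #       all_players.append(player)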

            time.sleep(SLEEP_TIME)

            # break  # end player list loop

            # break  # end year loop

    all_players_asc = sorted(all_players, key=itemgetter('name'))
    with open('players.json', 'w') as outfile:
        json.dump(all_players_asc, outfile)
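
Both examples lean on a saltytools.make_soup helper whose source isn't shown. Judging from the call sites, it takes a urllib.parse result plus a user-agent string, raises requests.exceptions.HTTPError on a failed request, and returns a parsed BeautifulSoup object. A minimal sketch under those assumptions (not the project's actual code):

import requests
from bs4 import BeautifulSoup

def make_soup(urlobj, user_agent):
    """Fetch the page at urlobj (a urllib.parse.ParseResult) and
    return it parsed as a BeautifulSoup object."""
    response = requests.get(urlobj.geturl(),
                            headers={'User-Agent': user_agent})
    # surface 4xx/5xx responses as requests.exceptions.HTTPError,
    # matching the try/except blocks in the callers above
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')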
Example #2
import argparse
from urllib.parse import urlparse

import saltytools
import saltysql

# assumed project-local imports and constants not shown in this excerpt
from matchdata import MatchDataAfl, MatchDataRlp  # hypothetical module
from errors import UnknownDatasourceError         # hypothetical module
VERSION = '0.1'                    # guess
SQLITE_FILE = 'saltystats.sqlite'  # guess
USER_AGENT = 'saltystats scraper'  # guess


def main(args):

    # handle options
    parser = argparse.ArgumentParser(description='''
        SaltyStats: the NRL stats scraper''')
    parser.add_argument('type', choices=['match'])
    parser.add_argument('store', choices=['csv', 'sqlite'])
    parser.add_argument('url')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--version', action='version', version=VERSION)
    args = parser.parse_args()
    print('Arguments: ', vars(args))

    # parse URL
    urlobj = urlparse(args.url)

    # parse HTML w/BeautifulSoup
    match_soup = saltytools.make_soup(urlobj, USER_AGENT)

    # parse URL for datasource
    kw = saltytools.datasource_kw(urlobj)
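
    # datasource_kw isn't shown in this excerpt; a plausible sketch (an
    # assumption, not the real saltytools code) would map the URL's
    # hostname to one of the supported keywords:
    #
    #   def datasource_kw(urlobj):
    #       if 'afltables' in urlobj.netloc:
    #           return 'afltables'
    #       if 'rugbyleagueproject' in urlobj.netloc:
    #           return 'rlproject'
    #       return None  # unknown datasource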

    # choose object type based on datasource URL
    if kw == 'afltables':
        match_data = MatchDataAfl(match_soup)
    elif kw == 'rlproject':
        match_data = MatchDataRlp(match_soup)
    else:
        # raise for any unrecognised datasource too, so match_data can
        # never be referenced while unbound
        raise UnknownDatasourceError(
            "Couldn't parse datasource from {url}. "
            "Is this site supported?".format(url=urlobj.geturl()))

    print('\n### HOME ###')
    print('team name (raw): ', match_data.home)
    print('team name: ', saltytools.process_team(match_data.home))
    print('team score: ', match_data.home_score)
    print('team scrums: ', match_data.home_scrums)
    print('team penalties: ', match_data.home_penalties)

    print('players: ', str(match_data.home_players))
    print('try scorers: ', str(match_data.home_tryscorers))
    print('goal scorers: ', str(match_data.home_goalscorers))
    print('field goal scorers: ', str(match_data.home_fgoalscorers))

    print('\n### AWAY ###')
    print('team name (raw): ', match_data.away)
    print('team name: ', saltytools.process_team(match_data.away))
    print('team score: ', match_data.away_score)
    print('team scrums: ', match_data.away_scrums)
    print('team penalties: ', match_data.away_penalties)

    print('players: ', str(match_data.away_players))
    print('try scorers: ', str(match_data.away_tryscorers))
    print('goal scorers: ', str(match_data.away_goalscorers))
    print('field goal scorers: ', str(match_data.away_fgoalscorers))

    print('\n### META ###')
    print('ref(s): ', match_data.referees)
    print('venue (raw): ', match_data.venue)
    print('venue: ', saltytools.process_venue(match_data.venue))
    print('crowd: ', match_data.crowd)
    print('date: ', match_data.date)
    print('time: ', match_data.time)
    print('round: ', match_data.round)

    print('\n### STRUCTURE ###')
    print('__repr__: ', match_data)

    print('\n### OUTPUT ###')
    if args.store == 'csv':
        filename = 'export/{} vs {} {}.saltystats_{}.csv'.format(
            match_data.home, match_data.away, match_data.date, VERSION)
        print('write csv...')
        match_data.write_csv(filename)

    elif args.store == 'sqlite':
        print('do sql stuffs...')

        # setup db
        print('Initialising database "{}"'.format(SQLITE_FILE))
        print('Result: {}'.format(saltysql.create_database(SQLITE_FILE)))

        # seed db
        print('Seeding players from players.json')
        print('Result: {}'.format(saltysql.seed_players('players.json',
                                                        SQLITE_FILE)))

        # ensure home, away, venue and round exist, and get their IDs
        home_canonical = saltytools.process_team(match_data.home)
        print('Home ID: {}'.format(saltysql.insert_team(home_canonical,
                                                        SQLITE_FILE)))

        away_canonical = saltytools.process_team(match_data.away)
        print('Away ID: {}'.format(saltysql.insert_team(away_canonical,
                                                        SQLITE_FILE)))

        venue_canonical = saltytools.process_venue(match_data.venue)
        print('Venue ID: {}'.format(saltysql.insert_venue(venue_canonical,
                                                          SQLITE_FILE)))

        print('Round ID: {}'.format(saltysql.insert_round(
            match_data.round, match_data.date.year, SQLITE_FILE)))

        print('Home players:')
        # For player disambiguation, we must find each player's uuid in the
        # json data file and use it to find the correct player_id.
        # First look up by name. This will work in 95% of cases.
        # If we get a name clash, we need to disambiguate.
        # For match datasources that include a DOB, we can compare the
        # incoming DOB to the DOBs of the clashing players in the json data
        # file. Otherwise, we can look at the clashing players' careers in
        # the json data file and find which one was playing for the incoming
        # player's team at the time of the match in question (see the sketch
        # after this example).

        # Query the db to find if multiple players exist with this name.
        # The minimum we know about a player at this stage is his name and
        # that he played for a certain team in a certain year; we can parse
        # the json data file and find the right guy with this info.

        # demo: look up the uuid for one of the home players (index 13)
        # by name, team and season year against the json data file
        print(saltytools.find_player_uuid(
            match_data.home_players[13], match_data.home, match_data.date.year,
            'players.json'))

    return 0
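
The career-based disambiguation described in the comments above presumably lives in saltytools.find_player_uuid. A minimal sketch, assuming the players.json structure produced by Example #1 (uuid, name, dob and a {year: team} career mapping) and reusing the other saltytools helpers:

import json

import saltytools  # assumed: normalise_caseless, process_team

def find_player_uuid(name, team, year, json_path):
    """Return the uuid of the named player, using career info (who
    played for `team` in `year`) to break name clashes."""
    target_name = saltytools.normalise_caseless(name)
    target_team = saltytools.process_team(team)

    with open(json_path) as infile:
        players = json.load(infile)

    candidates = [p for p in players if p['name'] == target_name]
    if len(candidates) == 1:
        return candidates[0]['uuid']

    # Name clash: check careers. json.dump stringifies the integer
    # year keys, so compare against str(year).
    for p in candidates:
        if p['career'].get(str(year)) == target_team:
            return p['uuid']

    return None  # no unambiguous match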