Example #1
import sqlite3

import saltytools  # project-local helpers (normalise_caseless, etc.)


def insert_player(player_name, player_dob, json_uuid, filename):
    """Inserts a player into the database if that player does not already
    exist.

    Returns the player_id
    Args:

    """

    conn = sqlite3.connect(filename)
    c = conn.cursor()

    # store the caseless-normalised name so later lookups are consistent
    normalised_name = saltytools.normalise_caseless(player_name)
    values = (normalised_name, player_dob, json_uuid)

    # NULL lets SQLite assign player_id; OR IGNORE makes a duplicate insert
    # a no-op (assuming a UNIQUE constraint on player_name)
    c.execute('INSERT OR IGNORE INTO PLAYER VALUES (NULL, ?, ?, ?)', values)
    conn.commit()

    # query by the normalised name, since that is the form that was stored
    c.execute('''
        SELECT player_id FROM PLAYER
        WHERE player_name = ?''', (normalised_name, ))
    player_row = c.fetchone()
    player_id = player_row[0]
    conn.close()

    return player_id
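
The INSERT OR IGNORE above only deduplicates if the PLAYER table enforces
uniqueness on the name. Below is a minimal sketch of a compatible setup and a
repeated call; the schema is an assumption inferred from the four-column
'VALUES (NULL, ?, ?, ?)' insert, not the project's actual definition:

import sqlite3

# Hypothetical schema: player_id aliases the rowid, and the UNIQUE
# constraint on player_name is what makes INSERT OR IGNORE a no-op
# when the same player is inserted twice.
conn = sqlite3.connect('players.db')
conn.execute('''
    CREATE TABLE IF NOT EXISTS PLAYER (
        player_id   INTEGER PRIMARY KEY,
        player_name TEXT NOT NULL UNIQUE,
        player_dob  TEXT,
        json_uuid   TEXT
    )''')
conn.commit()
conn.close()

# Inserting the same (illustrative) player twice returns the same id.
pid_a = insert_player('example player', '1990-01-01', 'uuid-1', 'players.db')
pid_b = insert_player('example player', '1990-01-01', 'uuid-2', 'players.db')
assert pid_a == pid_b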
Example #2
import json
import logging
import re
import time
import urllib.parse
import uuid
from operator import itemgetter

import dateutil.parser
import requests

import saltytools  # project-local helpers (make_soup, process_team, etc.)

# USER_AGENT and SLEEP_TIME are module-level constants defined elsewhere
# in the project.


def main(args):
    """Use this function to generate a JSON file of player profile data, for
    seeding our database.
    """

    # init logging
    logging.basicConfig(filename='scrape_players_json.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.info('Starting player scrape...')

    # seasons to process, descending order
    years = [2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007]

    all_players = []

    for year in years:

        logging.info('Begin scraping page for year {}'.format(year))

        # build the URL for the year's player list page
        list_url = urllib.parse.urlparse(
            'http://www.rugbyleagueproject.org/seasons/'
            'nrl-{}/players.html'.format(year))
        logging.info('Requesting {}'.format(list_url))

        try:
            list_soup = saltytools.make_soup(list_url, USER_AGENT)
        except requests.exceptions.HTTPError as err:
            logging.exception(err)
            break  # a failed list page aborts this and the remaining years

        # find all links to individual player detail pages
        player_links = list_soup.find_all(href=re.compile(r'players/\d+'))

        # load individual player detail pages
        for link in player_links:

            # dict to fill with player data
            player = {}
            player['uuid'] = str(uuid.uuid4())

            # make url and parse
            player_url = urllib.parse.urlparse(
                'http://www.rugbyleagueproject.org' + link['href'])
            logging.info('Requesting {}'.format(player_url))
            try:
                player_soup = saltytools.make_soup(player_url, USER_AGENT)
            except requests.exceptions.HTTPError as err:
                logging.exception(err)
                continue  # skip this player but keep the rest of the year

            # process player name
            name = player_soup.h1.get_text()
            name = saltytools.normalise_caseless(name)
            player['name'] = name
            print(player['name'])
            logging.info('Player name: {}'.format(player['name']))

            # process player DOB
            born_dt = player_soup.find('dt', text='Born')

            if born_dt is None:
                player['dob'] = None
            else:
                dob = born_dt.find_next_sibling('dd').get_text()
                player['dob'] = dateutil.parser.parse(dob).date().isoformat()

            print(player['dob'])
            logging.info('Player DOB: {}'.format(player['dob']))

            # process player career: each NRL season is a cell like
            # "NRL 2013", with the team name in the preceding cell
            nrl_year_td = player_soup.find_all('td',
                                               text=re.compile(r'NRL\s\d+'))

            # use a fresh name so we don't shadow the outer `years` list
            career_years = [int(td.get_text().split()[1])
                            for td in nrl_year_td]

            teams = [td.find_previous_sibling('td').get_text()
                     for td in nrl_year_td]

            career = dict(zip(career_years, teams))

            # drop {year: team} pairs prior to 2007 as we don't care about
            # those matches; normalise the teams from years we do keep
            # (career_year avoids clobbering the outer loop's `year`)
            for career_year in list(career):
                if career_year < 2007:
                    career.pop(career_year, None)
                else:
                    career[career_year] = saltytools.process_team(
                        career[career_year])

            player['career'] = career
            logging.info('Processed career')

            # Duplicate handling:
            # Because we process years in descending order, we don't need to
            # update the career information when we find the player a second
            # time. The first occurrence of the player will always be the
            # newest, and hence have the most up-to-date career info.
            is_duplicate = any(
                ex['name'] == player['name'] and ex['dob'] == player['dob']
                for ex in all_players)

            logging.info('Duplicate: {}'.format(is_duplicate))

            if not is_duplicate:
                all_players.append(player)

            time.sleep(SLEEP_TIME)

    all_players_asc = sorted(all_players, key=itemgetter('name'))
    with open('players.json', 'w') as outfile:
        json.dump(all_players_asc, outfile)
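
Once main() has written players.json, a short loop can feed each profile to
insert_player from Example #1. This is a sketch under the assumption that the
file layout matches what main() produces and that a players.db with a
compatible schema already exists:

import json

# Seed the database from the scraped profiles (career data is not
# inserted here; insert_player only stores name, DOB and UUID).
with open('players.json') as infile:
    players = json.load(infile)

for p in players:
    player_id = insert_player(p['name'], p['dob'], p['uuid'], 'players.db')
    print('{}: player_id {}'.format(p['name'], player_id))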