def scrape_data(self): print self.name, self.overview_url if self.overview_url_content is not None: raise Exception("Can't populate this!") overview_soup = getSoupFromURL(self.overview_url) self.overview_url_content = overview_soup.text try: pos = filter(lambda x: 'Position:' in x, [p.text for p in overview_soup.findAll('p')])[0].strip().replace('\n','') self.positions = re.findall(self.POSN_PATTERN, pos)[0].strip().encode("utf8").split(" and ") self.height = overview_soup.find('span', {'itemprop':'height'}).text self.weight = overview_soup.find('span', {'itemprop':'weight'}).text[:-2] except Exception as ex: logging.error(ex.message) self.positions = [] self.height = None self.weight = None # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs' # so we can use those to pull out our urls. for li in overview_soup.find_all('li'): game_log_links = [] if 'Game Logs' in li.getText(): game_log_links = li.findAll('a') for game_log_link in game_log_links: if 'gamelog' in game_log_link.get('href'): self.gamelog_url_list.append('http://www.basketball-reference.com' + game_log_link.get('href'))
def scrape_data(self): print self.name,self.overview_url if self.overview_url_content is not None: raise Exception("Can't populate this!") overview_soup = getSoupFromURL(self.overview_url) self.overview_url_content = overview_soup.text try: player_position_text = overview_soup.findAll(text=re.compile(u'(Point Guard|Center|Power Forward|Shooting Guard|Small Forward)'))[0] player_height_text = overview_soup.findAll(text=re.compile(self.HEIGHT_PATTERN))[0] player_weight_text = overview_soup.findAll(text=re.compile(self.WEIGHT_PATTERN))[0] self.height = re.findall(self.HEIGHT_PATTERN,player_height_text)[0].strip().encode("utf8") self.weight = re.findall(self.WEIGHT_PATTERN,player_weight_text)[0].strip().encode("utf8") tempPositions = re.findall(self.POSN_PATTERN,player_position_text) self.positions = [position.strip().encode("utf8") for position in tempPositions] self.salaries = self.findSalaries(overview_soup) self.age = self.findAge(overview_soup) except Exception as ex: logging.error(ex.message) self.positions = [] self.height = None self.weight = None # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs' # so we can use those to pull out our urls. for li in overview_soup.find_all('li'): game_log_links = [] if 'Game Logs' in li.getText(): game_log_links = li.findAll('a') for game_log_link in game_log_links: self.gamelog_url_list.append('http://www.basketball-reference.com' + game_log_link.get('href'))
def scrape_data(self): print self.name,self.overview_url if self.overview_url_content is not None: raise Exception("Can't populate this!") overview_soup = getSoupFromURL(self.overview_url) self.overview_url_content = overview_soup.text try: player_infotext = overview_soup.findAll('p',attrs={'class':'padding_bottom_half'})[0].text.split('\n')[0] self.positions = re.findall(self.POSN_PATTERN,player_infotext)[0].strip().encode("utf8").split(" and ") self.height = re.findall(self.HEIGHT_PATTERN,player_infotext)[0].strip().encode("utf8") self.weight = re.findall(self.WEIGHT_PATTERN,player_infotext)[0].strip().encode("utf8") except Exception as ex: logging.error(ex.message) self.positions = [] self.height = None self.weight = None # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs' # so we can use those to pull out our urls. for li in overview_soup.find_all('li'): game_log_links = [] if 'Game Logs' in li.getText(): game_log_links = li.findAll('a') for game_log_link in game_log_links: self.gamelog_url_list.append('http://www.basketball-reference.com' + game_log_link.get('href'))
def scrape_data(self):
    """Fetch the player's overview page and scrape position, height,
    weight, nicknames, teams, and the per-season game-log URLs.

    Populates self.overview_url_content, self.positions, self.nicknames,
    self.height, self.weight, self.gamelog_url_list and
    self.gamelog_url_dict (keyed by season label).

    Raises:
        Exception: if the overview page content was already populated.
    """
    print(self.name, self.overview_url)
    if self.overview_url_content is not None:
        raise Exception("Can't populate this!")

    overview_soup = getSoupFromURL(self.overview_url)
    # BUGFIX: type(x).__name__ is the bare class name 'BeautifulSoup'
    # (no 'bs4.' module prefix), so the old comparison against
    # 'bs4.BeautifulSoup' never matched and overview_url_content was
    # never populated. Also dropped the leftover debug print.
    if type(overview_soup).__name__ == 'BeautifulSoup':
        self.overview_url_content = overview_soup.get_text()

    try:
        # Locate the text nodes matching the class-level compiled
        # patterns, then extract the actual values from them.
        player_position_text = overview_soup.find_all(
            text=self.POSN_PATTERN)[0]
        player_height_text = overview_soup.find_all(
            text=self.HEIGHT_PATTERN)[0]
        player_weight_text = overview_soup.find_all(
            text=self.WEIGHT_PATTERN)[0]
        self.height = self.HEIGHT_PATTERN.findall(
            player_height_text)[0].strip()
        self.weight = self.WEIGHT_PATTERN.findall(
            player_weight_text)[0].strip()
        tempPositions = self.POSN_PATTERN.findall(player_position_text)
        self.positions = [position.strip() for position in tempPositions]
        self.scrape_player_nicknames(overview_soup)
        self.scrape_teams(overview_soup)
    except Exception as ex:
        logging.error(ex)
        self.positions = []
        self.nicknames = []
        self.height = None
        self.weight = None

    # The links to each year's game logs are in <li> tags, and the text
    # contains 'Game Logs' so we can use those to pull out our urls.
    link_prefix = "https://www.basketball-reference.com"
    for li in overview_soup.find_all('li'):
        if 'Game Logs' in li.getText():
            all_links = li.findAll('a')
            for link in all_links:
                link_suffix = link.get('href')
                if "/gamelog/" in link_suffix:
                    full_link = link_prefix + link_suffix
                    season = link.get_text().strip()
                    self.gamelog_url_list.append(full_link)
                    self.gamelog_url_dict[season] = full_link
        # Stop after the first <li> that actually yielded game-log links.
        if len(self.gamelog_url_list) > 0:
            break
def scrape_data(self):
    """Fetch the overview page and scrape the team data.

    Populates self.overview_url_content and, via scrape_teams,
    self.teams (reset to {} on failure).

    Raises:
        Exception: if the overview page content was already populated.
    """
    print(self.name, self.overview_url)
    if self.overview_url_content is not None:
        raise Exception("Can't populate this!")

    overview_soup = getSoupFromURL(self.overview_url)
    self.overview_url_content = overview_soup.text

    try:
        self.scrape_teams(overview_soup)
    except Exception as ex:
        # BUGFIX: ex.message does not exist on Python 3 (this variant
        # already uses print() calls) -- it would raise AttributeError
        # inside the except block. Log the exception itself.
        logging.error(ex)
        self.teams = {}
def scrape_data(self):
    """Fetch the overview page and scrape location and former names
    from the bio ('meta') section.

    Populates self.overview_url_content and, via the scrape_* helpers,
    self.location and self.former_names (reset on failure).

    Raises:
        Exception: if the overview page content was already populated.
    """
    print(self.name, self.overview_url)
    if self.overview_url_content is not None:
        raise Exception("Can't populate this!")

    overview_soup = getSoupFromURL(self.overview_url)
    self.overview_url_content = overview_soup.get_text()

    try:
        # The bio block is the div with id="meta"; only its <p> lines
        # that contain a <strong> label carry field data.
        bio_soup = overview_soup.find('div', attrs={"id": "meta"})
        bio_lines = bio_soup.find_all('p')
        bio_text_lines = [line for line in bio_lines
                          if line.find("strong") is not None]
        self.scrape_location(bio_text_lines)
        self.scrape_former_names(bio_text_lines)
    except Exception as ex:
        # BUGFIX: ex.message does not exist on Python 3 (this variant
        # already uses print() calls) -- it would raise AttributeError
        # inside the except block. Log the exception itself.
        logging.error(ex)
        self.location = {}
        self.former_names = []
def scrape_data(self): print self.name, self.overview_url if self.overview_url_content is not None: raise Exception("Can't populate this!") overview_soup = getSoupFromURL(self.overview_url) self.overview_url_content = overview_soup.text try: player_infotext = overview_soup.findAll( 'p', attrs={'class': 'padding_bottom_half'})[0].text.split('\n')[0] self.positions = re.findall( self.POSN_PATTERN, player_infotext)[0].strip().encode("utf8").split(" and ") self.height = re.findall(self.HEIGHT_PATTERN, player_infotext)[0].strip().encode("utf8") self.weight = re.findall(self.WEIGHT_PATTERN, player_infotext)[0].strip().encode("utf8") except Exception as ex: logging.error(ex.message) self.positions = [] self.height = None self.weight = None # the links to each year's game logs are in <li> tags, and the text contains 'Game Logs' # so we can use those to pull out our urls. for li in overview_soup.find_all('li'): game_log_links = [] if 'Game Logs' in li.getText(): game_log_links = li.findAll('a') for game_log_link in game_log_links: self.gamelog_url_list.append( 'http://www.basketball-reference.com' + game_log_link.get('href'))