def roster_parser(season_id, team_id, division=1):
    """Scrape a team's roster page and store Player and PlayerSeason rows."""
    team_season = TeamSeason.objects.select_related().get(
        team__ncaa_id=team_id, season__end_year=season_id)
    url = "http://stats.ncaa.org/team/index/%s?org_id=%s" % (
        team_season.season.ncaa_id, team_id)
    soup = soupify(url)
    rows = soup.findAll("table")[2].findAll("tr")
    player_links = rows[2:len(rows)]  # skip the header rows
    for p in player_links:
        try:
            # players with a profile link carry their NCAA id in the href
            ncaa_id = int(float(p.findAll("td")[1].find("a")["href"].split("=", 2)[2]))
            name = extract_player_name(p.findAll("td")[1].find("a").contents[0].split(","))
        except:
            # no profile link: fall back to the plain-text cell and a sentinel id
            ncaa_id = -1
            name = extract_player_name(p.findAll("td")[1].contents[0].split(","))
        player, player_created = Player.objects.get_or_create(name=name, ncaa_id=ncaa_id)
        player_season, ps_created = PlayerSeason.objects.get_or_create(
            player=player, team_season=team_season)
        if ps_created:
            try:
                player_season.jersey = int(p.findAll("td")[0].contents[0])
            except:
                player_season.jersey = None
            try:
                player_season.position = SafeUnicode(p.findAll("td")[2].contents[0])
                player_season.feet = int(p.findAll("td")[3].contents[0].split("-")[0])
                player_season.inches = int(p.findAll("td")[3].contents[0].split("-")[1])
                player_season.year = SafeUnicode(p.findAll("td")[4].contents[0])
            except:
                pass
            player_season.save()

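# The parsers in this file lean on a few helpers and module-level imports that
# are not shown here (the Django models Season, Team, TeamSeason, Player,
# PlayerSeason and Game, SafeUnicode, dateutil's parse, plus re, time and
# random for the scrapers below). The sketches that follow are assumptions
# about what those helpers do, inferred from how they are called, not the
# project's actual implementations.

# soupify() is called with a stats.ncaa.org URL by the NCAA parsers and with
# already-fetched markup by the Kiplinger/Insider Monkey scrapers. A minimal
# sketch covering both uses:
import urllib2
from bs4 import BeautifulSoup


def soupify(url_or_markup):
    # hypothetical helper: fetch the page if given a URL, then parse it
    if url_or_markup.startswith('http'):
        markup = urllib2.urlopen(url_or_markup).read()
    else:
        markup = url_or_markup
    return BeautifulSoup(markup)


# extract_player_name() receives the pieces of a "Last, First" roster cell
# split on the comma and presumably returns a cleaned "First Last" string:
def extract_player_name(name_parts):
    # hypothetical helper: turn ['Last', ' First'] into 'First Last'
    parts = [part.strip() for part in name_parts if part.strip()]
    if len(parts) >= 2:
        return '%s %s' % (parts[1], parts[0])
    return parts[0] if parts else ''
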
def extractLinks(url):
    """
    This function takes in the Kiplinger url and returns a list of tuples
    Each tuple has the (date, url) for a given article
    """
    page = attemptUrl(url)  # read the page, sub out newlines
    soup = soupify(page)
    list_of_articles = soup.find_all('div', {'id': re.compile(r'recent\[\d*\]')})
    list_of_links = []
    url_prefix = 'http://www.kiplinger.com'
    for article in list_of_articles:
        # grab the link to the article as well as the date
        article_link = article.findChild('h3').findChild('a').get('href')
        article_link = url_prefix + article_link
        article_date = article.findChild('h4').get_text()
        # some of the dates have 'From Kiplinger's Personal Finance, '
        article_date = re.sub(r'From Kiplinger\'s Personal Finance, ', '', article_date)
        # add to our list
        list_of_links.append((article_date, article_link))
    return list_of_links

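# attemptUrl() is also assumed rather than defined here. Given its name and
# the "read the page, sub out newlines" comment above, it most likely fetches
# the URL, retrying on transient failures, and returns the HTML with newlines
# collapsed. A sketch under those assumptions:
import re
import time
import urllib2


def attemptUrl(url, retries=3):
    # hypothetical helper: fetch url with simple retry/backoff, return flattened HTML
    for attempt in range(retries):
        try:
            html = urllib2.urlopen(url).read()
            return re.sub(r'[\r\n]+', ' ', html)
        except urllib2.URLError:
            time.sleep(2 ** attempt)
    raise IOError('unable to open: ' + url)
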
def extractBlogLinks(url):
    """
    This function takes in a url from insidermonkey.com/blog
    and extracts the blog links
    Also extracts the next page link
    Returns a tuple (blog_link_list, next_page_link)
    """
    page = attemptUrl(url)
    soup = soupify(page)
    # grab list of blog links
    content_list = soup.find_all('div', {'class': 'post'})
    blog_link_list = [x.findChild('h2').findChild('a').get('href') for x in content_list]
    # also grab the link to the next page
    try:
        next_tag = soup.find_all('div', {'class': 'navigation'})[0]
    except:
        print 'could not find next link'
        return (blog_link_list, None)
    # grab the link
    next_page_link = next_tag.findChild('a').get('href')
    return (blog_link_list, next_page_link)

def extractArticleContents(page_url):
    """
    This function takes the url of an article and extracts the title, date, and content
    Returns in the form of a tuple (title, date, content)
    """
    page = attemptUrl(page_url)
    soup = soupify(page)
    # First, we need to check if there is a "see all" option for the post
    if soup.find_all('div', {'class': 'see-all'}):
        # we found a 'see-all' tag, so grab the link
        see_all_tag = soup.find_all('div', {'class': 'see-all'})[0]
        see_all_link = see_all_tag.findChild('a').get('href')
        # and we will extract the full contents from that page, recursively
        return extractArticleContents(see_all_link)
    # otherwise, we will pull out the content, title, and date
    else:
        # grab the text
        try:
            # find where the text is in the doc
            content_obj = soup.find_all('div', {'class': 'blog-content-container clearfix'})[0]
            content_child = content_obj.findChild('div', {'class': 'post'})
            content_gchild = content_child.findChild('div', {'class': re.compile(r'content\-with.*-wrap')})
            # grab the text
            content_text = content_gchild.get_text()
        except:
            print 'could not extract text: ' + page_url
            content_text = None
        # grab the title
        try:
            title_obj = soup.find_all('div', {'class': 'single-post-title'})[0]
            title = title_obj.findChild('h1').get_text()
        except:
            print 'could not extract title: ' + page_url
            title = None
        # grab the date
        try:
            date_obj = soup.find_all('h6', {'class': 'date-line'})[0]
            date = date_obj.get_text()
            # strip out 'published' and timestamp
            date = re.sub(r'Published\:\s?', '', date)
            date = re.sub(r'\sat.*', '', date)
            date = re.sub(r'\W', '_', date)
        except IndexError:
            print 'could not extract the date: ' + page_url
            date = None
        return (title, date, content_text)

def extractArticleText(list_of_tuples):
    """
    Takes in a list of tuples returned by extractLinks
    Returns the text body of each article
    """
    # loop through list of tuples
    for (date, url) in list_of_tuples:
        # read the page
        try:
            page = attemptUrl(url)
        except:
            print 'unable to open: ' + url
            continue
        # if we succeeded in opening the url, then read the page
        soup = soupify(page)
        # search for the content body
        list_of_contents = soup.find_all('div', {'class': re.compile(r'kip\-column\-content')})
        # if we couldn't find anything, continue
        if not list_of_contents:
            print 'could not extract content'
            print 'url: ' + url
            continue
        page_text = ''
        # in case there are multiple pages for a given article
        for content_page in list_of_contents:
            # getting a strange error...with 'call-me-manny-the-arb'
            # will skip
            try:
                for paragraph in content_page.find_all('p'):
                    page_text = page_text + paragraph.get_text() + ' '
            except:
                print 'issue with ' + url
                continue
        print 'saving ' + url
        saveArticle(date, page_text, url)
        print 'save complete'
        print '\n'
        # wait before we start with the next link
        wait_time = round(max(0, random.gauss(0, 0.5)), 2)
        time.sleep(wait_time)
    return

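# saveArticle() is not defined in this file either. Judging from the call in
# extractArticleText (date, body text, source url) and the date slugging done
# in extractArticleContents, it probably writes the body to a text file named
# from the date and the URL slug. A guess at that behaviour (the 'articles'
# directory is illustrative only):
import os
import re


def saveArticle(date, page_text, url, out_dir='articles'):
    # hypothetical helper: persist one article body to disk
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    slug = re.sub(r'\W+', '_', url.rstrip('/').split('/')[-1])
    filename = os.path.join(out_dir, '%s_%s.txt' % (re.sub(r'\W+', '_', date), slug))
    with open(filename, 'w') as f:
        f.write(page_text.encode('utf-8'))
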
def team_parser(season_id=2011, division="1"):
    """Scrape the NCAA team list for a season and store Team and TeamSeason rows."""
    # defaults to division 1, but also supports division 3
    season = Season.objects.get(end_year=season_id)
    url = "http://stats.ncaa.org/team/inst_team_list/%s?division=%s" % (
        season.ncaa_id, division)
    soup = soupify(url)
    team_links = [x.find("a") for x in soup.findAll("td")]
    for team in team_links:
        ncaa_id = int(team["href"].split("=")[1])
        name = SafeUnicode(team.contents[0])
        t, created = Team.objects.get_or_create(ncaa_id=ncaa_id, name=name)
        team_season, created = TeamSeason.objects.get_or_create(
            team=t, season=season, division=int(division))

def schedule_parser(season_id, team_id):
    """Scrape a team's schedule page and run game_parser on every linked box score."""
    season = Season.objects.get(ncaa_id=season_id)
    url = "http://stats.ncaa.org/team/index/%s?org_id=%s" % (season_id, team_id)
    soup = soupify(url)
    game_ids = []
    # box score links live in the small-text cells of the schedule table
    links = soup.findAll("table")[1].findAll(
        lambda tag: tag.name == "a" and tag.findParent("td", attrs={"class": "smtext"})
    )
    for link in links:
        if not link.has_key("onclick"):
            game_ids.append(int(link["href"].split("?")[0].split("/")[3]))
    for game_id in game_ids:
        game_parser(game_id)

def game_parser(game_id, season_id=2011):
    """Scrape a box score page and store the Game row with scores and details."""
    url = "http://stats.ncaa.org/game/box_score/%s" % game_id
    soup = soupify(url)
    season = Season.objects.get(end_year=season_id)
    visit_id, home_id = [
        int(x["href"].split("=")[1]) for x in soup.findAll("table")[0].findAll("a")
    ]
    try:
        visit = TeamSeason.objects.select_related().get(team__ncaa_id=visit_id, season=season)
    except:
        # opponents outside the scraped division have no TeamSeason yet; create a placeholder
        v_team, created = Team.objects.get_or_create(
            ncaa_id=visit_id,
            name=soup.findAll("table")[0].findAll("a")[0].renderContents())
        visit = TeamSeason.objects.create(team=v_team, season=season, division=0)
    home = TeamSeason.objects.select_related().get(team__ncaa_id=home_id, season=season)
    game_details = soup.findAll("table")[2]
    dt = parse(game_details.findAll("td")[1].contents[0])
    loc = game_details.findAll("td")[3].contents[0]
    try:
        attend = int(game_details.findAll("td")[5].contents[0].replace(",", ""))
    except:
        attend = None
    officials = soup.findAll("table")[3].findAll("td")[1].contents[0].strip()
    scores = soup.findAll("table")[0].findAll("td", attrs={"align": "right"})
    visit_team_scores = [int(x.renderContents()) for x in scores[0:len(scores) / 2]]
    home_team_scores = [int(x.renderContents()) for x in scores[len(scores) / 2:len(scores)]]
    # second team listed is considered home team
    home_final = home_team_scores[(len(scores) / 2) - 1]
    visit_final = visit_team_scores[(len(scores) / 2) - 1]
    game, created = Game.objects.get_or_create(
        ncaa_id=game_id,
        home_team=home,
        visiting_team=visit,
        datetime=dt,
        location=SafeUnicode(loc),
        attendance=attend,
        officials=SafeUnicode(officials),
        home_team_score=home_final,
        visiting_team_score=visit_final,
    )

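# Taken together, the NCAA parsers are meant to run top-down: team_parser
# seeds the teams for a season, roster_parser fills in each roster, and
# schedule_parser walks each schedule and hands every box score to
# game_parser. A hedged driver sketch (crawl_season is not part of the
# original code; note that roster_parser takes the season's end year while
# schedule_parser takes its NCAA id):
def crawl_season(season_id=2011, division="1"):
    team_parser(season_id=season_id, division=division)
    for team_season in TeamSeason.objects.select_related().filter(
            season__end_year=season_id):
        roster_parser(season_id, team_season.team.ncaa_id)
        schedule_parser(team_season.season.ncaa_id, team_season.team.ncaa_id)
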