def roster_parser(season_id, team_id, division=1):
    """Scrape a team's roster page and store Player and PlayerSeason rows."""
    team_season = TeamSeason.objects.select_related().get(
        team__ncaa_id=team_id, season__end_year=season_id)
    url = "http://stats.ncaa.org/team/index/%s?org_id=%s" % (
        team_season.season.ncaa_id, team_id)
    soup = soupify(url)
    rows = soup.findAll("table")[2].findAll("tr")
    player_links = rows[2:len(rows)]  # skip the header rows
    for p in player_links:
        try:
            # players with a profile link carry their NCAA id in the href
            ncaa_id = int(float(p.findAll("td")[1].find("a")["href"].split("=", 2)[2]))
            name = extract_player_name(p.findAll("td")[1].find("a").contents[0].split(","))
        except:
            # no profile link: fall back to the plain-text cell and a sentinel id
            ncaa_id = -1
            name = extract_player_name(p.findAll("td")[1].contents[0].split(","))
        player, player_created = Player.objects.get_or_create(name=name, ncaa_id=ncaa_id)
        player_season, ps_created = PlayerSeason.objects.get_or_create(
            player=player, team_season=team_season)
        if ps_created:
            try:
                player_season.jersey = int(p.findAll("td")[0].contents[0])
            except:
                player_season.jersey = None
            try:
                player_season.position = SafeUnicode(p.findAll("td")[2].contents[0])
                player_season.feet = int(p.findAll("td")[3].contents[0].split("-")[0])
                player_season.inches = int(p.findAll("td")[3].contents[0].split("-")[1])
                player_season.year = SafeUnicode(p.findAll("td")[4].contents[0])
            except:
                pass
            player_season.save()

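# The parsers in this file lean on a few helpers and module-level imports that
# are not shown here (the Django models Season, Team, TeamSeason, Player,
# PlayerSeason and Game, SafeUnicode, dateutil's parse, plus re, time and
# random for the scrapers below). The sketches that follow are assumptions
# about what those helpers do, inferred from how they are called, not the
# project's actual implementations.

# soupify() is called with a stats.ncaa.org URL by the NCAA parsers and with
# already-fetched markup by the Kiplinger/Insider Monkey scrapers. A minimal
# sketch covering both uses:
import urllib2
from bs4 import BeautifulSoup


def soupify(url_or_markup):
    # hypothetical helper: fetch the page if given a URL, then parse it
    if url_or_markup.startswith('http'):
        markup = urllib2.urlopen(url_or_markup).read()
    else:
        markup = url_or_markup
    return BeautifulSoup(markup)


# extract_player_name() receives the pieces of a "Last, First" roster cell
# split on the comma and presumably returns a cleaned "First Last" string:
def extract_player_name(name_parts):
    # hypothetical helper: turn ['Last', ' First'] into 'First Last'
    parts = [part.strip() for part in name_parts if part.strip()]
    if len(parts) >= 2:
        return '%s %s' % (parts[1], parts[0])
    return parts[0] if parts else ''
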
def extractLinks(url):
    """
    This function takes in the Kiplinger url and returns a list of tuples
    Each tuple has the (date, url) for a given article
    """
    page = attemptUrl(url)  # read the page, sub out newlines
    soup = soupify(page)
    list_of_articles = soup.find_all('div', {'id': re.compile(r'recent\[\d*\]')})
    list_of_links = []
    url_prefix = 'http://www.kiplinger.com'
    for article in list_of_articles:
        # grab the link to the article as well as the date
        article_link = article.findChild('h3').findChild('a').get('href')
        article_link = url_prefix + article_link
        article_date = article.findChild('h4').get_text()
        # some of the dates have 'From Kiplinger's Personal Finance, '
        article_date = re.sub(r'From Kiplinger\'s Personal Finance, ', '', article_date)
        # add to our list
        list_of_links.append((article_date, article_link))
    return list_of_links

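# attemptUrl() is also assumed rather than defined here. Given its name and
# the "read the page, sub out newlines" comment above, it most likely fetches
# the URL, retrying on transient failures, and returns the HTML with newlines
# collapsed. A sketch under those assumptions:
import re
import time
import urllib2


def attemptUrl(url, retries=3):
    # hypothetical helper: fetch url with simple retry/backoff, return flattened HTML
    for attempt in range(retries):
        try:
            html = urllib2.urlopen(url).read()
            return re.sub(r'[\r\n]+', ' ', html)
        except urllib2.URLError:
            time.sleep(2 ** attempt)
    raise IOError('unable to open: ' + url)
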
def extractBlogLinks(url):
    """
    This function takes in a url from insidermonkey.com/blog
    and extracts the blog links
    Also extracts the next page link
    Returns a tuple (blog_link_list, next_page_link)
    """
    page = attemptUrl(url)
    soup = soupify(page)
    # grab list of blog links
    content_list = soup.find_all('div', {'class': 'post'})
    blog_link_list = [x.findChild('h2').findChild('a').get('href') for x in content_list]
    # also grab the link to the next page
    try:
        next_tag = soup.find_all('div', {'class': 'navigation'})[0]
    except:
        print 'could not find next link'
        return (blog_link_list, None)
    # grab the link
    next_page_link = next_tag.findChild('a').get('href')
    return (blog_link_list, next_page_link)

def extractArticleContents(page_url):
    """
    This function takes the url of an article and extracts the title, date, and content
    Returns in the form of a tuple (title, date, content)
    """
    page = attemptUrl(page_url)
    soup = soupify(page)
    # First, we need to check if there is a "see all" option for the post
    if soup.find_all('div', {'class': 'see-all'}):
        # we found a 'see-all' tag, so grab the link
        see_all_tag = soup.find_all('div', {'class': 'see-all'})[0]
        see_all_link = see_all_tag.findChild('a').get('href')
        # and we will extract the full contents from that page, recursively
        return extractArticleContents(see_all_link)
    # otherwise, we will pull out the content, title, and date
    else:
        # grab the text
        try:
            # find where the text is in the doc
            content_obj = soup.find_all('div', {'class': 'blog-content-container clearfix'})[0]
            content_child = content_obj.findChild('div', {'class': 'post'})
            content_gchild = content_child.findChild('div', {'class': re.compile(r'content\-with.*-wrap')})
            # grab the text
            content_text = content_gchild.get_text()
        except:
            print 'could not extract text: ' + page_url
            content_text = None
        # grab the title
        try:
            title_obj = soup.find_all('div', {'class': 'single-post-title'})[0]
            title = title_obj.findChild('h1').get_text()
        except:
            print 'could not extract title: ' + page_url
            title = None
        # grab the date
        try:
            date_obj = soup.find_all('h6', {'class': 'date-line'})[0]
            date = date_obj.get_text()
            # strip out 'published' and timestamp
            date = re.sub(r'Published\:\s?', '', date)
            date = re.sub(r'\sat.*', '', date)
            date = re.sub(r'\W', '_', date)
        except IndexError:
            print 'could not extract the date: ' + page_url
            date = None
        return (title, date, content_text)

def extractArticleText(list_of_tuples):
    """
    Takes in a list of tuples returned by extractLinks
    Returns the text body of each article
    """
    # loop through list of tuples
    for (date, url) in list_of_tuples:
        # read the page
        try:
            page = attemptUrl(url)
        except:
            print 'unable to open: ' + url
            continue
        # if we succeeded in opening the url, then read the page
        soup = soupify(page)
        # search for the content body
        list_of_contents = soup.find_all('div', {'class': re.compile(r'kip\-column\-content')})
        # if we couldn't find anything, continue
        if not list_of_contents:
            print 'could not extract content'
            print 'url: ' + url
            continue
        page_text = ''
        # in case there are multiple pages for a given article
        for content_page in list_of_contents:
            # getting a strange error...with 'call-me-manny-the-arb'
            # will skip
            try:
                for paragraph in content_page.find_all('p'):
                    page_text = page_text + paragraph.get_text() + ' '
            except:
                print 'issue with ' + url
                continue
        print 'saving ' + url
        saveArticle(date, page_text, url)
        print 'save complete'
        print '\n'
        # wait before we start with the next link
        wait_time = round(max(0, random.gauss(0, 0.5)), 2)
        time.sleep(wait_time)
    return

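# saveArticle() is not defined in this file either. Judging from the call in
# extractArticleText (date, body text, source url) and the date slugging done
# in extractArticleContents, it probably writes the body to a text file named
# from the date and the URL slug. A guess at that behaviour (the 'articles'
# directory is illustrative only):
import os
import re


def saveArticle(date, page_text, url, out_dir='articles'):
    # hypothetical helper: persist one article body to disk
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    slug = re.sub(r'\W+', '_', url.rstrip('/').split('/')[-1])
    filename = os.path.join(out_dir, '%s_%s.txt' % (re.sub(r'\W+', '_', date), slug))
    with open(filename, 'w') as f:
        f.write(page_text.encode('utf-8'))
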
def team_parser(season_id=2011, division="1"):
    """Scrape the NCAA team list for a season and store Team and TeamSeason rows."""
    # defaults to division 1, but also supports division 3
    season = Season.objects.get(end_year=season_id)
    url = "http://stats.ncaa.org/team/inst_team_list/%s?division=%s" % (
        season.ncaa_id, division)
    soup = soupify(url)
    team_links = [x.find("a") for x in soup.findAll("td")]
    for team in team_links:
        ncaa_id = int(team["href"].split("=")[1])
        name = SafeUnicode(team.contents[0])
        t, created = Team.objects.get_or_create(ncaa_id=ncaa_id, name=name)
        team_season, created = TeamSeason.objects.get_or_create(
            team=t, season=season, division=int(division))

def schedule_parser(season_id, team_id):
    """Scrape a team's schedule page and run game_parser on every linked box score."""
    season = Season.objects.get(ncaa_id=season_id)
    url = "http://stats.ncaa.org/team/index/%s?org_id=%s" % (season_id, team_id)
    soup = soupify(url)
    game_ids = []
    # box score links live in the small-text cells of the schedule table
    links = soup.findAll("table")[1].findAll(
        lambda tag: tag.name == "a" and tag.findParent("td", attrs={"class": "smtext"})
    )
    for link in links:
        if not link.has_key("onclick"):
            game_ids.append(int(link["href"].split("?")[0].split("/")[3]))
    for game_id in game_ids:
        game_parser(game_id)

def game_parser(game_id, season_id=2011):
    """Scrape a box score page and store the Game row with scores and details."""
    url = "http://stats.ncaa.org/game/box_score/%s" % game_id
    soup = soupify(url)
    season = Season.objects.get(end_year=season_id)
    visit_id, home_id = [
        int(x["href"].split("=")[1]) for x in soup.findAll("table")[0].findAll("a")
    ]
    try:
        visit = TeamSeason.objects.select_related().get(team__ncaa_id=visit_id, season=season)
    except:
        # opponents outside the scraped division have no TeamSeason yet; create a placeholder
        v_team, created = Team.objects.get_or_create(
            ncaa_id=visit_id,
            name=soup.findAll("table")[0].findAll("a")[0].renderContents())
        visit = TeamSeason.objects.create(team=v_team, season=season, division=0)
    home = TeamSeason.objects.select_related().get(team__ncaa_id=home_id, season=season)
    game_details = soup.findAll("table")[2]
    dt = parse(game_details.findAll("td")[1].contents[0])
    loc = game_details.findAll("td")[3].contents[0]
    try:
        attend = int(game_details.findAll("td")[5].contents[0].replace(",", ""))
    except:
        attend = None
    officials = soup.findAll("table")[3].findAll("td")[1].contents[0].strip()
    scores = soup.findAll("table")[0].findAll("td", attrs={"align": "right"})
    visit_team_scores = [int(x.renderContents()) for x in scores[0:len(scores) / 2]]
    home_team_scores = [int(x.renderContents()) for x in scores[len(scores) / 2:len(scores)]]
    # second team listed is considered home team
    home_final = home_team_scores[(len(scores) / 2) - 1]
    visit_final = visit_team_scores[(len(scores) / 2) - 1]
    game, created = Game.objects.get_or_create(
        ncaa_id=game_id,
        home_team=home,
        visiting_team=visit,
        datetime=dt,
        location=SafeUnicode(loc),
        attendance=attend,
        officials=SafeUnicode(officials),
        home_team_score=home_final,
        visiting_team_score=visit_final,
    )

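# Taken together, the NCAA parsers are meant to run top-down: team_parser
# seeds the teams for a season, roster_parser fills in each roster, and
# schedule_parser walks each schedule and hands every box score to
# game_parser. A hedged driver sketch (crawl_season is not part of the
# original code; note that roster_parser takes the season's end year while
# schedule_parser takes its NCAA id):
def crawl_season(season_id=2011, division="1"):
    team_parser(season_id=season_id, division=division)
    for team_season in TeamSeason.objects.select_related().filter(
            season__end_year=season_id):
        roster_parser(season_id, team_season.team.ncaa_id)
        schedule_parser(team_season.season.ncaa_id, team_season.team.ncaa_id)
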