Ejemplo n.º 1
0
def scrape(
    url="http://money.cnn.com/2012/02/20/news/economy/david_walker_third_party/index.htm"
):
    """Download an article page and return the text of its story paragraphs.

    The page is fetched with ``requests``, the ``<div id="storytext">``
    element is located, and the text of every ``<p>`` inside it that has a
    single string child is joined with newlines. The joined text is then run
    through ``BeautifulStoneSoup`` so HTML/XML entities are converted.

    Args:
        url: Address of the page to fetch. Defaults to a CNN Money article.

    Returns:
        The concatenated paragraph text, or an empty string when the page
        has no ``storytext`` container or no paragraph with plain text.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content)

    container = soup.find("div", id="storytext")
    if container is None:
        # find() yields None when the element is absent; return the same
        # empty result the function gives for a container with no text
        # instead of failing on None.findAll.
        return ""

    # p.string is None for paragraphs with nested markup; those are skipped.
    content_list = [p.string for p in container.findAll("p") if p.string]
    content = "\n".join(content_list)

    # Also convert any HTML or XML entities
    stoned_content = BeautifulStoneSoup(
        content, convertEntities=BeautifulStoneSoup.ALL_ENTITIES)

    return "".join(stoned_content.contents)
Ejemplo n.º 2
0
def get_hltb():
    """Scrape the HLTB listing pages and refresh the aggregate stats record.

    Reads the page count from ``num.html``, parses each listing page returned
    by ``open_morpheus``, extracts title, URL, main-story hours and
    completionist hours for every game, and hands each page's games to
    ``update_hltb``. Afterwards the running totals are written to the object
    returned by ``utils.retrieve_stats()`` and saved with ``put()``.

    Returns:
        The tuple ``(None, None)``.
    """
    stats = utils.retrieve_stats()

    total_main_with_hours = 0
    total_main = 0.0
    total_completion_with_hours = 0
    total_with_hours = 0
    total_completion = 0.0

    # Close the response explicitly so the connection is released even if
    # read() fails.
    response = urllib2.urlopen("http://74.63.212.37/hltb/num.html")
    try:
        num_url = response.read()
    finally:
        response.close()

    # NOTE(review): the upper bound is exclusive, so the page numbered
    # int(num_url) is never fetched -- confirm num.html holds "last page + 1".
    for page in range(1, int(num_url)):
        curr_games = []
        soup = BeautifulSoup(open_morpheus(page))
        search_results = soup.find("div", {"class": "search_results"})
        games = search_results.findAll("li", {"class": "backwhite radius shadow_box"})

        for game in games:
            title = game.find("a", {"class": "textwhite"})
            if title is None:
                # No title anchor: there is neither a name nor a URL to
                # record, so skip the entry instead of failing on None.
                continue

            url = title.get('href')  # None when the anchor has no href
            title = title.text
            main = None
            completion = None
            combined = None
            tidbits = game.findAll("div", {'class': tidbit_re})

            if len(tidbits) > 1:
                total_with_hours += 1
                main_recorded = False
                # Walk adjacent (label, value) pairs. Pairing guarantees a
                # value element always exists, so a label in the last
                # position can no longer cause an IndexError.
                for label, value in zip(tidbits, tidbits[1:]):
                    if label.text == "Main Story":
                        main_recorded = True
                        main = validate_hours(value.text)
                        if main is not None:
                            total_main += main
                            total_main_with_hours += 1
                    elif label.text == "Completionist":
                        completion = validate_hours(value.text)
                        if completion is not None:
                            total_completion += completion
                            total_completion_with_hours += 1
                    elif label.text == "Combined":
                        combined = validate_hours(value.text)
                    # Until a "Main Story" entry has been seen, fall back to
                    # the combined time as the main time. This fallback does
                    # not feed the running totals.
                    if not main_recorded and combined is not None:
                        main = combined

            this_game = {'title': title, 'url': url, 'main': main, 'completion': completion}
            curr_games.append(this_game)
        update_hltb(curr_games)

    stats.total_with_hours = total_with_hours
    # Only overwrite an average when at least one sample was collected;
    # otherwise the division would raise ZeroDivisionError and nothing would
    # be saved.
    # NOTE(review): the "- 2" offset is carried over unchanged; its rationale
    # is not visible in this function.
    if total_main_with_hours:
        stats.average_main = total_main / total_main_with_hours - 2
    if total_completion_with_hours:
        stats.average_completion = total_completion / total_completion_with_hours - 2
    stats.hltb_last_updated = datetime.now()
    stats.put()
    return None, None