def scrape(
    url="http://money.cnn.com/2012/02/20/news/economy/david_walker_third_party/index.htm"
):
    """Download a story page and return its paragraph text as one string.

    The page at ``url`` is fetched with ``requests`` and parsed with
    BeautifulSoup. Every ``<p>`` inside the ``<div id="storytext">`` element
    whose ``.string`` is truthy contributes one line of text. The joined
    text is then run through ``BeautifulStoneSoup`` so that HTML/XML
    entities are converted.

    :param url: Address of the article to scrape.
    :returns: The concatenated contents of the entity-converted text.
    """
    page = requests.get(url)
    document = BeautifulSoup(page.content)
    story = document.find("div", id="storytext")

    # Collect the text of each paragraph, skipping those whose ``.string``
    # is falsy.
    paragraphs = []
    for paragraph in story.findAll("p"):
        text = paragraph.string
        if text:
            paragraphs.append(text)
    joined = "\n".join(paragraphs)

    # Also convert any HTML or XML entities
    decoded = BeautifulStoneSoup(
        joined,
        convertEntities=BeautifulStoneSoup.ALL_ENTITIES,
    )
    return "".join(decoded.contents)
def get_hltb():
    """Re-scrape the mirrored HLTB pages and refresh the aggregate stats.

    Reads the page count from ``num.html``, parses each page returned by
    ``open_morpheus``, and for every game listed collects its title, URL and
    the "Main Story" / "Completionist" hours (validated by
    ``validate_hours``). Each page's games are handed to ``update_hltb``.
    Finally the stats object from ``utils.retrieve_stats()`` is updated with
    the number of games that had hour data, the adjusted averages, and the
    current timestamp, and is saved with ``stats.put()``.

    An average is only overwritten when at least one valid value was
    collected for it; otherwise the value already on ``stats`` is left
    untouched instead of failing with ``ZeroDivisionError``.

    :returns: Always the tuple ``(None, None)``.
    """
    stats = utils.retrieve_stats()

    total_with_hours = 0
    total_main = 0.0
    total_main_with_hours = 0
    total_completion = 0.0
    total_completion_with_hours = 0

    # Close the HTTP response explicitly rather than leaving it to the GC.
    response = urllib2.urlopen("http://74.63.212.37/hltb/num.html")
    try:
        num_url = response.read()
    finally:
        response.close()

    # NOTE(review): range() excludes int(num_url) itself -- confirm whether
    # num.html reports the last page number or one past it.
    for page_number in range(1, int(num_url)):
        curr_games = []
        soup = BeautifulSoup(open_morpheus(page_number))
        search_results = soup.find("div", {"class": "search_results"})
        games = search_results.findAll(
            "li", {"class": "backwhite radius shadow_box"})

        for game in games:
            title = game.find("a", {"class": "textwhite"})
            try:
                url = title['href']
            except KeyError:
                url = None
            title = title.text

            main = None
            completion = None
            combined = None

            tidbits = game.findAll("div", {'class': tidbit_re})
            if len(tidbits) > 1:
                total_with_hours += 1
                main_recorded = False

                # A label tidbit is immediately followed by its value tidbit.
                # Walking adjacent pairs means a label in the final position
                # (with no value after it) is ignored instead of raising
                # IndexError.
                for label_tag, value_tag in zip(tidbits, tidbits[1:]):
                    label = label_tag.text
                    if label == "Main Story":
                        main_recorded = True
                        main = validate_hours(value_tag.text)
                        if main is not None:
                            total_main += main
                            total_main_with_hours += 1
                    elif label == "Completionist":
                        completion = validate_hours(value_tag.text)
                        if completion is not None:
                            total_completion += completion
                            total_completion_with_hours += 1
                    elif label == "Combined":
                        combined = validate_hours(value_tag.text)

                # With no "Main Story" entry, fall back to the "Combined"
                # value for this game. The fallback is not added to the
                # main-story totals.
                if not main_recorded and combined is not None:
                    main = combined

            curr_games.append({
                'title': title,
                'url': url,
                'main': main,
                'completion': completion,
            })

        update_hltb(curr_games)

    stats.total_with_hours = total_with_hours
    # NOTE(review): the fixed offset of 2 subtracted from each average is
    # carried over unchanged; its rationale is not visible in this block.
    if total_main_with_hours:
        stats.average_main = total_main / total_main_with_hours - 2
    if total_completion_with_hours:
        stats.average_completion = (
            total_completion / total_completion_with_hours - 2)
    stats.hltb_last_updated = datetime.now()
    stats.put()
    return None, None