def get_data(self, name, src):
    try:
        _data = src.findAll(name)[0]
    except IndexError:
        _data = BeautifulSoup("No Data Received")
    data = _data.getText()
    return data
def _parse_rss(self, urltype):
    result = namedtuple('Result', ['title', 'description', 'link', 'content',
                                   'image', 'pubDate', 'src', 'newstype', 'urlkey'])
    results = []
    for url in urllist[urltype]:
        try:
            _page_ = urlfetch.fetch(url)
        except urlfetch.DeadlineExceededError:
            logging.error("DeadlineExceededError")
            continue
        page = _page_.content
        xml = BeautifulStoneSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
        posts = xml.findAll("item")[:POST_LIMIT]
        urlkey = urllist[urltype][url]
        source = srcmap[urlkey]
        for post in posts:
            newstype = urltype
            #title = self.get_data("title", post)
            title = post.find("title").getText()
            _desc = post.find("description").getText()
            #desc = post.find("description").getText()
            #desc = self.get_data("description", post)
            link = post.find("link").getText()
            # Image fallback chain: an <img> embedded in the description, then the
            # item's <image> or <ipimage> element, then the feed-level <image><url>.
            try:
                image = BeautifulSoup(_desc,
                                      convertEntities=BeautifulSoup.HTML_ENTITIES).find('img')['src']
            except TypeError:
                try:
                    image = post.find("image").getText()
                except AttributeError:
                    try:
                        image = post.find("ipimage").getText()
                    except AttributeError:
                        try:
                            image = xml.find("image").find('url').getText()
                            logging.error('crappy image')
                        except AttributeError:
                            image = ''
            desc = BeautifulSoup(_desc).find(text=True)
            # Swap the thumbnail path segment for the detail-size image.
            image = image.replace("htufak", "detay")
            if not desc:
                desc = ""
                logging.info("Empty Desc")
            _pubDate_ = post.find("pubDate")
            if not _pubDate_:
                _pubDate_ = post.find("pubdate")
            try:
                pubDate = parser.parse(_pubDate_.getText())
            except ValueError:
                logging.info("Auto pubDate added")
                pubDate = datetime.now()
            content = "no content yet"  # add parsing for content later
            src = source
            urlkey = urllist[urltype][url]
            if link:
                results.append(result(title, desc, link, content, image,
                                      pubDate, src, newstype, urlkey))
            else:
                logging.info("no link")
    return results
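# `_parse_rss` relies on module-level tables that aren't shown in this section.
# A minimal sketch of the shapes implied by the usage above; the feed URL and
# keys here are illustrative placeholders, not values from the source:
#
#     POST_LIMIT = 10
#     urllist = {
#         'news': {'http://example.com/rss.xml': 'example'},  # urltype -> {feed url: urlkey}
#     }
#     srcmap = {'example': 'Example News'}  # urlkey -> human-readable source name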
def get_games_from(self, url, url_property):
    url_info = self._open_url(url)
    all_games = []
    if url_info is not None:
        soup = BeautifulSoup(url_info)
        # Find all of the sections containing game info
        games = soup.findAll("a", "search_result_row")
        appid_re = r"http://store.steampowered.com/%s/([0-9]*).*" % url_property
        for game in games:
            price = 0
            # If the price is in <strike> tags, it's on sale, and we have to
            # pull the current price from that tag instead.
            price_text = game.find('div', {'class': 'col search_price'})
            sale = price_text.find('strike')
            name = game.find("h4").text
            if name:
                formatted_game = name.replace("®", '')  # Strip the (R) symbol
                formatted_game = formatted_game.replace('™', '')  # Strip the (TM) symbol
            else:
                formatted_game = ""
            if sale:
                try:
                    price = float(sale.text[5:])
                except ValueError:
                    price = 0.00
            else:
                try:
                    price = float(price_text.text[5:])
                except ValueError:
                    price = 0.00
            appid = re.search(appid_re, game['href'])
            if appid:
                appid = appid.group(1)
                try:
                    appid = int(appid)
                except ValueError:
                    appid = -1
                store_url = "http://store.steampowered.com/%s/%s" % (url_property, appid)
                #logging.error("appid: %s store: %s" % (appid, store_url))
                image_url = self._image_url(appid, url_property)
                current_game = {'name': formatted_game, 'appid': appid, 'price': price,
                                'store_url': store_url, 'image_url': image_url}
                all_games.append(current_game)
        return all_games
    else:
        return None
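# get_games_from assumes two helpers on the same class that aren't shown in
# this section: _open_url(url), which returns the page body (or None on
# failure), and _image_url(appid, url_property), which builds the capsule
# image URL. A plausible sketch of the former, offered as an assumption only:
#
#     def _open_url(self, url):
#         try:
#             return urllib2.urlopen(url).read()
#         except urllib2.URLError:
#             return None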
def get_groups(self):
    url = "http://steamcommunity.com/profiles/%s/groups/" % self.steam_id
    soup = BeautifulSoup(self._open_url(url))
    groups = soup.findAll('div', 'groupBlockMedium')
    all_groups = []
    for group in groups:
        group = group.find('a')
        if group:
            group_url = group['href']
            if group_url:
                all_groups.append(group_url)
    return all_groups
def scrape(url="http://money.cnn.com/2012/02/20/news/economy/david_walker_third_party/index.htm"):
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    container = soup.find("div", id="storytext")
    content_list = [p.string for p in container.findAll("p") if p.string]
    content = "\n".join(content_list)
    # Also convert any HTML or XML entities
    stoned_content = BeautifulStoneSoup(content,
                                        convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
    return "".join(stoned_content.contents)
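# Minimal usage sketch for scrape(): fetch the default hard-coded CNN article
# and print the beginning of the extracted story text.
#
#     article_text = scrape()
#     print article_text[:300]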
def get_wishlist(self):
    """Retrieves all appids for games on a user's wishlist."""
    url = "http://steamcommunity.com/profiles/%s/wishlist" % self.steam_id
    soup = BeautifulSoup(self._open_url(url))
    wish_games = soup.findAll("div", "wishlistRow")
    all_games = []
    for game in wish_games:
        current_id = game['id']
        if current_id:
            search = re.search(r'([0-9]+)', current_id)
            if search:
                all_games.append(int(search.group(1)))
    return all_games
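# Usage sketch for get_wishlist(), assuming it is a method on a profile
# wrapper that defines self.steam_id and self._open_url (the class name below
# is hypothetical, not from the source):
#
#     profile = SteamProfile("76561197960287930")
#     appids = profile.get_wishlist()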
def get_hltb():
    stats = utils.retrieve_stats()
    total_main_with_hours = 0
    total_main = 0.0
    total_completion_with_hours = 0
    total_with_hours = 0
    total_completion = 0.0
    num_url = urllib2.urlopen("http://74.63.212.37/hltb/num.html").read()
    file_range = range(1, int(num_url))
    for i in file_range:
    #for i in range(1, 2):  # Use for testing
        curr_games = []
        soup = BeautifulSoup(open_morpheus(i))
        search_results = soup.find("div", {"class": "search_results"})
        games = search_results.findAll("li", {"class": "backwhite radius shadow_box"})
        for game in games:
            title = game.find("a", {"class": "textwhite"})
            try:
                url = title['href']
            except KeyError:
                url = None
            title = title.text
            main = None
            completion = None
            combined = None
            tidbits = game.findAll("div", {'class': tidbit_re})
            if len(tidbits) > 1:
                total_with_hours += 1
            main_recorded = False
            # The tidbits alternate label/value, so each label's time is at j + 1.
            for j in range(len(tidbits)):
                if tidbits[j].text == "Main Story":
                    main_recorded = True
                    main = validate_hours(tidbits[j + 1].text)
                    if main is not None:
                        total_main += main
                        total_main_with_hours += 1
                elif tidbits[j].text == "Completionist":
                    completion = validate_hours(tidbits[j + 1].text)
                    if completion is not None:
                        total_completion += completion
                        total_completion_with_hours += 1
                elif tidbits[j].text == "Combined":
                    combined = validate_hours(tidbits[j + 1].text)
            # Fall back to the combined time when no main-story time was listed.
            if main_recorded is False and combined is not None:
                main = combined
            this_game = {'title': title, 'url': url, 'main': main,
                         'completion': completion}
            curr_games.append(this_game)
        update_hltb(curr_games)
    average_main = total_main / total_main_with_hours
    average_completion = total_completion / total_completion_with_hours
    stats.total_with_hours = total_with_hours
    stats.average_main = average_main - 2
    stats.average_completion = average_completion - 2
    stats.hltb_last_updated = datetime.now()
    stats.put()
    return None, None
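# `validate_hours` is called above but not defined in this section. A minimal
# sketch of one plausible implementation, assuming HLTB renders times like
# "12 Hours", "12½ Hours", or "30 Mins", and "--" when no time is recorded
# (the exact formats are an assumption, not confirmed by the source):
def validate_hours(raw):
    raw = raw.strip()
    if not raw or raw.startswith("--"):
        return None  # no recorded time for this category
    parts = raw.split()
    try:
        # Treat the unicode half symbol as .5 before converting to a float.
        value = float(parts[0].replace(u"\u00bd", u".5"))
    except ValueError:
        return None
    # Normalize minute-denominated times to fractional hours.
    if len(parts) > 1 and parts[1].lower().startswith("min"):
        value /= 60.0
    return value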
def get_user(steam_id, stats=None):
    """
    Return codes for get_user():
        1 - New user successfully added
        2 - User update succeeded
        3 - New user was private, not added
        4 - Current user, no need for update, successfully returned
        5 - Update succeeded, but private profile
        6 - Update failed - too soon since last update
        7 - Bad Steam ID
    """
    # If the user inputs a URL that doesn't have the 64-bit id, we have to retrieve
    # it with a call to Steam to see if it's valid. Since we don't always want to do
    # that, we store the "orig_id" (the full url) in memcache, mapped to the actual
    # id if needed. Cuts down on the number of Steam API calls needed per user lookup.
    if stats is None:
        stats = utils.retrieve_stats()
    orig_id = steam_id
    id_retrieved = False
    cached_id = memcache.get(orig_id)
    if cached_id is not None:
        id_retrieved = True
        steam_id = cached_id
    if id_retrieved is False:
        steam_match = re.match(steam_re, steam_id)
        if steam_match:
            steam_id = steam_match.string
        else:
            # Resolve a profile/vanity URL to the 64-bit id by scraping the
            # g_rgProfileData blob from the profile page.
            if re.match(r'https?://steamcommunity.com/.*', steam_id):
                profile_url = steam_id
            else:
                profile_url = "http://steamcommunity.com/id/%s" % steam_id
            try:
                profile = urllib2.urlopen(profile_url)
            except ValueError:
                return None, 7
            soup = BeautifulSoup(profile)
            scripts = soup.findAll('script')
            found = False
            for script in scripts:
                text = script.text.strip()
                if text[:15] == 'g_rgProfileData':
                    json_info = json.loads(text[18:-1])
                    steam_id = json_info['steamid']
                    found = True
            if found is False:
                return None, 7
        memcache.add(orig_id, steam_id)
    user = SteamIds.get_by_id(steam_id)
    counters.pingpong_incr(queries_counter)
    # User already exists, decide what to do
    if user:
        # If this is true, there have been updates to the db. Update the user, if possible.
        if stats.games_last_updated > user.last_updated or stats.hltb_last_updated > user.last_updated:
            info_to_update = SteamUsers(steam_id, api_key)
            # Update what we have on record whether or not the profile is visible;
            # when it's invisible, the user is warned to make the profile public.
            user, rc = _update_user(user, info_to_update, stats)
            return user, rc
        # Current user, no need for update, just return for display.
        else:
            return user, 4
    else:
        user_info = SteamUsers(steam_id, api_key)
        # This is not a Steam ID
        if user_info.good_id is False:
            return None, 7
        # This is not a public profile. Can't view.
        elif user_info.visibility is False:
            return None, 3
        # New user was successfully added. FTW!
        else:
            user = add_user_to_ndb(user_info, stats)
            #increment_steamids()
            counters.pingpong_incr(steamids_counter)
            return user, 1
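# Usage sketch for get_user(): callers dispatch on the return code documented
# in the docstring (the id below is illustrative):
#
#     user, rc = get_user("76561197960287930")
#     if rc == 7:
#         pass  # bad Steam ID: show an error
#     elif rc == 3:
#         pass  # private profile: ask the user to make it public
#     elif user is not None:
#         pass  # render the profile page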