def get_data(self, name, src):
    try:
        _data = src.findAll(name)[0]
    except IndexError:
        _data = BeautifulSoup("No Data Received")
    data = _data.getText()
    return data
def _parse_rss(self, urltype):
    result = namedtuple('Result', ['title', 'description', 'link', 'content',
                                   'image', 'pubDate', 'src', 'newstype', 'urlkey'])
    results = []
    for url in urllist[urltype]:
        try:
            _page_ = urlfetch.fetch(url)
        except urlfetch.DeadlineExceededError:
            logging.error("DeadlineExceededError")
            continue
        page = _page_.content
        xml = BeautifulStoneSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
        posts = xml.findAll("item")[:POST_LIMIT]
        urlkey = urllist[urltype][url]
        source = srcmap[urlkey]
        for post in posts:
            newstype = urltype
            #title = self.get_data("title", post)
            title = post.find("title").getText()
            _desc = post.find("description").getText()
            #desc = post.find("description").getText()
            #desc = self.get_data("description", post)
            link = post.find("link").getText()
            # Image fallback chain: an <img> embedded in the description, then the
            # item's <image> or <ipimage> element, then the feed-level <image><url>.
            try:
                image = BeautifulSoup(_desc,
                                      convertEntities=BeautifulSoup.HTML_ENTITIES).find('img')['src']
            except TypeError:
                try:
                    image = post.find("image").getText()
                except AttributeError:
                    try:
                        image = post.find("ipimage").getText()
                    except AttributeError:
                        try:
                            image = xml.find("image").find('url').getText()
                            logging.error('crappy image')
                        except AttributeError:
                            image = ''
            desc = BeautifulSoup(_desc).find(text=True)
            # Swap the thumbnail path segment for the detail-size image.
            image = image.replace("htufak", "detay")
            if not desc:
                desc = ""
                logging.info("Empty Desc")
            _pubDate_ = post.find("pubDate")
            if not _pubDate_:
                _pubDate_ = post.find("pubdate")
            try:
                pubDate = parser.parse(_pubDate_.getText())
            except ValueError:
                logging.info("Auto pubDate added")
                pubDate = datetime.now()
            content = "no content yet"  # add parsing for content later
            src = source
            urlkey = urllist[urltype][url]
            if link:
                results.append(result(title, desc, link, content, image,
                                      pubDate, src, newstype, urlkey))
            else:
                logging.info("no link")
    return results
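# `_parse_rss` relies on module-level tables that aren't shown in this section.
# A minimal sketch of the shapes implied by the usage above; the feed URL and
# keys here are illustrative placeholders, not values from the source:
#
#     POST_LIMIT = 10
#     urllist = {
#         'news': {'http://example.com/rss.xml': 'example'},  # urltype -> {feed url: urlkey}
#     }
#     srcmap = {'example': 'Example News'}  # urlkey -> human-readable source name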
def get_games_from(self, url, url_property):
    url_info = self._open_url(url)
    all_games = []
    if url_info is not None:
        soup = BeautifulSoup(url_info)
        # Find all of the sections containing game info
        games = soup.findAll("a", "search_result_row")
        appid_re = r"http://store.steampowered.com/%s/([0-9]*).*" % url_property
        for game in games:
            price = 0
            # If the price is in <strike> tags, it's on sale, and we have to
            # pull the current price from that tag instead.
            price_text = game.find('div', {'class': 'col search_price'})
            sale = price_text.find('strike')
            name = game.find("h4").text
            if name:
                formatted_game = name.replace("®", '')  # Strip the (R) symbol
                formatted_game = formatted_game.replace('™', '')  # Strip the (TM) symbol
            else:
                formatted_game = ""
            if sale:
                try:
                    price = float(sale.text[5:])
                except ValueError:
                    price = 0.00
            else:
                try:
                    price = float(price_text.text[5:])
                except ValueError:
                    price = 0.00
            appid = re.search(appid_re, game['href'])
            if appid:
                appid = appid.group(1)
                try:
                    appid = int(appid)
                except ValueError:
                    appid = -1
                store_url = "http://store.steampowered.com/%s/%s" % (url_property, appid)
                #logging.error("appid: %s store: %s" % (appid, store_url))
                image_url = self._image_url(appid, url_property)
                current_game = {'name': formatted_game, 'appid': appid, 'price': price,
                                'store_url': store_url, 'image_url': image_url}
                all_games.append(current_game)
        return all_games
    else:
        return None
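# get_games_from assumes two helpers on the same class that aren't shown in
# this section: _open_url(url), which returns the page body (or None on
# failure), and _image_url(appid, url_property), which builds the capsule
# image URL. A plausible sketch of the former, offered as an assumption only:
#
#     def _open_url(self, url):
#         try:
#             return urllib2.urlopen(url).read()
#         except urllib2.URLError:
#             return None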
def get_groups(self):
    url = "http://steamcommunity.com/profiles/%s/groups/" % self.steam_id
    soup = BeautifulSoup(self._open_url(url))
    groups = soup.findAll('div', 'groupBlockMedium')
    all_groups = []
    for group in groups:
        group = group.find('a')
        if group:
            group_url = group['href']
            if group_url:
                all_groups.append(group_url)
    return all_groups
def scrape(url="http://money.cnn.com/2012/02/20/news/economy/david_walker_third_party/index.htm"):
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    container = soup.find("div", id="storytext")
    content_list = [p.string for p in container.findAll("p") if p.string]
    content = "\n".join(content_list)
    # Also convert any HTML or XML entities
    stoned_content = BeautifulStoneSoup(content,
                                        convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
    return "".join(stoned_content.contents)
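# Minimal usage sketch for scrape(): fetch the default hard-coded CNN article
# and print the beginning of the extracted story text.
#
#     article_text = scrape()
#     print article_text[:300]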
def get_wishlist(self):
    """Retrieves all appids for games on a user's wishlist."""
    url = "http://steamcommunity.com/profiles/%s/wishlist" % self.steam_id
    soup = BeautifulSoup(self._open_url(url))
    wish_games = soup.findAll("div", "wishlistRow")
    all_games = []
    for game in wish_games:
        current_id = game['id']
        if current_id:
            search = re.search(r'([0-9]+)', current_id)
            if search:
                all_games.append(int(search.group(1)))
    return all_games
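# Usage sketch for get_wishlist(), assuming it is a method on a profile
# wrapper that defines self.steam_id and self._open_url (the class name below
# is hypothetical, not from the source):
#
#     profile = SteamProfile("76561197960287930")
#     appids = profile.get_wishlist()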
def get_hltb():
    stats = utils.retrieve_stats()
    total_main_with_hours = 0
    total_main = 0.0
    total_completion_with_hours = 0
    total_with_hours = 0
    total_completion = 0.0
    num_url = urllib2.urlopen("http://74.63.212.37/hltb/num.html").read()
    file_range = range(1, int(num_url))
    for i in file_range:
    #for i in range(1, 2):  # Use for testing
        curr_games = []
        soup = BeautifulSoup(open_morpheus(i))
        search_results = soup.find("div", {"class": "search_results"})
        games = search_results.findAll("li", {"class": "backwhite radius shadow_box"})
        for game in games:
            title = game.find("a", {"class": "textwhite"})
            try:
                url = title['href']
            except KeyError:
                url = None
            title = title.text
            main = None
            completion = None
            combined = None
            tidbits = game.findAll("div", {'class': tidbit_re})
            if len(tidbits) > 1:
                total_with_hours += 1
            main_recorded = False
            # The tidbits alternate label/value, so each label's time is at j + 1.
            for j in range(len(tidbits)):
                if tidbits[j].text == "Main Story":
                    main_recorded = True
                    main = validate_hours(tidbits[j + 1].text)
                    if main is not None:
                        total_main += main
                        total_main_with_hours += 1
                elif tidbits[j].text == "Completionist":
                    completion = validate_hours(tidbits[j + 1].text)
                    if completion is not None:
                        total_completion += completion
                        total_completion_with_hours += 1
                elif tidbits[j].text == "Combined":
                    combined = validate_hours(tidbits[j + 1].text)
            # Fall back to the combined time when no main-story time was listed.
            if main_recorded is False and combined is not None:
                main = combined
            this_game = {'title': title, 'url': url, 'main': main,
                         'completion': completion}
            curr_games.append(this_game)
        update_hltb(curr_games)
    average_main = total_main / total_main_with_hours
    average_completion = total_completion / total_completion_with_hours
    stats.total_with_hours = total_with_hours
    stats.average_main = average_main - 2
    stats.average_completion = average_completion - 2
    stats.hltb_last_updated = datetime.now()
    stats.put()
    return None, None
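# `validate_hours` is called above but not defined in this section. A minimal
# sketch of one plausible implementation, assuming HLTB renders times like
# "12 Hours", "12½ Hours", or "30 Mins", and "--" when no time is recorded
# (the exact formats are an assumption, not confirmed by the source):
def validate_hours(raw):
    raw = raw.strip()
    if not raw or raw.startswith("--"):
        return None  # no recorded time for this category
    parts = raw.split()
    try:
        # Treat the unicode half symbol as .5 before converting to a float.
        value = float(parts[0].replace(u"\u00bd", u".5"))
    except ValueError:
        return None
    # Normalize minute-denominated times to fractional hours.
    if len(parts) > 1 and parts[1].lower().startswith("min"):
        value /= 60.0
    return value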
def get_user(steam_id, stats=None):
    """
    Return codes for get_user():
        1 - New user successfully added
        2 - User update succeeded
        3 - New user was private, not added
        4 - Current user, no need for update, successfully returned
        5 - Update succeeded, but private profile
        6 - Update failed - too soon since last update
        7 - Bad Steam ID
    """
    # If the user inputs a URL that doesn't have the 64-bit id, we have to retrieve
    # it with a call to Steam to see if it's valid. Since we don't always want to do
    # that, we store the "orig_id" (the full url) in memcache, mapped to the actual
    # id if needed. Cuts down on the number of Steam API calls needed per user lookup.
    if stats is None:
        stats = utils.retrieve_stats()
    orig_id = steam_id
    id_retrieved = False
    cached_id = memcache.get(orig_id)
    if cached_id is not None:
        id_retrieved = True
        steam_id = cached_id
    if id_retrieved is False:
        steam_match = re.match(steam_re, steam_id)
        if steam_match:
            steam_id = steam_match.string
        else:
            # Resolve a profile/vanity URL to the 64-bit id by scraping the
            # g_rgProfileData blob from the profile page.
            if re.match(r'https?://steamcommunity.com/.*', steam_id):
                profile_url = steam_id
            else:
                profile_url = "http://steamcommunity.com/id/%s" % steam_id
            try:
                profile = urllib2.urlopen(profile_url)
            except ValueError:
                return None, 7
            soup = BeautifulSoup(profile)
            scripts = soup.findAll('script')
            found = False
            for script in scripts:
                text = script.text.strip()
                if text[:15] == 'g_rgProfileData':
                    json_info = json.loads(text[18:-1])
                    steam_id = json_info['steamid']
                    found = True
            if found is False:
                return None, 7
        memcache.add(orig_id, steam_id)
    user = SteamIds.get_by_id(steam_id)
    counters.pingpong_incr(queries_counter)
    # User already exists, decide what to do
    if user:
        # If this is true, there have been updates to the db. Update the user, if possible.
        if stats.games_last_updated > user.last_updated or stats.hltb_last_updated > user.last_updated:
            info_to_update = SteamUsers(steam_id, api_key)
            # Update what we have on record whether or not the profile is visible;
            # when it's invisible, the user is warned to make the profile public.
            user, rc = _update_user(user, info_to_update, stats)
            return user, rc
        # Current user, no need for update, just return for display.
        else:
            return user, 4
    else:
        user_info = SteamUsers(steam_id, api_key)
        # This is not a Steam ID
        if user_info.good_id is False:
            return None, 7
        # This is not a public profile. Can't view.
        elif user_info.visibility is False:
            return None, 3
        # New user was successfully added. FTW!
        else:
            user = add_user_to_ndb(user_info, stats)
            #increment_steamids()
            counters.pingpong_incr(steamids_counter)
            return user, 1
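# Usage sketch for get_user(): callers dispatch on the return code documented
# in the docstring (the id below is illustrative):
#
#     user, rc = get_user("76561197960287930")
#     if rc == 7:
#         pass  # bad Steam ID: show an error
#     elif rc == 3:
#         pass  # private profile: ask the user to make it public
#     elif user is not None:
#         pass  # render the profile page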