Example #1
import re
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 API (find, renderContents)


def get_onepage_poclist(page):
    # getHtml is a helper from the surrounding module; it returns the page
    # body as a string, or '' on failure
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''

    bt = BeautifulSoup(info)
    end = bt.find(
        'a', {
            'style':
            "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"
        })
    # the pager highlights the current page number; if it reads '1' after we
    # requested a later page, the listing has run out
    if end is None or ('1' == end.renderContents() and page != 1):
        return ''

    ret = bt.find('div', {'class': 'mainlist'})
    if ret is None:
        return ''
    ret = ret.renderContents()
    if ret == '':
        return ''

    retlist = []
    # grab every opening <a href=...> tag, then keep only the PoC links
    rets = re.findall('<a href=.*?>', ret)
    for one in rets:
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            one = one.strip()
            retlist.append(one)

    return retlist
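A minimal driver for the function above might walk pages until the sentinel comes back; this is an illustrative sketch, not part of the original module:

def get_all_poc_ids():
    # get_onepage_poclist() returns '' past the last page (and may return an
    # empty list); stop on either
    poc_ids = []
    page = 1
    while True:
        batch = get_onepage_poclist(page)
        if not batch:
            break
        poc_ids.extend(batch)
        page += 1
    return poc_ids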
Example #2
    def __parse_genres(self, data):
        """
        Parses the list of genres.
        """

        self.call_service(msgs.UI_ACT_SHOW_INFO,
                          "SHOUTcast made it illegal for free software to access\n" \
                          "their full directory.\n" \
                          "You will only get the Top 10 stations listed per genre.")

        genres = []
        soup = BeautifulSoup(data)
        radiopicker = soup.find("div", {"id": "radiopicker"})
        #print radiopicker
        if (radiopicker):
            for genre_tag in radiopicker.findAll("li", {"class": "prigen"}):
                #print genre_tag
                name = genre_tag.a.contents[0]
                name = name.replace("&amp;", "&")
                genres.append(name)
            #end for
        #end if

        if (not genres):
            self.__current_folder.message = "genre list not available"
            logging.error("SHOUTcast genre listing download failed:\n%s",
                          logging.stacktrace())

        genres.sort()
        return genres
Example #3
def get_onepage_poclist(page):
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''

    bt = BeautifulSoup(info)
    end = bt.find('a', {'style' : "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"})
    if end is None or ('1' == end.renderContents() and page != 1):
        return ''

    ret = bt.find('div', {'class' : 'mainlist'})
    if ret is None:
        return ''
    ret = ret.renderContents()
    if ret == '':
        return ''

    retlist = []
    rets = re.findall('<a href=.*?>', ret)
    for one in rets:
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            one = one.strip()
            retlist.append(one)
            
    return retlist
Example #4
    def _get_video_links(self, html_data):
        # parse with HTML entities converted so the extracted hrefs are clean
        soup = BeautifulSoup(''.join(html_data),
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        link_tds = soup.findAll('td', width='420')
        link_a = []
        for td in link_tds:
            link_a.append(td.find('a')['href'])
        return link_a
Example #5
    def __parse_genres(self, data):
        """
        Parses the list of genres.
        """

        genres = []
        soup = BeautifulSoup(data)
        tagcloud = soup.find("ul", {"class": "tag-cloud"})
        #print tagcloud

        if (tagcloud):
            for genre_tag in tagcloud.findAll("a", {"class": "tag"}):
                #print genre_tag
                name = genre_tag["title"]
                href = genre_tag["href"]
                genres.append((name, href))
            #end for
        #end if

        if (not genres):
            self.__current_folder.message = "genre list not available"
            logging.error("icecast genre listing download failed:\n%s",
                          logging.stacktrace())

        genres.sort(lambda a, b: cmp(a[0], b[0]))
        return genres
Example #6
    def _get_video_links(self, html_data):
        soup = BeautifulSoup(''.join(html_data),
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        div = soup.find('div', id='dual_list')
        link_divs = div.findAll('div', {'class': re.compile('^dmpi_video_item')})
        link_a = []
        for div in link_divs:
            link_a.append(div.find('a', {'class': re.compile('^dmco_simplelink video_title')})['href'])
        return link_a
Example #7
	def strainSoup(self, xml):
		soup = BeautifulSoup(xml)
		subtitle = soup.find('subtitle', attrs={'link': None})
		if subtitle:
			_id = int(subtitle['id'])
			_iv = subtitle.find('iv').contents[0]
			_data = subtitle.data.string
			return _id, _iv, _data
		else:
			print "CRUNCHYROLL: --> Couldn't parse XML file."
Example #8
    def __parse_stations(self, data, genre):
        """
        Parses the list of stations.
        """

        stations = []
        soup = BeautifulSoup(data)
        resulttable = soup.find("div", {"id": "resulttable"})
        if (resulttable):
            for entry in resulttable.findAll("div", {"class": "dirlist"}):
                #print entry
                station = File(self)
                a_tag = entry.find("a", {"class": "playbutton playimage"})
                playing_tag = entry.find("div", {"class": "playingtext"})
                bitrate_tag = entry.find("div", {"class": "dirbitrate"})
                type_tag = entry.find("div", {"class": "dirtype"})

                if (not a_tag or not playing_tag or not bitrate_tag
                        or not type_tag):
                    continue

                station.resource = a_tag["href"]
                station.name = a_tag["title"]
                now_playing = playing_tag["title"]
                bitrate = bitrate_tag.contents[0].strip()

                typename = type_tag.contents[0].strip()
                if (typename == "MP3"):
                    station.mimetype = "audio/mpeg"
                elif (typename == "AAC+"):
                    station.mimetype = "audio/mp4"
                else:
                    station.mimetype = "audio/x-unknown"

                station.path = File.pack_path("/" + urlquote.quote(genre, ""),
                                              station.name, bitrate,
                                              station.mimetype,
                                              station.resource, genre)
                station.info = "Bitrate: %s kb\n" \
                               "Now playing: %s" % (bitrate, now_playing)
                station.icon = theme.shoutcast_station.get_path()
                stations.append(station)
            #end for
        #end if

        if (not stations):
            self.__current_folder.message = "station list not available"
            logging.error("SHOUTcast station listing download failed\n%s",
                          logging.stacktrace())

        stations.sort()
        return stations
Example #9
def getPoc(poc):
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    if '<img src="/static/img/test.jpg"' in info:
        return ''

    bt = BeautifulSoup(info)
    ret = bt.find('pre', {'class' : "brush: python;"})
    if ret is None:
        return ''
    ret = ret.renderContents()
    if ret:
        return strip_tags(ret)
    else:
        return ''
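The two beebeeto helpers above pair naturally: the page lister yields PoC hrefs and getPoc fetches each script. A hedged sketch of such a loop (the slug extraction and save path are illustrative assumptions):

def download_all_pocs(max_pages=50):
    for page in range(1, max_pages + 1):
        hrefs = get_onepage_poclist(page)
        if not hrefs:
            break
        for href in hrefs:
            # hrefs look like /pdb/poc-YYYY-NNNN/; getPoc() wants the slug
            slug = href.strip('/').split('/')[-1]
            code = getPoc(slug)
            if code:
                with open(slug + '.py', 'w') as fh:
                    fh.write(code)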
Example #10
    def get_recent_updates(self):
        WIKIRAGE_URL = "http://www.wikirage.com/rss/1/"
        from google.appengine.api import urlfetch
        fetch_page = urlfetch.fetch(WIKIRAGE_URL, follow_redirects=False)
        from utils.BeautifulSoup import BeautifulSoup
        soup = BeautifulSoup(fetch_page.content)
        updates = []
        # each RSS <guid> holds an article URL; the next-to-last path
        # segment is the article slug
        wiki_topics = [guid.contents[0].split('/')[-2] for guid in soup.findAll('guid')]
        import urllib
        for topic in wiki_topics:
            topic = urllib.unquote(topic)
            topic_name = topic.replace('_', ' ')
            updates.append({'topic_path': topic, 'topic_name': topic_name})
        return updates
Example #11
def getPoc(poc):
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    if '<img src="/static/img/test.jpg"' in info:
        return ''

    bt = BeautifulSoup(info)
    ret = bt.find('pre', {'class': "brush: python;"})
    if ret is None:
        return ''
    ret = ret.renderContents()
    if ret:
        return strip_tags(ret)
    else:
        return ''
Example #12
    def _get_video_details(self, html_data):
        soup = BeautifulSoup(''.join(html_data),
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        script = soup.find('script', text=re.compile('flashvars'))

        title = re.compile('flashvars.title = "(.+?)";').findall(script.string)
        description = re.compile('flashvars.description = "(.+?)";').findall(script.string)
        tags = re.compile('flashvars.tags = "(.+?)";').findall(script.string)
        category = re.compile('flashvars.category = "(.+?)";').findall(script.string)

        video = MegaVideoVideo()
        # flashvars values are urlencoded, with '+' standing in for spaces
        video.title = strip_accents(urllib.unquote(title[0].replace('+', ' ')))
        video.description = strip_accents(urllib.unquote(description[0].replace('+', ' ')))
        video.category = strip_accents(urllib.unquote(category[0].replace('+', ' ')))
        video.tags = strip_accents(urllib.unquote(tags[0].replace('+', ' ')))

        return video
Example #13
def _GetPostingPage(page_num):
  url = urllib.urlopen(settings.SOURCE_ALL_URL + str(page_num))
  data = BeautifulSoup(url.read())
  postings = data.findAll('li', {'class': re.compile('hlisting')})

  for posting in postings:
    posting_object = models.Posting()
    try:
      # 'posting' is already a parsed Tag; no need to re-run BeautifulSoup on it
      descr_div = posting.findAll('div', {'class': re.compile('description')})[0]
    except (TypeError, IndexError):
      continue
    descr_div = descr_div.findAll('h3')[0]
    description = descr_div.findAll('a')[0]
    # the link target lives on the anchor itself
    url = description.get('href')
    posting_object.content = posting
    logging.critical(description)
    logging.critical(url)
Example #14
  def get_soup(self, page):
	# in case we need to meet the 100k limit, truncate page.
	soup_url = SEMANTICPROXY_URL + str(page.url)
	# timeout for fetch_page (and all fetch pages)
	try:
		logging.debug('Fetching calais response')
		fetch_page = urlfetch.fetch(soup_url)  # perform semantic analysis
	except:
		logging.debug('Unable to fetch calais response')
		return False

	soup = BeautifulSoup(fetch_page.content)  # whole page
	try:
		# look for an <exception> element in the response
		exception = soup.findAll('exception')[0].contents[0]
		print exception
		return False
	except:
		return soup
Example #15
    def _get_video_info(self, video_url):
        '''
        Return direct URL to video.
        '''
        # get the page
        data = urllib.urlopen(video_url)
        soup = BeautifulSoup(''.join(data.read()),
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        # find the location of the embed code
        div = soup.find('noscript')
        if div is not None:
            rex = re.compile(r'mediaURL=(.*?)&', re.M)
            # the flashvars blob sits in an attribute of the embedded player tag
            flashvars = div.contents[1].attrs[9][1].encode('utf-8')
            self._logger.debug('Metacafe flashvars:%s', flashvars)
            match = rex.search(flashvars)
            if match is not None:
                return urllib.unquote(match.group(1))
            else:
                return None
        else:
            return None
Example #16
	def convertToASS(self, script):
		soup = BeautifulSoup(script, convertEntities=BeautifulSoup.HTML_ENTITIES)
		header = soup.find('subtitle_script')
		header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\n\n";
		#styles = "[V4 Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n";
                styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n";
		events = "[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n";
		stylelist = soup.findAll('style')
		eventlist = soup.findAll('event')
		
		for style in stylelist:
			#styles += "Style: " + style['name'] + ", " + style['font_name'] + ", " + style['font_size'] + ", " + style['primary_colour'] + ", " + style['secondary_colour'] + ", " + style['outline_colour'] + ", " + style['back_colour'] + ", " + style['bold'] + ", " + style['italic'] + ", " + style['underline'] + ", " + style['strikeout'] + ", " + style['scale_x'] + ", " + style['scale_y'] + ", " + style['spacing'] + ", " + style['angle'] + ", " + style['border_style'] + ", " + style['outline'] + ", " + style['shadow'] + ", " + style['alignment'] + ", " + style['margin_l'] + ", " + style['margin_r'] + ", " + style['margin_v'] + ", " + style['encoding'] + "\n"
			styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"
		
		for event in eventlist:
			events += "Dialogue: 0,"+event['start']+","+event['end']+","+event['style']+","+event['name']+","+event['margin_l']+","+event['margin_r']+","+event['margin_v']+","+event['effect']+","+event['text']+"\n"
	
		formattedSubs = header+styles+events
		return formattedSubs
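A quick, hypothetical way to drive the converter above (the instance name, input path, and output path are illustrative assumptions, not from the original code):

# 'converter' stands for an instance of the class that owns convertToASS
xml_script = open('subtitle_script.xml').read()
ass_text = converter.convertToASS(xml_script)
with open('subtitle.ass', 'w') as fh:
    fh.write(ass_text.encode('utf-8'))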
Example #17
    def _get_video_info(self, video_url):
        # the player widget is marked class="dm_widget_videoplayer"
        opener = CustomUrlOpener()
        page = opener.open(video_url)
        response = page.read()
        page.close()

        soup = BeautifulSoup(''.join(response),
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

        div = soup.find('div', {'class': 'dm_widget_videoplayer'})
        script = div.find('script')
        if script is not None:
            urls = re.compile('addVariable\("video", "(.*?)"\);').findall(script.string)
            if urls is not None and len(urls) > 0:
                return self._split_urls(urls[0])
            else:
                return None
        else:
            self._logger.error("We couldn't get the dailymotion url of this video: %s", video_url)
            return None
Example #18
    def _get_video_details(self, html_data):
        soup = BeautifulSoup(''.join(html_data),
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        script = soup.find('script', text=re.compile('flashvars'))  # unused below

        t = soup.find('h1', {'class': 'dmco_title'})
        title = t.string if t is not None else ''

        d = soup.find('div', id='video_description')
        description = d.string if d is not None else None

        c = soup.find('a', {'class': re.compile('fromchannel_link')})
        category = c.string if c is not None else None

        tags_el = soup.find('div', {'class': re.compile('tags_cont')}).findAll('a')
        tags_list = []
        for a in tags_el:
            tags_list.append(a.string)
        tags = ','.join(tags_list)

        video = DailyMotionVideo()
        video.title = strip_accents(title)
        video.description = strip_accents(description) if description is not None else None
        video.category = strip_accents(category)
        video.tags = strip_accents(tags)

        return video
Example #19
    def parse_summary(self, summary, link):
        
        #summary = escape.utf8(summary)
        soup = BeautifulSoup(summary)
        
        for script in list(soup.findAll('script')):
            script.extract()
            
        for o in soup.findAll(onload=True):
            del o['onload']
            
        for script in list(soup.findAll('noscript')):
            script.extract()
            
        for attr in self.remove_attributes:
            for x in soup.findAll(attrs={attr:True}):
                del x[attr]
                
        for tag in self.remove_tags:
            for x in soup.findAll(tag['name']):
                x.extract()
                
        for base in list(soup.findAll(['base', 'iframe'])):
            base.extract()
            
        #for p in list(soup.findAll(['p', 'div'])):
        #    p['style'] = 'text-indent:2em'
        
        img_count = 1
        for img in list(soup.findAll('img')):
            
            if self.noimage or img_count >= self.max_images:
                img.extract()
            else:
                image_url = absolute_path(img['src'], link)
                image = self.down_image(image_url, link)

                if image:
                    img['src'] = image
                else:
                    img.extract()
                    
            img_count = img_count + 1
        
        return soup.renderContents('utf-8')
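parse_summary reads its filter rules off the instance; here is a minimal sketch of the configuration it appears to expect (the class name and sample values are illustrative; the attribute shapes are inferred from the loops above):

class SummaryCleaner(object):
    # inferred from parse_summary: remove_attributes is a list of attribute
    # names, remove_tags a list of dicts with a 'name' key
    remove_attributes = ['style', 'onclick', 'align']
    remove_tags = [{'name': 'object'}, {'name': 'embed'}]
    noimage = False   # when True, all <img> tags are stripped
    max_images = 5    # images at or beyond this count are dropped
    # a down_image(url, link) method and the parse_summary method above
    # would complete the class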
Example #20
    def update_bill(self, bill):
        """
        Check if a bill exists in datastore, and update its stats.
        """
        this_bill = Bill.get_by_key_name(bill['title'])
        logging.info(bill['title'])
        if this_bill is None:
            this_bill = self.create_bill(bill)
            is_new_bill = True
        else:
            is_new_bill = False
        this_bill.rank = bill['rank']
        import urllib
        self.request_args = {'bill_id': bill['id']}
        self.formatted_args = urllib.urlencode(self.request_args)
        from google.appengine.api import urlfetch
        fetch_page = urlfetch.fetch(url=OPENCONGRESS_INFO_URL + self.formatted_args,
                                    method=urlfetch.GET)
        from utils.BeautifulSoup import BeautifulSoup
        document = BeautifulSoup(fetch_page.content)
        # each <li> reads "<strong>Label:</strong> value"; split out the value
        items = document.findAll('li')
        def li_value(index):
            return str(items[index]).split('</strong> ')[1].split('</li>')[0]
        property_count = 0
        this_bill.introduction_date = li_value(property_count)
        this_bill.status = li_value(property_count + 1)
        if this_bill.status == "This Bill Has Become Law":
            property_count = -1  # no next step
        else:
            this_bill.next_step = li_value(property_count + 2)
        this_bill.latest_action = li_value(property_count + 3)
        if len(this_bill.latest_action) > 68:
            this_bill.latest_action = " ".join(this_bill.latest_action.split(' ')[:9]) + "..."
        this_bill.sponsor = li_value(property_count + 4).decode('utf-8')
        this_bill.sponsor_name = this_bill.sponsor.split("[")[0]
        self.save.append(this_bill)
        if is_new_bill:
            self.send_email_updates(this_bill)
        return
Example #21
def get_excerpt(content):
    soup = BeautifulSoup(content)
    return soup.getText()[:100]
Example #22
def get_excerpt(content):
    soup = BeautifulSoup(content)
    return soup.getText()[:100]
Example #23
    def __parse_stations(self, data, genre):
        """
        Parses the list of stations.
        """

        stations = []
        next_page_url = ""

        soup = BeautifulSoup(data)
        resulttable = soup.find("div", {"id": "content"})

        if (resulttable):
            for entry in resulttable.findAll("tr"):
                #print entry

                station = File(self)
                try:
                    station.name = entry.find("span", {
                        "class": "name"
                    }).a.contents[0]
                except:
                    continue
                try:
                    now_playing = entry.find("p", {
                        "class": "stream-onair"
                    }).contents[1]
                except:
                    now_playing = ""
                station.resource = _ICECAST_BASE + entry.find(
                    "td", {
                        "class": "tune-in"
                    }).find("a")["href"]
                try:
                    bitrate = entry.find("td", {
                        "class": "tune-in"
                    }).findAll("p", {"class": "format"})[1]["title"]
                except:
                    bitrate = "-"

                try:
                    typename = entry.find("a", {
                        "class": "no-link"
                    }).contents[0].strip()
                except:
                    typename = ""

                if (typename == "MP3"):
                    station.mimetype = "audio/mpeg"
                elif (typename == "AAC+"):
                    station.mimetype = "audio/mp4"
                else:
                    station.mimetype = "audio/x-unknown"

                station.path = "/" + genre + "/" + \
                    self.__encode_station(station.name,
                                          bitrate,
                                          station.mimetype,
                                          station.resource,
                                          genre)
                station.info = "Bitrate: %s\n" \
                               "Now playing: %s" % (bitrate, now_playing)
                station.icon = theme.icecast_station.get_path()
                stations.append(station)
            #end for

            pager_tag = resulttable.find("ul", {"class": "pager"})
            if (pager_tag):
                link = pager_tag.findAll("a")[-1]
                if (not link.contents[0].isdigit()):
                    # must be an arrow
                    next_page_url = link["href"]
                #end if
            #end if

        #end if

        if (not stations):
            self.__current_folder.message = "station list not available"
            logging.error("icecast station listing download failed\n%s",
                          logging.stacktrace())

        return (stations, next_page_url)
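Since this parser returns (stations, next_page_url), a caller can keep following the pager until it runs dry. A minimal sketch of such a loop, assuming a hypothetical self.__download HTTP helper on the same class and that pager hrefs are relative like the tune-in links:

    def __load_all_stations(self, genre, data):
        # accumulate stations across pager links; self.__download is an
        # assumed helper, not part of the original listing
        all_stations = []
        while True:
            stations, next_page_url = self.__parse_stations(data, genre)
            all_stations.extend(stations)
            if not next_page_url:
                break
            data = self.__download(_ICECAST_BASE + next_page_url)
        return all_stations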