Example #1
0
def doCron(videos):
    """Poll YouTube's 'on the web' standard feed and persist new videos.

    Each feed <entry> whose title key is not already known (checked via
    getVideo) is turned into a Video and saved.  Entries lacking a title,
    a /watch? link, or either thumbnail size (120x90, 480x360) are skipped.
    """
    raw_data = http.getHttp("https://gdata.youtube.com/feeds/api/standardfeeds/on_the_web")
    # BeautifulSoup 3: <category> tags in this feed never close, so they
    # must be declared self-closing or the parse tree nests incorrectly.
    soup = BeautifulSoup(raw_data, selfClosingTags=['category'])
    for entry in soup.findAll('entry'):
        titles = entry('title')
        # Original checked len(entry('title'))>0 twice; do it once.
        if not titles:
            continue
        mykey = titles[0].text
        if not mykey or getVideo(videos, mykey):
            continue  # empty key, or already stored
        video = Video()
        video.title = mykey
        video.mykey = mykey
        contents = entry('content')
        video.text = contents[0].text if contents else ''
        # BS3 stores attributes as an ordered list of (name, value) pairs;
        # this feed's <link> tags carry href at index 2.
        links = entry(lambda tag: tag.name == 'link'
                      and tag.attrs[2][0] == 'href'
                      and '/watch?' in tag.attrs[2][1])
        if not links:
            continue
        video.link = links[0].attrs[2][1]
        imgs = entry('media:thumbnail', height="90", width="120")
        if not imgs:
            continue
        video.img = imgs[0].attrs[0][1]
        imgsBig = entry('media:thumbnail', height='360', width='480')
        if not imgsBig:
            continue
        video.imgBig = imgsBig[0].attrs[0][1]
        video.tags = getTags(entry)
        video.categories = getCategories(entry)
        video.save()
Example #2
0
def get_onepage_poclist(page):
    """Return the PoC link hrefs listed on page *page* of beebeeto.com/pdb.

    Returns the '' sentinel (as callers expect) when the page cannot be
    fetched, when *page* is past the last page, or when the list markup is
    missing or empty.
    """
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''

    bt = BeautifulSoup(info)
    # The pager highlights the current page with this inline style; asking
    # for a too-large page number falls back to page 1.
    end = bt.find('a', {'style' : "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"})
    # Bug fix: find() returns None when the pager is absent, which used to
    # raise AttributeError on renderContents().
    if end is None or ('1' == end.renderContents() and page != 1):
        return ''

    ret = bt.find('div', {'class' : 'mainlist'})
    if ret is None:  # layout changed / list missing
        return ""
    ret = ret.renderContents()
    if ret == "":
        return ""

    retlist = []
    # Keep only anchors that point at PoC detail pages ("poc-..." slugs).
    for one in re.findall('<a href=.*?>', ret):
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            retlist.append(one.strip())

    return retlist
Example #3
0
 def parse_locations_from_preferences_body(self, response_body):
     """Extract the LocationProfile entries from a preferences page.

     Raises ScreenscrapeParseError when the expected profiles <tbody> is
     not present in *response_body*.
     """
     doc = BeautifulSoup(response_body)
     tbody = doc.find('tbody',
         {'id':'dpref_driver_pk__preferences_pk__driver_locations_pk__profiles'})
     if tbody is None:
         raise ScreenscrapeParseError('No tbody found: %r' % response_body)

     profiles = []
     for row in tbody.findAll('tr'):
         name_cell = row.findAll('td', {'class':'profile_name'})[0]
         desc_cell = row.findAll('td', {'class':'profile_descr'})[0]
         radio = row.findAll('input', {'class':'profile_default'})[0]

         profile = LocationProfile(name_cell.text,
                                   radio['value'],
                                   desc_cell.text)
         # The checked radio button marks the user's default profile.
         profile.is_default = (radio.get('checked', None) == 'checked')
         profiles.append(profile)

     return profiles
Example #4
0
def get_onepage_poclist(page):
    """Return the PoC link hrefs listed on page *page* of beebeeto.com/pdb.

    Returns the '' sentinel (as callers expect) when the page cannot be
    fetched, when *page* is past the last page, or when the list markup is
    missing or empty.
    """
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''

    bt = BeautifulSoup(info)
    # The pager highlights the current page with this inline style; asking
    # for a too-large page number falls back to page 1.
    end = bt.find(
        'a', {
            'style':
            "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"
        })
    # Bug fix: find() returns None when the pager is absent, which used to
    # raise AttributeError on renderContents().
    if end is None or ('1' == end.renderContents() and page != 1):
        return ''

    ret = bt.find('div', {'class': 'mainlist'})
    if ret is None:  # layout changed / list missing
        return ""
    ret = ret.renderContents()
    if ret == "":
        return ""

    retlist = []
    # Keep only anchors that point at PoC detail pages ("poc-..." slugs).
    for one in re.findall('<a href=.*?>', ret):
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            retlist.append(one.strip())

    return retlist
Example #5
0
def sanitize_html(value):
    """Strip comments and hide every tag/attribute in *value*.

    The tag and attribute whitelists are empty, so all tags are hidden
    (their text content is kept) and every attribute is dropped.
    """
    # Empty whitelists: nothing survives.  (Originally spelled ''.split(),
    # which always evaluates to [].)
    valid_tags = []
    valid_attrs = []
    soup = BeautifulSoup(value)
    # Remove HTML comments outright.
    for comment in soup.findAll(
        text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True  # BS3: render the children but not the tag
        # BS3 keeps attrs as a list of (name, value) pairs.
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs]
    # NOTE(security): a plain substring removal is not a real XSS defense
    # (e.g. 'java\tscript:' or entity-encoded forms survive).  Kept as-is
    # for compatibility; use a dedicated sanitizer for untrusted input.
    return soup.renderContents().decode('utf8').replace('javascript:', '')
Example #6
0
def strip_html_and_tags(s, invalid_tags):
    '''
    content between "invalid_tags" is removed
    '''
    if not s: return s

    from util.BeautifulSoup import BeautifulSoup

    # Normalise every <br> variant to a newline before parsing.
    text = s
    for br in ('<br>', '<br/>', '<br />'):
        text = text.replace(br, '\n')

    soup = BeautifulSoup(text)
    # Blank out each unwanted tag together with everything inside it.
    for name in invalid_tags:
        for node in soup.findAll(name=name):
            node.replaceWith("")

    # Whatever text nodes remain form the stripped result.
    return ''.join(piece for piece in soup.recursiveChildGenerator()
                   if isinstance(piece, unicode))
Example #7
0
def strip_html_and_tags(s, invalid_tags):
    '''
    content between "invalid_tags" is removed
    '''
    if not s: return s

    from util.BeautifulSoup import BeautifulSoup

    # Turn explicit line breaks into newlines, then parse the markup.
    cleaned = s.replace('<br>', '\n')
    cleaned = cleaned.replace('<br/>', '\n')
    cleaned = cleaned.replace('<br />', '\n')
    soup = BeautifulSoup(cleaned)

    # Remove each unwanted tag (and its contents) from the tree.
    for tag_name in invalid_tags:
        matches = soup.findAll(name=tag_name)
        for match in matches:
            match.replaceWith("")

    # Collect the surviving text nodes.
    parts = [node for node in soup.recursiveChildGenerator()
             if isinstance(node, unicode)]
    return ''.join(parts)
Example #8
0
def getPoc(poc):
    """Fetch the PoC source for slug *poc*; return '' when unavailable.

    '' is returned for fetch failures, paywalled/placeholder pages, and
    pages without the expected <pre class="brush: python;"> block.
    """
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Placeholder image / "view now" banner mark paywalled entries.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    if "立即查看" in info:
        return ''

    try:
        bt = BeautifulSoup(info)
        ret = bt.find('pre', {'class' : "brush: python;"})
        if ret is None:  # layout changed or no code block present
            return ''
        ret = ret.renderContents()
        if ret:
            return strip_tags(ret)
        else:
            return ''
    except Exception:
        # Best-effort: any parse error yields the empty sentinel.  (Bug
        # fix: the bare except also swallowed KeyboardInterrupt/SystemExit.)
        return ''
Example #9
0
def getPoc(poc):
    """Fetch the PoC source for slug *poc*; return '' when unavailable.

    '' is returned for fetch failures, paywalled/placeholder pages, and
    pages without the expected <pre class="brush: python;"> block.
    """
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Placeholder image / "view now" banner mark paywalled entries.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    if "立即查看" in info:
        return ''

    try:
        bt = BeautifulSoup(info)
        ret = bt.find('pre', {'class': "brush: python;"})
        if ret is None:  # layout changed or no code block present
            return ''
        ret = ret.renderContents()
        if ret:
            return strip_tags(ret)
        else:
            return ''
    except Exception:
        # Best-effort: any parse error yields the empty sentinel.  (Bug
        # fix: the bare except also swallowed KeyboardInterrupt/SystemExit.)
        return ''
Example #10
0
def getstatus(code, count=None):
    """Fetch tracking status lines for parcel *code*.

    Returns a list of "<age> ago - <status> - <location>" strings in the
    order the site renders them; at most *count* entries when *count* is
    truthy.
    """
    url = baseurl % code
    f = urllib2.urlopen(url)
    try:
        d = f.read()
    finally:
        f.close()  # close the response even if read() raises

    bs = BeautifulSoup(d)

    res = []

    statuslist = (
        bs.find("div", {"class": "result_up"}).find("table", {"width": "500"}).findAll("p", {"class": "resulttext"})
    )
    for status in statuslist:
        # Each result <p> holds exactly date, status text, and location.
        date, statustext, location = status.contents
        statustext = statustext.string
        date = time.strptime(date, "%d.%m.%Y, klo %H:%M&nbsp;")
        # Drop a fixed-width label prefix -- presumably a Finnish
        # "location:" caption; TODO confirm against live markup.
        location = location[6:].strip()

        dt = datetime.datetime(*date[0:6])
        now = datetime.datetime.now()
        age = now - dt

        agestr = []

        if age.days > 0:
            agestr.append("%dd" % age.days)

        # Seconds below a minute are deliberately not displayed.
        hours, rem = divmod(age.seconds, 3600)
        minutes = rem // 60

        if hours > 0:
            agestr.append("%dh" % hours)
        if minutes > 0:
            agestr.append("%dm" % minutes)

        res.append("%s - %s - %s" % (" ".join(agestr) + " ago", statustext, location))

    if count:
        return res[:count]
    else:
        return res
Example #11
0
def strip_html2(s):
    '''
    Strips out HTML with the BeautifulSoup library.

    >>> strip_html2('<html><body><b>Some <i>ugly</i></b> html.</body></html>')
    u'Some ugly html.'
    '''
    if not s: return s

    from util.BeautifulSoup import BeautifulSoup

    def _text_nodes(soup):
        # Yield raw text as-is, <br> as a newline, and skip everything else.
        for node in soup.recursiveChildGenerator():
            if isinstance(node, unicode):
                yield node
            elif node.name == 'br':
                yield '\n'

    return ''.join(_text_nodes(BeautifulSoup(s)))
Example #12
0
def strip_html2(s):
    '''
    Strips out HTML with the BeautifulSoup library.

    >>> strip_html2('<html><body><b>Some <i>ugly</i></b> html.</body></html>')
    u'Some ugly html.'
    '''
    if not s: return s

    from util.BeautifulSoup import BeautifulSoup

    out = []
    append = out.append
    for node in BeautifulSoup(s).recursiveChildGenerator():
        # Keep raw text, translate <br> into a newline, drop other tags.
        if isinstance(node, unicode):
            append(node)
        elif node.name == 'br':
            append('\n')
    return ''.join(out)
Example #13
0
def buildStoryFromString(data, stories):
    """Ensure a Story exists for *data*.

    If findStory() misses, Google-search *data*, scrape the first external
    result's anchor for title/link and a nearby sibling for the snippet,
    and persist a new Story.  Download failures are logged and swallowed.
    """
    story=findStory(data, stories)
    if not story:
        url="http://www.google.com/search?q="+data.replace(' ', '+')
        logging.info(url)
        try:
            raw_data = http.getHttp(url)
            soup = BeautifulSoup(raw_data)
            story=None
            # First <a> whose first attribute is an absolute (non "/"-
            # relative) href that doesn't mention 'google', i.e. the first
            # external search result.  NOTE(review): relies on BS3 attribute
            # ordering (attrs[0] being href) -- fragile to markup changes.
            a=soup.find(lambda tag: tag.name=='a' and tag.attrs[0][0]=='href' and not tag.attrs[0][1].startswith('/') and not 'google' in tag.attrs[0][1])
            if a and a.text:
                story=Story()
                story.deleteFlag=False
                story.mykey=data
                story.title=''
                # Flatten the anchor's mixed Tag/text children into a title.
                for c in a.contents:
                    if type(c) == Tag:
                        story.title+=c.text
                    else:
                        story.title+=c
                story.link=a.attrs[0][1]
                story.text=''
                # contents[4] of the result container presumably holds the
                # snippet -- position-based and layout-dependent; TODO confirm.
                for c in a.parent.contents[4].contents:
                    if type(c) == Tag:
                        story.text+=c.text
                    else:
                        story.text+=c
                story.put()
        except DownloadError: #@UndefinedVariable
            logging.error(url + ' failed to load')
    
    '''
    scraper=SearchScraper()
    scraper.feed(raw_data)
    return scraper.story
    '''
    
    
    
    
    
Example #14
0
    def odc_body_received(self, data):
        """Handle a completed ODC (direct-connect) message body.

        If the body embeds inline images after a '<BINARY>' marker, each
        image's bytes are written to the user data directory and the <img>
        tag is rewritten to point at the saved file.  The resulting message
        text is delivered to the conversation, then the next ODC header
        read is queued on the socket.
        """
        info('odc_body_received')

        # Get a place to store the images.
        import stdpaths
        assetdir = stdpaths.userdata

        # Did the message include an inline image?
        if '<BINARY>' in data:
            j = data.find('<BINARY>')

            # Parse the HTML _before_ <BINARY>
            soup = BeautifulSoup(data[:j])
            for img in soup.html.body('img'):  # may have more than one <img>

                # For each <IMG> tag, locate its payload in the binary
                # section by matching the ID/SIZE attributes of its marker.
                imgdata = data[j:]
                findme = ' ID="%s" SIZE="%s">' % (str(
                    img['id']), str(img['datasize']))
                i = imgdata.find(findme)
                # NOTE(review): the end bound datasize+33 appears to absorb
                # fixed framing bytes -- confirm against the ODC wire format.
                imgbytes = imgdata[i + len(findme):int(img['datasize']) + 33]

                # os.path.split the img src, because some clients send their
                # full paths. (file:///c:/blah.jpg)
                imgpath = os.path.join(assetdir, os.path.split(img['src'])[1])

                # Point the tag at the local copy; the original width/height
                # are dropped rather than trusted.
                img['src'] = imgpath
                del img['width']
                del img['height']

                with open(imgpath, 'wb') as f:
                    f.write(imgbytes)

            # Re-render the rewritten HTML as the outgoing message text.
            msg = unicode(soup.html)
        else:
            msg = data

        self.convo.incoming_message(self.screenname, msg)
        self.socket.receive_next(ODCHeader, self.odc_header_received)
Example #15
0
	if not(dburl):
		dburl = TallstreetUrls.get_url(url[0:-1])	
		if dburl:
			url = url[0:-1]
	if dburl:
		payload["url"] = dburl.url
		payload["title"] = dburl.title 
		payload["description"] = dburl.description
		payload["new"] = False
			
		for keyword in dburl.related_keywords:
			payload["tags"][keyword.tag.tag] = min(keyword.money / 1000 + 10, 30)
	else:
		page = fetch(url)
		soup = BeautifulSoup(page.content)
		payload["title"] = soup.html.head.title.string
		desc = soup.find("meta", {"name": "description"})
		if desc:
			payload["description"] = desc["content"]
		payload["url"] = url
		payload["new"] = True
		
	
	if keywords == []:
		invested = TallstreetPortfolio.get_keywords(request.user, dburl)
		for keyword in invested:
			if payload["tags"].has_key(keyword.keyword.tag):
				del payload["tags"][keyword.keyword.tag]	
			if keyword.keyword.tag == new_keyword:
				new_keyword = ""