def doCron(videos):
    """Poll YouTube's "on the web" standard feed and save any videos not
    already present in *videos*.

    For each feed <entry> a Video record is built from the title, content,
    watch link, small (120x90) and big (480x360) thumbnails, tags and
    categories.  Entries missing a watch link or either thumbnail are
    skipped entirely.

    NOTE(review): relies on BeautifulSoup 3 positional attrs tuples
    (tag.attrs[2] == ('href', url) on <link> elements) -- fragile against
    feed markup changes.
    """
    raw_data = http.getHttp("https://gdata.youtube.com/feeds/api/standardfeeds/on_the_web")
    # <category> is self-closing in the Atom feed; tell BS3 so it does not
    # swallow following siblings into it.
    soup = BeautifulSoup(raw_data, selfClosingTags=['category'])
    entries = soup.findAll('entry')
    for entry in entries:
        if len(entry('title')) > 0:
            # Fix: the old inner "... if len(entry('title'))>0 else None"
            # was redundant -- this branch is already guarded by that test.
            mykey = entry('title')[0].text
            if mykey and not getVideo(videos, mykey):
                video = Video()
                video.title = entry('title')[0].text
                video.mykey = mykey
                video.text = entry('content')[0].text if len(entry('content')) > 0 else ''
                # Find the <link href="...watch?..."> element for the video page.
                links = entry(lambda tag: tag.name == 'link' and tag.attrs[2][0] == 'href' and '/watch?' in tag.attrs[2][1])
                if len(links) == 0:
                    continue
                video.link = links[0].attrs[2][1]
                imgs = entry('media:thumbnail', height="90", width="120")
                if len(imgs) == 0:
                    continue
                video.img = imgs[0].attrs[0][1]
                imgsBig = entry('media:thumbnail', height='360', width='480')
                if len(imgsBig) == 0:
                    continue
                video.imgBig = imgsBig[0].attrs[0][1]
                video.tags = getTags(entry)
                video.categories = getCategories(entry)
                video.save()
def get_onepage_poclist(page):
    """Scrape one listing page of beebeeto.com/pdb.

    Returns a list of PoC detail-page hrefs (those containing "poc-"),
    or '' when the page is empty, past the last page, or cannot be
    fetched/parsed.
    """
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''
    bt = BeautifulSoup(info)
    end = bt.find('a', {'style': "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"})
    # Fix: guard against find() returning None (layout change) before
    # dereferencing -- the original raised AttributeError here.
    # Pager showing '1' on a page other than 1 means we ran past the end.
    if end is not None and '1' == end.renderContents() and page != 1:
        return ''
    ret = bt.find('div', {'class': 'mainlist'})
    # Fix: same None guard for a missing result list container.
    if ret is None:
        return ''
    ret = ret.renderContents()
    if ret == "":
        return ""
    retlist = []
    # Pull every opening anchor tag and keep only PoC links.
    rets = re.findall('<a href=.*?>', ret)
    for one in rets:
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            one = one.strip()
            retlist.append(one)
    return retlist
def parse_locations_from_preferences_body(self, response_body):
    """Extract the driver's location profiles from the preferences page.

    Raises ScreenscrapeParseError when the expected profiles <tbody> is
    missing; otherwise returns a list of LocationProfile objects with
    is_default set from the checked radio button.
    """
    doc = BeautifulSoup(response_body)
    tbody = doc.find('tbody', {'id': 'dpref_driver_pk__preferences_pk__driver_locations_pk__profiles'})
    if tbody is None:
        raise ScreenscrapeParseError('No tbody found: %r' % response_body)

    profiles = []
    for row in tbody.findAll('tr'):
        name_cell = row.findAll('td', {'class': 'profile_name'})[0]
        desc_cell = row.findAll('td', {'class': 'profile_descr'})[0]
        default_radio = row.findAll('input', {'class': 'profile_default'})[0]

        profile = LocationProfile(name_cell.text, default_radio['value'], desc_cell.text)
        profile.is_default = (default_radio.get('checked', None) == 'checked')
        profiles.append(profile)
    return profiles
def get_onepage_poclist(page):
    """Scrape one listing page of beebeeto.com/pdb.

    Returns a list of PoC detail-page hrefs (those containing "poc-"),
    or '' when the page is empty, past the last page, or cannot be
    fetched/parsed.
    """
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''
    bt = BeautifulSoup(info)
    end = bt.find(
        'a', {
            'style': "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"
        })
    # Fix: guard against find() returning None (layout change) before
    # dereferencing -- the original raised AttributeError here.
    # Pager showing '1' on a page other than 1 means we ran past the end.
    if end is not None and '1' == end.renderContents() and page != 1:
        return ''
    ret = bt.find('div', {'class': 'mainlist'})
    # Fix: same None guard for a missing result list container.
    if ret is None:
        return ''
    ret = ret.renderContents()
    if ret == "":
        return ""
    retlist = []
    # Pull every opening anchor tag and keep only PoC links.
    rets = re.findall('<a href=.*?>', ret)
    for one in rets:
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            one = one.strip()
            retlist.append(one)
    return retlist
def sanitize_html(value):
    """Sanitize untrusted HTML: hide all disallowed tags, drop disallowed
    attributes, strip comments, and remove 'javascript:' scheme text.

    valid_tags/valid_attrs are currently empty, so every tag is hidden and
    every attribute removed -- only text content survives.
    """
    valid_tags = ''.split()
    valid_attrs = ''.split()
    soup = BeautifulSoup(value)
    # Remove HTML comments (can carry conditional-comment payloads).
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs]
    out = soup.renderContents().decode('utf8')
    # Fix: replace until fixpoint.  A single pass is bypassable -- input
    # like 'javjavascript:ascript:' reassembles into 'javascript:' after
    # one round of removal.
    while 'javascript:' in out:
        out = out.replace('javascript:', '')
    return out
def strip_html_and_tags(s, invalid_tags):
    '''Strip markup from *s*, dropping the entire contents of any tag
    named in *invalid_tags*.  <br> variants become newlines.'''
    if not s:
        return s
    from util.BeautifulSoup import BeautifulSoup
    normalized = s.replace('<br>', '\n').replace('<br/>', '\n').replace('<br />', '\n')
    soup = BeautifulSoup(normalized)
    for tag_name in invalid_tags:
        for node in soup.findAll(name=tag_name):
            node.replaceWith("")
    text_nodes = [node for node in soup.recursiveChildGenerator()
                  if isinstance(node, unicode)]
    return ''.join(text_nodes)
def strip_html_and_tags(s, invalid_tags):
    '''Strip markup from *s*, dropping the entire contents of any tag
    named in *invalid_tags*.  <br> variants become newlines.'''
    if not s:
        return s
    from util.BeautifulSoup import BeautifulSoup
    normalized = s.replace('<br>', '\n').replace('<br/>', '\n').replace('<br />', '\n')
    soup = BeautifulSoup(normalized)
    for tag_name in invalid_tags:
        for node in soup.findAll(name=tag_name):
            node.replaceWith("")
    text_nodes = [node for node in soup.recursiveChildGenerator()
                  if isinstance(node, unicode)]
    return ''.join(text_nodes)
def getPoc(poc):
    """Download a PoC detail page and return its python source text,
    or '' when the PoC is unavailable or the page cannot be parsed."""
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Placeholder image => no public PoC body on this page.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    # "View now" paywall marker => PoC is not freely readable.
    if "立即查看" in info:
        return ''
    try:
        bt = BeautifulSoup(info)
        ret = bt.find('pre', {'class': "brush: python;"})
        # Explicit guard instead of relying on the except below.
        if ret is None:
            return ''
        ret = ret.renderContents()
        if ret:
            return strip_tags(ret)
        else:
            return ''
    # Fix: narrowed from a bare 'except:' which also swallowed
    # KeyboardInterrupt/SystemExit.  Best-effort: any parse error => ''.
    except Exception:
        return ''
def getPoc(poc):
    """Download a PoC detail page and return its python source text,
    or '' when the PoC is unavailable or the page cannot be parsed."""
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Placeholder image => no public PoC body on this page.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    # "View now" paywall marker => PoC is not freely readable.
    if "立即查看" in info:
        return ''
    try:
        bt = BeautifulSoup(info)
        ret = bt.find('pre', {'class': "brush: python;"})
        # Explicit guard instead of relying on the except below.
        if ret is None:
            return ''
        ret = ret.renderContents()
        if ret:
            return strip_tags(ret)
        else:
            return ''
    # Fix: narrowed from a bare 'except:' which also swallowed
    # KeyboardInterrupt/SystemExit.  Best-effort: any parse error => ''.
    except Exception:
        return ''
def getstatus(code, count=None):
    """Fetch the tracking page for *code* and return a list of strings
    "<age> ago - <status> - <location>", newest rows as scraped.

    When *count* is given, only the first *count* entries are returned.
    Assumes the module-level `baseurl` format string and a page layout
    with <p class="resulttext"> rows of (date, status, location).
    """
    url = baseurl % code
    f = urllib2.urlopen(url)
    # Fix: close the response even if read() raises (original leaked it).
    try:
        d = f.read()
    finally:
        f.close()
    bs = BeautifulSoup(d)
    res = []
    statuslist = (
        bs.find("div", {"class": "result_up"})
        .find("table", {"width": "500"})
        .findAll("p", {"class": "resulttext"})
    )
    for status in statuslist:
        date, statustext, location = status.contents
        statustext = statustext.string
        # Finnish date format, e.g. "31.12.2010, klo 14:05 ".
        date = time.strptime(date, "%d.%m.%Y, klo %H:%M ")
        # Drop the leading label (6 chars) before the location name.
        location = location[6:].strip()
        dt = datetime.datetime(*date[0:6])
        age = datetime.datetime.now() - dt
        # Build a compact age string like "2d 3h 14m"; fix: dropped the
        # unused 'seconds' computation.
        agestr = []
        if age.days > 0:
            agestr.append("%dd" % age.days)
        secs = age.seconds
        hours, minutes = secs // 3600, secs // 60 % 60
        if hours > 0:
            agestr.append("%dh" % hours)
        if minutes > 0:
            agestr.append("%dm" % minutes)
        res.append("%s - %s - %s" % (" ".join(agestr) + " ago", statustext, location))
    if count:
        return res[:count]
    else:
        return res
def strip_html2(s):
    ''' Strips out HTML with the BeautifulSoup library.

    >>> strip_html2('<html><body><b>Some <i>ugly</i></b> html.</body></html>')
    u'Some ugly html.'
    '''
    if not s:
        return s
    from util.BeautifulSoup import BeautifulSoup
    pieces = []
    # Walk every node; keep text, and turn <br> tags into newlines.
    for node in BeautifulSoup(s).recursiveChildGenerator():
        if isinstance(node, unicode):
            pieces.append(node)
        elif node.name == 'br':
            pieces.append('\n')
    return ''.join(pieces)
def buildStoryFromString(data, stories):
    """Return nothing useful directly; ensure a Story for *data* exists.

    Looks *data* up in *stories*; when absent, scrapes the first
    non-Google result of a Google web search for *data* and persists a
    new Story built from that result's title, link and snippet.

    NOTE(review): depends on BeautifulSoup 3 positional attrs tuples
    (tag.attrs[0] == ('href', url)) and on Google's result markup
    (a.parent.contents[4] holding the snippet) -- both fragile.
    """
    story = findStory(data, stories)
    if not story:
        url = "http://www.google.com/search?q=" + data.replace(' ', '+')
        logging.info(url)
        try:
            raw_data = http.getHttp(url)
            soup = BeautifulSoup(raw_data)
            story = None
            # First anchor whose href is absolute and not a Google link.
            a = soup.find(lambda tag: tag.name == 'a' and tag.attrs[0][0] == 'href' and not tag.attrs[0][1].startswith('/') and not 'google' in tag.attrs[0][1])
            if a and a.text:
                story = Story()
                story.deleteFlag = False
                story.mykey = data
                story.title = ''
                # Flatten the anchor's mixed text/tag children into the title.
                for c in a.contents:
                    if type(c) == Tag:
                        story.title += c.text
                    else:
                        story.title += c
                story.link = a.attrs[0][1]
                story.text = ''
                # The snippet is assumed to be the 5th child of the
                # anchor's parent -- TODO confirm against current markup.
                for c in a.parent.contents[4].contents:
                    if type(c) == Tag:
                        story.text += c.text
                    else:
                        story.text += c
                story.put()
        except DownloadError:  #@UndefinedVariable
            logging.error(url + ' failed to load')
        '''
        scraper=SearchScraper()
        scraper.feed(raw_data)
        return scraper.story
        '''
def odc_body_received(self, data): info('odc_body_received') # Get a place to store the images. import stdpaths assetdir = stdpaths.userdata # Did the message include an inline image? if '<BINARY>' in data: j = data.find('<BINARY>') # Parse the HTML _before_ <BINARY> soup = BeautifulSoup(data[:j]) for img in soup.html.body('img'): # may have more than one <img> # For each <IMG> tag imgdata = data[j:] findme = ' ID="%s" SIZE="%s">' % (str( img['id']), str(img['datasize'])) i = imgdata.find(findme) imgbytes = imgdata[i + len(findme):int(img['datasize']) + 33] # os.path.split the img src, because some clients send their # full paths. (file:///c:/blah.jpg) imgpath = os.path.join(assetdir, os.path.split(img['src'])[1]) img['src'] = imgpath del img['width'] del img['height'] with open(imgpath, 'wb') as f: f.write(imgbytes) msg = unicode(soup.html) else: msg = data self.convo.incoming_message(self.screenname, msg) self.socket.receive_next(ODCHeader, self.odc_header_received)
# NOTE(review): fragment of a larger request handler -- 'dburl', 'url',
# 'payload', 'keywords', 'request' and 'new_keyword' are defined in the
# enclosing scope, which is not visible here.
if not(dburl):
    # Retry the lookup without the last character (presumably a trailing
    # slash) -- TODO confirm against TallstreetUrls.get_url.
    dburl = TallstreetUrls.get_url(url[0:-1])
    if dburl:
        url = url[0:-1]
if dburl:
    # Known URL: populate the payload from the stored record.
    payload["url"] = dburl.url
    payload["title"] = dburl.title
    payload["description"] = dburl.description
    payload["new"] = False
    for keyword in dburl.related_keywords:
        # Tag weight derived from invested money, capped at 30.
        payload["tags"][keyword.tag.tag] = min(keyword.money / 1000 + 10, 30)
else:
    # Unknown URL: fetch the page and scrape title/description live.
    page = fetch(url)
    soup = BeautifulSoup(page.content)
    payload["title"] = soup.html.head.title.string
    desc = soup.find("meta", {"name": "description"})
    if desc:
        payload["description"] = desc["content"]
    payload["url"] = url
    payload["new"] = True
if keywords == []:
    # No explicit keywords supplied: remove tags the user has already
    # invested in, and clear new_keyword if it matches one of them.
    invested = TallstreetPortfolio.get_keywords(request.user, dburl)
    for keyword in invested:
        if payload["tags"].has_key(keyword.keyword.tag):
            del payload["tags"][keyword.keyword.tag]
        if keyword.keyword.tag == new_keyword:
            new_keyword = ""