def settings(self, stype):
    """stype: school, birth, company, tag, intro"""
    BASE = 'http://weibo.cn/dpool/ttt/'
    # Open the account-settings page.
    link = BASE + self.current_page.findAll(
        href=re.compile('setting'))[0]['href']
    self.current_page = BSS(self.br.open(link).read())
    self.rand_sleep()
    # Follow the link for the requested setting type.
    BASE = 'http://weibo.cn'
    link = BASE + self.current_page.findAll(
        href=re.compile(stype))[0]['href']
    self.current_page = BSS(self.br.open(link).read())
    self.rand_sleep()
    if stype == 'school':
        return self.set_school()
    elif stype == 'company':
        return self.set_company()
    elif stype == 'birth':
        return self.set_birth()
    elif stype == 'tag':
        pass  # not implemented
    elif stype == 'intro':
        pass  # not implemented
    print self.current_page

def set_company(self):
    BASE = 'http://weibo.cn'
    # Substrings common in Chinese company names ('Ltd.', 'company',
    # 'group', 'electronics', 'foreign trade', 'Shanghai', 'Beijing', 'bank').
    company_kw = ['有限', '公司', '集团', '电子', '外贸', '上海', '北京', '银行']
    # Search for companies matching a random keyword.
    data = {}
    submit_link = BASE + self.current_page.findAll('go')[0]['href']
    for postfield in self.current_page.findAll('postfield'):
        data[str(postfield['name'])] = postfield['value'].encode(
            'utf8', 'ignore')
    data['keyword'] = random.choice(company_kw)
    self.current_page = self.xml_submit(submit_link, data)
    self.rand_sleep()
    # Pick a random company from the search results.
    company = random.choice(
        self.current_page.findAll(href=re.compile('scn=')))
    link = BASE + company['href']
    name = company.text
    self.current_page = BSS(self.br.open(link).read())
    # Fill in the employment form with random start/end years.
    data = {}
    submit_link = BASE + self.current_page.findAll('go')[0]['href']
    for postfield in self.current_page.findAll('postfield'):
        data[str(postfield['name'])] = postfield['value'].encode(
            'utf8', 'ignore')
    data['scremark'] = ''
    start = random.randint(2004, 2011)
    end = random.randint(start, 2011)
    data['scend'] = str(end)
    data['scstart'] = str(start)
    self.rand_sleep()
    self.current_page = self.xml_submit(submit_link, data)
    # findAll() returns a list, never None; a delete link means the entry
    # was created.
    if self.current_page.findAll(href=re.compile('subact=del')):
        return (name, start)
    else:
        return 0

def wotd(self, irc, msg, args):
    """
    returns Merriam-Webster's Word of the Day, including link to mp3
    audio usage example
    """
    try:
        idx = args[0]
    except IndexError:
        idx = 0
    r = Request('http://www.merriam-webster.com/word/index.xml')
    doc = urlopen(r)
    html = doc.read()
    soup = BSS(html, convertEntities=BSS.XML_ENTITIES)
    item = soup.findAll('item')[int(idx)]
    mp3url = tinyurl(item.enclosure['url'])
    itemurl = tinyurl(item.link.string)
    # description is HTML in a CDATA section
    dsoup = BS(item.description.string, convertEntities=BS.HTML_ENTITIES)
    summary = ''.join(dsoup.findAll(text=True))
    summary = re.sub(r'\s+', ' ', summary)
    match = re.search(r'\d{2}, \d+ is: (.+?) Example sentence:',
                      summary, re.I | re.M | re.S)
    worddef = match.group(1).encode('ascii', 'ignore')
    worddef = re.sub(r'^\s*(?P<wotd>[\w\s]+)', r'\g<wotd>: ', worddef)
    resp = '%s (audio:%s, link:%s)' % (worddef, mp3url, itemurl)
    irc.reply(resp, prefixNick=False)

def _fetch_xml(self, function, query):
    url = "http://api.wunderground.com/auto/wui/geo/%sXML/index.xml?%s" % (
        function, urlencode({'query': query}))
    print url
    doc = web.getUrl(url, headers=HEADERS)
    # Wunderground double-encodes some of its entities, so we'll
    # double-decode.
    return BSS(doc, convertEntities=BSS.HTML_ENTITIES)

def PhotoMenu():
    oc = ObjectContainer(title2="Photos")
    for item in XML.ElementFromURL(RSS_FEED).xpath('//item'):
        url = item.find('link').text
        title = item.find('title').text
        date = Datetime.ParseDate(item.find('pubDate').text)
        thumb = R(ICON)
        try:
            thumb = FindPhotos(
                item.xpath('c:encoded', namespaces=PHOTO_NS)[0].text)[0]
        except:
            # Skip items whose photos can't be extracted.
            continue
        summary = item.xpath('description')[0].text.replace(
            '<p>', '').replace('</p>', '').replace(
            '<br />', "\n").replace(' [...]', '...')
        soup = BSS(summary, convertEntities=BSS.HTML_ENTITIES)
        summary = soup.contents[0]
        # Technically, I should use the url parameter of the
        # PhotoAlbumObject to perform a service lookup. However, this
        # currently introduces an additional level in the structure which
        # is undesired. Therefore, I'm doing this all manually.
        oc.add(
            PhotoAlbumObject(key=Callback(PhotoList, url=url, title=title),
                             rating_key=url,
                             title=title,
                             thumb=thumb,
                             originally_available_at=date))
    return oc

def parse_links(contents, rel):
    """Define a helper function for parsing feed links."""
    strainer = SoupStrainer('link', rel=rel)
    entries = [
        tag for tag in BSS(
            contents, parseOnlyThese=strainer, selfClosingTags=['link'])
    ]
    return entries

def login(self, usr_name, passwd):
    self.passwd = passwd
    self.current_page = BSS(self.br.open(HOME_SOHU).read())
    # Find the login form via the login link on the home page.
    login_link = HOME_SOHU + self.br.find_link(
        text_regex=re.compile(LOGIN_TXT)).url
    self.current_page = BSS(self.br.open(login_link).read())
    submit_link = HOME_SOHU + self.current_page.find('form')['action']
    # Copy the hidden form fields, then fill in the credentials.
    data = {}
    for postfield in self.current_page.findAll('input'):
        if postfield['type'] not in ['button', 'submit']:
            data[str(postfield['name'])] = postfield['value'].encode(
                'utf8', 'ignore')
    data['u'] = usr_name
    data['p'] = passwd
    data['fr'] = 'null'
    self.current_page = self.xml_submit(submit_link, data)
    self.rand_sleep()

def login(self, usr_name, passwd):
    self.passwd = passwd
    self.current_page = BSS(self.br.open(HOME_SINA).read())
    login_link = self.current_page('a', limit=1)[0]['href']
    self.current_page = BSS(self.br.open(login_link).read())
    submit_link = BASE + self.current_page.find('go')['href']
    # Copy the WML postfields, substituting the real password for the
    # $(password) placeholder.
    data = {}
    for postfield in self.current_page.findAll('postfield'):
        if postfield['value'].encode('utf8', 'ignore') != '$(password)':
            data[str(postfield['name'])] = postfield['value'].encode(
                'utf8', 'ignore')
        else:
            data[str(postfield['name'])] = passwd
    data['remember'] = '1'
    data['mobile'] = usr_name
    self.current_page = self.xml_submit(submit_link, data)
    home_link = self.current_page.find('a')['href']
    self.current_page = BSS(self.br.open(home_link).read())
    self.rand_sleep()
    return 1

def change_pass(self, passwd):
    data = {}
    link = HOME_SOHU + self.br.find_link(
        text_regex=re.compile(SETTING_TXT)).url
    self.current_page = BSS(self.br.open(link).read())
    self.rand_sleep()
    link = HOME_SOHU + self.current_page.findAll(
        href=re.compile('upass'))[0]['href']
    self.current_page = BSS(self.br.open(link).read())
    # Copy the hidden form fields, then fill in the old and new passwords.
    for postfield in self.current_page.findAll('input'):
        if postfield['type'] not in ['button', 'submit']:
            data[str(postfield['name'])] = postfield['value'].encode(
                'utf8', 'ignore')
    data['password'] = self.passwd
    data['newpass'] = passwd
    submit_link = HOME_SOHU + self.current_page.find('form')['action']
    self.current_page = self.xml_submit(submit_link, data)
    print self.current_page
    self.rand_sleep()

def set_school(self):
    BASE = 'http://weibo.cn'
    # Navigate: search page -> school type 1 -> province 31 -> random school.
    link = BASE + self.current_page.findAll(
        href=re.compile('subact=search'))[0]['href']
    self.current_page = BSS(self.br.open(link).read())
    self.rand_sleep()
    link = BASE + self.current_page.findAll(
        href=re.compile('stype=1'))[0]['href']
    self.current_page = BSS(self.br.open(link).read())
    self.rand_sleep()
    link = BASE + self.current_page.findAll(
        href=re.compile('provid=31'))[0]['href']
    self.current_page = BSS(self.br.open(link).read())
    self.rand_sleep()
    link = BASE + random.choice(
        self.current_page.findAll(href=re.compile('scn=')))['href']
    self.current_page = BSS(self.br.open(link).read())
    self.rand_sleep()
    # Fill in the enrollment form with a random start year.
    data = {}
    submit_link = BASE + self.current_page.findAll('go')[0]['href']
    for postfield in self.current_page.findAll('postfield'):
        data[str(postfield['name'])] = postfield['value'].encode(
            'utf8', 'ignore')
    name = data['scname']
    data['scremark'] = ''
    start = random.randint(1995, 2004)
    data['scstart'] = str(start)
    self.current_page = self.xml_submit(submit_link, data)
    # findAll() returns a list, never None; a delete link means the entry
    # was created.
    if self.current_page.findAll(href=re.compile('subact=del')):
        return (name, start)
    else:
        return 0

def PhotoMenu():
    dir = MediaContainer(viewGroup='Details', title2="Photos")
    # Rename the namespaced tag so ElementFromString can parse it.
    xml = HTTP.Request(RSS_FEED).content.replace('media:content', 'content')
    for item in XML.ElementFromString(xml).xpath('//item'):
        title = item.find('title').text
        summary = item.xpath('description')[0].text.replace(
            '<p>', '').replace('</p>', '').replace(
            '<br />', "\n").replace(' [...]', '...')
        soup = BSS(summary, convertEntities=BSS.HTML_ENTITIES)
        summary = soup.contents[0]
        date = Datetime.ParseDate(
            item.find('pubDate').text).strftime('%a %b %d, %Y')
        thumb = item.xpath('content', namespaces=PHOTO_NS)[0].get('url')
        dir.Append(
            Function(DirectoryItem(PhotoList, title, date, summary, thumb),
                     key=item.find('link').text))
    return dir

def play(self, page, mode=''):
    if Debug:
        self.LOG('DEBUG: _play()\nurl: %s' % page)
    # Get current list item details...
    title = unicode(xbmc.getInfoLabel("ListItem.Title"), "utf-8")
    thumbnail = xbmc.getInfoImage("ListItem.Thumb")
    plot = unicode(xbmc.getInfoLabel("ListItem.Plot"), "utf-8")
    if mode == 'smil':
        smil = BSS(self._get(page))
        rtmp = smil.meta['base']
        video = smil.video['src']
        swfUrl = 'http://medici.tv/medici.swf'
        # rtmpdump script for console use
        rtmpdump = "rtmpdump -r %s --swfUrl http://medici.tv/medici.swf --tcUrl '%s' --playpath '%s' -o '%s.mp4'" % \
            (rtmp, rtmp, saxutils.unescape(video), saxutils.unescape(title))
        # Build rtmp url...
        video_url = (rtmp + ' swfUrl=' + swfUrl + ' tcUrl=' + rtmp +
                     ' playpath=' + saxutils.unescape(video))
        if Debug:
            self.LOG('DEBUG: rtmp link details.\n\trtmp: %s\n\tswfUrl: %s\n\ttcUrl: %s\n\tplaypath: %s\n\trtmpdump: %s' % \
                (rtmp, swfUrl, rtmp, saxutils.unescape(video), rtmpdump))
    elif mode == 'rtmp_daily':
        video_url = page.split('&rtmp=1')[0]
        if Debug:
            self.LOG('DEBUG: video link details.\n\turl: %s' % video_url)
    else:
        if Debug:
            self.LOG('DEBUG: no video link!')
        raise ValueError('no video link')
    # only need to add label, icon and thumbnail; setInfo() and
    # addSortMethod() take care of label2
    listitem = xbmcgui.ListItem(title, iconImage="DefaultVideo.png",
                                thumbnailImage=thumbnail)
    # set listitem information
    listitem.setInfo('video', {
        'title': title,
        'label': title,
        'plot': plot,
        'plotoutline': plot,
    })
    # Play video...
    xbmc.Player().play(video_url, listitem)

def recode(text):
    return BSS(text.encode('utf8', 'ignore'),
               convertEntities=BSS.HTML_ENTITIES)

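# Usage sketch, not from the original source: recode() returns a soup whose
# HTML entities have been decoded; printing it yields the plain text. The
# sample string below is hypothetical. Assumes BeautifulSoup 3.x.
print recode(u'caf&eacute; &amp; bar')  # prints: café & bar
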
def parse_ids(contents):
    """Define a helper function for parsing ids."""
    strainer = SoupStrainer('id')
    ids = [tag for tag in BSS(contents, parseOnlyThese=strainer)]
    return ids

def _extract_text(self, node):
    # Join all text children of the node, then run the result through the
    # entity converter and return the decoded string.
    return (BSS(' '.join(node.findAll(text=True)),
                convertEntities=BSS.HTML_ENTITIES).find(text=True))

def parse_entries(contents):
    """Define a helper function for parsing feed entries."""
    strainer = SoupStrainer('entry')
    entries = [tag for tag in BSS(contents, parseOnlyThese=strainer)]
    return entries

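# Usage sketch, not from the original source: exercises the three feed
# helpers above on a hypothetical Atom fragment. Assumes BeautifulSoup 3.x.
atom = ('<feed>'
        '<link rel="next" href="http://example.com/?page=2"/>'
        '<id>tag:example.com,2010:feed</id>'
        '<entry><id>tag:example.com,2010:1</id><title>First</title></entry>'
        '</feed>')
print parse_links(atom, 'next')[0]['href']  # http://example.com/?page=2
print parse_ids(atom)[0].string             # tag:example.com,2010:feed
print parse_entries(atom)[0].title.string   # First
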
from BeautifulSoup import BeautifulStoneSoup as BSS
import codecs
import sys
import os

# Wrap stdout so unicode output is encoded as UTF-8.
streamWriter = codecs.lookup('utf-8')[-1]
sys.stdout = streamWriter(sys.stdout)

inf = open(sys.argv[1], "rb").read()
try:
    beta = float(sys.argv[2])
except (IndexError, ValueError):
    beta = 0.1

soup = BSS(inf)
segs = soup.findAll(lambda t: t.name == u'seg')
tot_paraphrases = 0.0
tot_segs = 0.0
for seg in segs:
    tot_segs += 1
    if seg.get('complete') == 'true':
        best = seg.find(lambda p: p.name == 'best')
        ref = seg.ref.find(text=True)
        eye_dee = seg.get(u'id')
        # Collect the unique paraphrase strings from <best> and <next> tags.
        paraphrases = set([
            p.find(text=True)
            for p in seg.findAll(lambda e: e.name in [u'best', u'next'])
        ])
        tot_paraphrases += len(paraphrases)
        sys.stdout.write(ref + u' ||| ' + u' <-> '.join(paraphrases))

def tinyurl(url):
    r = Request('http://tinyurl.com/api-create.php?url=%s' % url)
    doc = urlopen(r)
    soup = BSS(doc)
    return str(soup)

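# Usage sketch, not from the original source; assumes the usual
#   from urllib2 import Request, urlopen
# imports and live network access to the tinyurl.com API.
print tinyurl('http://www.merriam-webster.com/word/index.xml')
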
def follow(self, user_id, st):
    if user_id != 0:
        self.current_page = BSS(
            self.br.open(FOLLOW_LINK % (st, user_id)).read())
        self.rand_sleep()
    return 1

def xml_submit(self, url, data):
    # POST the form data through mechanize's internal opener and parse the
    # response as XML.
    return BSS(self.br._mech_open(url, urllib.urlencode(data)).read())

        # (fragment: the enclosing loop over the raw word list is not shown)
        try:
            s = w.strip().decode('ascii')
            words.append(s)
        except Exception:
            counter += 1
    print "\t%d words contained non ascii characters and are omitted" % counter
    articles[k_word] = {}
    # the wikipedia api restricts queries to 50 titles at a time
    print "\tfound %d words in file" % len(words)
    for i in range((len(words) / 50) + 1):
        # create the query and parse it
        query = query_base % "|".join(words[(i * 50):(i * 50) + 50])
        text = myopener.open(query).read()
        soup = BSS(text, convertEntities=BSS.ALL_ENTITIES)
        cont = soup.api.query
        # collect all missing words
        missing = cont.pages.findAll(missing=True)
        all_missing.append([m['title'] for m in missing])
        # create dict containing all data from the available articles
        for page in cont.pages.findAll(missing=None):
            print 'title: ' + page['title']
            title = page['title']
            data = {}
            # check whether article was found through redirect
            if cont.redirects:
                redir = cont.redirects.findAll(to=title)

def home(self):
    self.current_page = BSS(self.br.open(HOME_SOHU).read())
    self.rand_sleep()