def play(url=common.args.url):
    finalurl = False
    playpath = False
    vid = re.compile('(VD\d*)').findall(url)[0]
    rtmpdata = 'http://cdn.abc.go.com/vp2/ws/s/contents/2002/utils/video/mov/17496/9024/%s/432?v=05040017_1' % vid
    data = common.getURL(rtmpdata)
    tree = BeautifulStoneSoup(data, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    hosts = tree.findAll('host')
    for host in hosts:
        if 'Akamai' in host['name']:
            rtmp = 'rtmpe://%s/%s' % (host['url'], host['app'])
    filenames = tree.findAll('video')
    # Pick the highest bitrate that does not exceed the user's quality setting.
    hbitrate = -1
    sbitrate = int(common.settings['quality'])
    for filename in filenames:
        if filename['src'] != '':
            bitrate = int(filename['bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                playpath = filename['src']
    if playpath:
        swfUrl = 'http://ll.static.abc.com/m/vp2/prod/flash/VP2_05040017_0_1254.swf'
        finalurl = rtmp + ' playpath=' + playpath + " swfurl=" + swfUrl + " swfvfy=true"
    else:
        # No RTMP stream found: fall back to the playlist's MRSS feed.
        plid = re.compile('(PL\d*)').findall(url)[0]
        clipurl = 'http://abc.go.com/vp2/ws/s/contents/1000/videomrss?brand=002&device=001&start=0&limit=100&fk=CATEGORIES&fv=' + plid
        data = common.getURL(clipurl)
        tree = BeautifulStoneSoup(data, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        for video in tree.findAll('item'):
            if video.find('guid').string == vid:
                finalurl = video.find('media:content')['url']
    if finalurl:
        item = xbmcgui.ListItem(path=finalurl)
        xbmcplugin.setResolvedUrl(pluginhandle, True, item)
def getFileTypes(url):  # list filetypes
    p = re.compile('/details/(.*)')
    match = p.findall(url)
    for name in match:
        temp = 'http://www.archive.org/download/' + name + '/' + name + '_files.xml'
        link = getLink(temp)
        tree = BeautifulStoneSoup(link)
        shn = tree.findAll('file', attrs={"name": re.compile('(.+?\.shn$)')})
        m3u = tree.findAll('file', attrs={"name": re.compile('(.+?\.m3u$)')})
        flac = tree.findAll('file', attrs={"name": re.compile('(.+?\.flac$)')})
        mp3 = tree.findAll('file', attrs={"name": re.compile('(.+?64kb\.mp3$)')})
        vbr = tree.findAll('file', attrs={"name": re.compile('(.+?vbr\.mp3$)')})
        if len(m3u) > 0:
            addDir('.m3u Playlists', temp, 7)
        if len(flac) > 0:
            addDir('1. Flac Files', temp, 7)
        if len(vbr) > 0:
            addDir('2. VBR mp3', temp, 7)
        if len(mp3) > 0:
            addDir('3. 64kB mp3', temp, 7)
        if len(shn) > 0:
            addDir('1. Shorten Files', temp, 7)
def scrape_results(data):
    """
    Reads the contents of results.xml. This is the file that has all the
    changing information, so this is the method that should get run to
    update the values.
    """
    if OFFLINE:
        html = file("results.xml").read()
    else:
        try:
            html = urlopen("%sresults.xml" % BASE_URL)
        except URLError:
            return data
    soup = BeautifulStoneSoup(html)
    election = soup.find('results')
    data['election'].update({'pol': election['pol'], 'clpol': election['clpol'],
                             'ts': election['ts'], 'fin': election['fin']})
    areas = soup.findAll('area')
    area = soup_to_dict(areas, 'id', ['bal', 'vot', 'pol', 'clpol'], data['area'])
    contests = soup.findAll('contest')
    cont = soup_to_dict(contests, 'id', ['bal', 'bl', 'uv', 'ov'], data['contest'])
    choices = soup.findAll('choice')
    cand = soup_to_dict(choices, 'id', ['vot', 'e'], data['choice'])
    return {'election': data['election'], 'areatype': data['areatype'],
            'area': area, 'contest': cont, 'choice': cand, 'party': data['party']}
def resetfonts(app):
    xml = """<?xml version="1.0"?><!DOCTYPE fontconfig SYSTEM "fonts.dtd">
<fontconfig>
</fontconfig>"""
    try:
        xml = open(fontconfigpath).read()
    except:
        print ""
    soup = BTS(xml)
    app.mathcs = soup.findAll('match')
    app.alias = soup.findAll('alias')
    if len(app.alias) == 0:
        xmls = open("/etc/fonts/conf.d/60-latin.conf").read()
        soups = BTS(xmls)
        app.alias = soups.findAll('alias')
    firstime = True
    for alisstem in app.alias:
        if alisstem.family.string == "serif":
            app.serif = alisstem.prefer.findAll('family')
        if alisstem.family.string == "sans-serif":
            app.sans = alisstem.prefer.findAll('family')
        if alisstem.family.string == "monospace":
            app.monospace = alisstem.prefer.findAll('family')
    app.fontbutton1.set_font_name(str(app.sans[0].string) + " 12")
    app.fontbutton2.set_font_name(str(app.sans[1].string) + " 12")
    app.fontbutton3.set_font_name(str(app.serif[0].string) + " 12")
    app.fontbutton4.set_font_name(str(app.serif[1].string) + " 12")
    app.fontbutton5.set_font_name(str(app.monospace[0].string) + " 12")
    app.fontbutton6.set_font_name(str(app.monospace[1].string) + " 12")
def xunit_violation(data):
    """XUnit violation"""
    soup = BeautifulStoneSoup(data['raw'])
    cases = soup.findAll('testcase')
    cases_count = len(cases)
    errors_count = len(soup.findAll('error'))
    failures_count = len(soup.findAll('failure'))
    data['preview'] = render_to_string('violations/xunit/preview.html', {
        'tests': cases_count,
        'pass': cases_count - errors_count - failures_count,
        'fail': errors_count + failures_count,
    })
    data['status'] = STATUS_SUCCESS\
        if (errors_count + failures_count) == 0 else STATUS_FAILED
    data['plot'] = {
        'tests': cases_count,
        'pass': cases_count - errors_count - failures_count,
        'fail': errors_count + failures_count,
    }
    data['prepared'] = render_to_string('violations/xunit/prepared.html', {
        'cases': map(_prepare_test_case, cases),
        'tests': cases_count,
        'failures': failures_count,
        'errors': errors_count,
    })
    failed_percent = (errors_count + failures_count) * 100 / cases_count
    data['success_percent'] = 100 - failed_percent
    return data
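# A minimal usage sketch for xunit_violation, assuming the Django templates
# and STATUS_* constants referenced above are importable; the XML string is
# a hypothetical two-test JUnit report, not taken from the original code.
sample = {'raw': '<testsuite><testcase name="ok"/>'
                 '<testcase name="bad"><failure/></testcase></testsuite>'}
result = xunit_violation(sample)
# result['plot'] -> {'tests': 2, 'pass': 1, 'fail': 1}
# result['success_percent'] -> 50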
def main():
    root = os.getcwdu()
    pages = os.path.join(root, 'packages')
    f = open(os.path.join(root, 'packages', 'repositories.config'))
    xml = BS(f.read())
    f.close()
    used = []
    for res in xml.findAll('repository'):
        #print res['path']
        f = open(os.path.join(root, res['path'][3:]))
        pro = BS(f.read())
        f.close()
        for p in pro.findAll('package'):
            used.append(p['id'] + '.' + p['version'])  # packages already in use
    #print used
    remove = []
    for dirp in os.listdir(pages):
        path = os.path.join(pages, dirp)
        if os.path.isdir(path) and (dirp not in used):
            print 'remove: ' + dirp
            remove.append(dirp)
    for d in remove:
        path = os.path.join(pages, d)
        cleanDir(os.path.join(pages, d))
        os.rmdir(path)
def playThePlatform():
    data = common.getURL(common.args.url)
    #mrss = urllib.unquote_plus(base64.b64decode(re.compile('{ mrss: "(.+?)",').findall(data)[0]))
    try:
        mrss = urllib.unquote_plus(base64.b64decode(re.compile('{ mrss: "(.+?)",').findall(data)[0]))
    except:
        mrss = urllib.unquote_plus(base64.b64decode(re.compile('"mrss=(.+?)&').findall(data)[0]))
    tree = BeautifulStoneSoup(mrss, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    for item in tree.findAll('item'):
        link = item.find('link').string
        if link == common.args.url:
            smil_url = item.find('media:text', text=re.compile('smilUrl=')).string.split('smilUrl=')[1]
            #smil_url = re.compile('<media:text>smilUrl=(.+?)</media:text>').findall(mrss)[0]
            #signUrl = 'http://www.history.com/components/get-signed-signature'
            #signUrl += '?url='+smil_url.split('/s/')[1].split('?')[0]
            #signUrl += '&cache='+str(random.randint(100, 999))
            #sig = str(common.getURL(signUrl))
            sig = sign_url(smil_url)
            smil_url += '&sig=' + sig
            data = common.getURL(smil_url)
            tree = BeautifulStoneSoup(data, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            rtmp_base = tree.find('meta')['base']
            filenames = tree.findAll('video')
            # Pick the highest bitrate that fits the user's quality setting.
            hbitrate = -1
            sbitrate = int(common.settings['quality'])
            for filename in filenames:
                bitrate = int(filename['system-bitrate']) / 1024
                if bitrate > hbitrate and bitrate <= sbitrate:
                    hbitrate = bitrate
                    playpath = filename['src']
            swfUrl = 'http://www.aetv.com/js/minisite4g/VideoPlayer.swf'
            rtmpurl = rtmp_base + ' playpath=' + playpath + " swfurl=" + swfUrl + " swfvfy=true"
            item = xbmcgui.ListItem(path=rtmpurl)
            xbmcplugin.setResolvedUrl(pluginhandle, True, item)
def search(self, terms):
    torrents = []
    data = {'SearchString': '', 'SearchString1': terms, 'search': 'Search'}
    req = Request(self.search_uri, urlencode(data))
    req.add_header('User-Agent', self.user_agent)
    f = urlopen(req)
    soup = BeautifulStoneSoup(f.read())
    for (c, item) in enumerate(soup.findAll('a', {'class': 'magnet'})):
        if c == 30:
            break
        info = item.findPrevious('a')
        link = self.uri_prefix + info['href']
        item_req = Request(link)
        item_req.add_header('User-Agent', self.user_agent)
        item_f = urlopen(item_req)
        item_soup = BeautifulStoneSoup(item_f.read())
        sp = item_soup.findAll('span', {'class': re.compile('^stat_')})
        if sp:
            sp = [int(i.text.replace(',', '')) for i in sp]
        else:
            sp = [0, 0]
        torrents.append({
            'url': item['href'],
            'name': info.text.encode('utf-8'),
            'seeds': sp[0],
            'leechers': sp[1]
        })
    return torrents
def loadJATSSentence(self, s, newDocument, par_id, section_id):
    """
    Loads a JATS sentence (ready split)

    :param s: the plain text of the sentence (with all tags inside, e.g. <xref>)
    :param newDocument: SciDoc
    :param par_id: id of the paragraph containing this sentence
    :param section_id: id of the section containing the paragraph
    """
    newSent = newDocument.addSentence(par_id, "")
    s_soup = BeautifulStoneSoup(s)

    refs = s_soup.findAll("xref", {"ref-type": "bibr"})
    citations_found = []
    for r in refs:
        citations_found.extend(self.loadJATSCitation(r, newSent["id"], newDocument, section=section_id))

    non_refs = s_soup.findAll(lambda tag: tag.name.lower() == "xref"
                              and tag.has_key("ref-type")
                              and tag["ref-type"].lower() != "bibr")
    for nr in non_refs:
        nr.name = "inref"

    newSent["citations"] = [acit["id"] for acit in citations_found]
    # TODO replace <xref> tags with <cit> tags
    newSent["text"] = newDocument.extractSentenceTextWithCitationTokens(s_soup, newSent["id"])
##    print(newSent["text"])
    # deal with many citations within characters of each other: make them know they are a cluster
    # TODO cluster citations? Store them in some other way?
    newDocument.countMultiCitations(newSent)
def Play(self, stream_name, stream_id, subtitle):
    id = re.compile('video\/(.*?)-').search(str(stream_id)).group(1)
    url = 'http://eredivisielive.nl/content/playlist/website/%s_ere_lr.xml' % (id,)
    data = tools.urlopen(self.app, url)
    soup = BeautifulStoneSoup(data, convertEntities=BeautifulSoup.XML_ENTITIES, smartQuotesTo="xml")

    domain = soup.findAll('videodock:streamer')[0].contents[0]
    media = soup.findAll('media:content')
    quality = []
    files = {}
    for i in media:
        quality.append(int(i['bitrate']))
        files[int(i['bitrate'])] = i['url']
    quality = sorted(quality)
    url = 'http://www.bartsidee.nl/flowplayer/player.php?url=' + str(domain) + '&clip=mp4:' + str(files[quality.pop()])

    play = CreatePlay()
    play.content_type = 'video/x-flv'
    play.path = quote_plus(url)
    play.domain = 'bartsidee.nl'
    return play
def inlines(value, return_list=False):
    try:
        from BeautifulSoup import BeautifulStoneSoup
    except ImportError:
        from beautifulsoup import BeautifulStoneSoup
    content = BeautifulStoneSoup(value, selfClosingTags=['inline', 'img', 'br',
                                                         'input', 'meta', 'link', 'hr'])
    # Return a list of inline objects found in the value.
    if return_list:
        inline_list = []
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            inline_list.append(rendered_inline['context'])
        return inline_list
    # Replace inline markup in the value with rendered inline templates.
    else:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            if rendered_inline:
                inline_template = render_to_string(rendered_inline['template'],
                                                   rendered_inline['context'])
            else:
                inline_template = ''
            value = value.replace(str(inline), inline_template)
        return mark_safe(unicode(value))
def parse_config(file_to_read):
    parsed = BeautifulStoneSoup(open(file_to_read).read())
    adapters = parsed.findAll('adapter')
    if not adapters:
        adapters = parsed.findAll('interface')
    host_tag = parsed.find('hostname')
    if host_tag:
        host_name = host_tag.string.lower()
    else:
        host_name = None
    domain_tag = parsed.find('domainname')
    if domain_tag:
        domain_name = domain_tag.string
        if domain_name:
            domain_name = domain_name.lower()
    else:
        domain_name = None
    ip_list = []
    for adapter in adapters:
        mac = (adapter.find('address').string
               if adapter.find('address') else None)
        if mac:
            mac = mac.replace('-', ':').lower()
        adapter_ips = adapter.findAll('adapterip')
        for adapter_ip_node in adapter_ips:
            if not adapter_ip_node:
                continue
            ip = ''
            for ip_address in adapter_ip_node.find('ip'):
                ip = ip_address.string.strip()
            if not ip:
                continue
            info = {'host_name': host_name,
                    'domain_name': domain_name,
                    'ip_address': ip,
                    'mac_address': mac}
            if ((info not in ip_list) and (ip != '127.0.0.1')
                    and (':' not in ip)):
                ip_list.append(info)
    return ip_list
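# Hypothetical usage sketch (the config filename is an assumption): each
# entry pairs the host/domain names with one non-loopback IPv4 address.
for entry in parse_config('network_config.xml'):
    print entry['ip_address'], entry['mac_address'], entry['host_name']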
def search(self, terms):
    torrents = []
    data = {'SearchString': '', 'SearchString1': terms, 'search': 'Search'}
    req = Request(self.search_uri, urlencode(data))
    req.add_header('User-Agent', self.user_agent)
    f = urlopen(req)
    soup = BeautifulStoneSoup(f.read())
    for (c, item) in enumerate(soup.findAll('a', {'class': 'magnet'})):
        if c == 30:
            break
        info = item.findPrevious('a')
        link = self.uri_prefix + info['href']
        item_req = Request(link)
        item_req.add_header('User-Agent', self.user_agent)
        item_f = urlopen(item_req)
        item_soup = BeautifulStoneSoup(item_f.read())
        sp = item_soup.findAll('span', {'class': re.compile('^stat_')})
        if sp:
            sp = [int(i.text.replace(',', '')) for i in sp]
        else:
            sp = [0, 0]
        torrents.append({
            'url': item['href'],
            'name': info.text,
            'seeds': sp[0],
            'leechers': sp[1]
        })
    return torrents
def index(self):
    # Get a file-like object for the feed.
    f = urllib.urlopen("http://thefmly.com/feed")
    d = feedparser.parse("http://feedparser.org/docs/examples/atom10.xml")
    # f = urllib.urlopen("http://www.visitation-rites.com/audio/")
    # Read from the object, storing the page's contents in 's'.
    s = f.read()
    f.close()
    e = d.entries[0].date
    print e
    # IF XML
    soup = BeautifulStoneSoup(s)
    posts = soup.findAll('item')
    bundles = []
    print soup
    print soup.findAll("pubdate")
    for post in posts:
        print post.pubdate
        postsoup = BeautifulSoup(str(post.contents))
        mp3s = postsoup.findAll(url=re.compile(".mp3$"))  # [0]['href']
        # if post.has_key('pubDate'):
        #     print post['pubDate']
        # else:
        #     print "nope"
        # print postsoup.find("link")  # [0]['href']
        if (len(mp3s)):
            # print post.get("pubDate")
            songs = []
            for song in mp3s:
                songs.append(song.get('url').encode('utf8'))
            bundles.append({"link": post.link.string.encode('utf8'),
                            "title": post.title.string.encode('utf8'),
                            "mp3s": songs})
    return dict(songs=bundles)
def index(self):
    # Get a file-like object for the feed.
    f = urllib.urlopen("http://thefmly.com/feed")
    # Read from the object, storing the page's contents in 's'.
    s = f.read()
    f.close()
    soup = BeautifulStoneSoup(s)
    posts = soup.findAll('item')
    bundles = []
    for post in posts:
        content = str(post.contents).replace("<![CDATA[", "")
        postsoup = BeautifulStoneSoup(content)
        # print postsoup.prettify()
        mp3s = postsoup.findAll(url=re.compile(".mp3$"))
        # mp3s = postsoup.findAll(text=re.compile(".mp3$"))
        encoded = postsoup.findAll(re.compile(":encoded$"))
        # print encoded
        contentsoup = BeautifulSoup(str(encoded))
        # print contentsoup
        mp3s = contentsoup.findAll('a', href=re.compile(".mp3$"))
        # print len(mp3s)
        if (len(mp3s)):
            songs = []
            for song in mp3s:
                # print song.contents
                songs.append([song.get('href').encode('utf8'),
                              song.string.encode('utf8')])
            bundles.append({"date": post.pubdate,
                            "description": post.description.string.encode('utf8'),
                            "link": post.link.string.encode('utf8'),
                            "title": post.title.string.encode('utf8'),
                            "mp3s": songs})
    return dict(songs=bundles)
def svg(self):
    xml = ''.join(l for l in file('/home/bteam/Dropbox/mm/bkz/striped.svg').readlines())
    soup = BeautifulStoneSoup(xml, selfClosingTags=[u'polygon', u'polyline', u'path', u'line', u'rect'])
    # for name in [u'polygon', u'polyline', u'text', u'path', u'line', u'rect']:
    tags = soup.findAll()
    # print name.upper()
    css = {'text': 'color:#000;fill:'}
    # Group identical attribute values; a value used more than 10 times is
    # promoted to a CSS class and stripped from the individual elements.
    for k, g in groupby(sorted(chain.from_iterable(t.attrs for t in tags)), key=lambda x: str(x[0])):
        if 'y' not in k and 'x' not in k and k not in ('d', 'transform', 'points'):
            c = Counter(v for k, v in g)
            i = 0
            for ke, v in c.items():
                i += 1
                if v > 10:
                    style = u'%s:%s' % (k, ke)
                    css[u'.%s%d' % (k[:3] + k[-3:], i)] = style
                    for j, t in enumerate(soup.findAll(attrs={k: ke})):
                        del t[k]
                        t['class'] = t.get('class', u'') + u' %s%d' % (k[:3] + k[-3:], i)
    for t in soup.findAll('text'):
        t['fill'] = '#000'
    new = file('bkz.html', 'w')
    style = unicode(u'\n'.join(u'%s {%s}' % (k, v) for k, v in css.items()))
    new.write('<html> <head> <title>BKZ</title> <style>')
    new.write(style)
    new.write('</style> </head> <body>')
    new.write(soup.prettify())
    new.write('</body> </html>')
    new.close()
def getSkills(self, db, name, id, vCode):
    accountCharacters = "http://api.eveonline.com/account/Characters.xml.aspx"
    charSheet = "http://api.eveonline.com/char/CharacterSheet.xml.aspx"
    print "Processing %s" % name
    params = urllib.urlencode({"keyID": id, "vCode": vCode})
    f = urllib.urlopen(accountCharacters + "?" + params)
    data = f.read()
    f.close()
    soup = BeautifulStoneSoup(data)
    r = soup.findAll("row", {"name": unicode(name)})
    if len(r) == 0:
        return (1, "Character not found")
    corp = r[0]["corporationname"]
    charid = r[0]["characterid"]
    params = urllib.urlencode({"keyID": id, "vCode": vCode, "characterID": charid})
    f = urllib.urlopen(charSheet + "?" + params)
    data = f.read()
    f.close()
    soup = BeautifulStoneSoup(data)
    error = soup.findAll("error")
    if len(error):
        print "Error"
    skills = str(soup.findAll("rowset", {"name": "skills"})[0]).split("\n")
    skills = map(lambda x: x.replace("</row>", ""), skills)
    skills = filter(lambda x: x.startswith("<row "), skills)
    skills = map(lambda x: skillstripper.match(x).groups(), skills)
    print len(skills)
    for t, l in skills:
        t = int(t)
        l = int(l)
        r = db.execute('INSERT INTO skills (name, typeid, level) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE level=%s',
                       (name, t, l, l))
def inlines(value, return_list=False):
    try:
        from BeautifulSoup import BeautifulStoneSoup
    except ImportError:
        from beautifulsoup import BeautifulStoneSoup
    content = BeautifulStoneSoup(
        value,
        selfClosingTags=['inline', 'img', 'br', 'input', 'meta', 'link', 'hr'])
    inline_list = []
    if return_list:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            inline_list.append(rendered_inline['context'])
        return inline_list
    else:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            if rendered_inline:
                inline.replaceWith(
                    BeautifulStoneSoup(
                        render_to_string(rendered_inline['template'],
                                         rendered_inline['context'])))
            else:
                inline.replaceWith(BeautifulStoneSoup(''))
        return mark_safe(content)
def search(self, terms):
    torrents = []
    url = self.search_uri % quote_plus(terms)
    try:
        f = requests.get(url)
    except:
        raise Exception("something wrong")
    if f.status_code != requests.codes.ok:
        f.raise_for_status()
    soup = BeautifulStoneSoup(f.text)
    for item in soup.findAll("item"):
        item_quality = item.link.text.rpartition("_")[2]
        try:
            item_f = requests.get(item.link.text)
        except:
            raise Exception("something wrong")
        if item_f.status_code != requests.codes.ok:
            item_f.raise_for_status()
        item_soup = BeautifulStoneSoup(item_f.text)
        qualities = [s.text.strip() for s in item_soup.findAll("span", {"class": re.compile("^tech-quality")})]
        q_index = qualities.index(item_quality)
        span = item_soup.findAll("span", {"title": "Peers and Seeds"})[q_index]
        ps_pos = len(span.parent.contents) - 1
        ps = span.parent.contents[ps_pos].split("/")
        torrents.append(
            {"url": item.enclosure["url"], "name": item.title.text, "seeds": int(ps[1]), "leechers": int(ps[0])}
        )
    return torrents
def search(self, terms):
    torrents = []
    url = self.search_uri % quote_plus(terms)
    try:
        f = requests.get(url, headers=self.headers)
    except:
        raise Exception("something wrong")
    if f.status_code != requests.codes.ok:
        f.raise_for_status()
    soup = BeautifulStoneSoup(f.text)
    for (c, item) in enumerate(soup.findAll("a", {"class": "magnet"})):
        if c == 30:
            break
        info = item.findPrevious("a")
        link = self.search_uri % quote_plus(info["href"])
        try:
            item_f = requests.get(link, headers=self.headers)
        except:
            raise Exception("something wrong")
        if item_f.status_code != requests.codes.ok:
            item_f.raise_for_status()
        item_soup = BeautifulStoneSoup(item_f.text)
        sp = item_soup.findAll("span", {"class": re.compile("^stat_")})
        if sp:
            sp = [int(i.text.replace(",", "")) for i in sp]
        else:
            sp = [0, 0]
        torrents.append({"url": item["href"], "name": info.text, "seeds": sp[0], "leechers": sp[1]})
    return torrents
def parse_sitemap(url_string, url_links=None):
    try:
        if not url_links:
            url_links = []
        if 'tag' not in url_string:
            r = requests.get(url_string, timeout=10)
            # html = urllib2.urlopen(url_string)
            # if 200 != resp.status_code:
            #     return []
            soup = Soup(r.text)
            sitemap = soup.findAll('sitemapindex')
            url_set = soup.findAll('urlset')
            locs = []
            if sitemap:
                locs = [s.string for s in soup.findAll('loc')]
                for l in locs:
                    url_links = parse_sitemap(l, url_links)
            elif url_set:
                urls = soup.findAll('url')
                if urls:
                    fir_url = urls[0].find('loc').string
                    if is_article_url(fir_url):
                        for u in urls:
                            link = u.find('loc').string
                            img_src = [img.find('image:loc').string
                                       for img in u.findAll('image:image')]
                            url_links.append({'link': link, 'images': img_src})
    except Exception as e:
        print("Error occurred during parse sitemap", e)
    return url_links
def play(url=common.args.url):
    finalurl = False
    playpath = False
    vid = re.compile('(VD\d*)').findall(url)[0]
    rtmpdata = 'http://cdn.abc.go.com/vp2/ws/s/contents/2003/utils/video/mov/13/9024/%s/432?v=06000007_3' % vid
    data = common.getURL(rtmpdata)
    tree = BeautifulStoneSoup(data, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    hosts = tree.findAll('host')
    for host in hosts:
        if host['name'] == 'L3':
            rtmp = 'rtmp://%s/%s' % (host['url'], host['app'])
    filenames = tree.findAll('video')
    hbitrate = -1
    sbitrate = int(common.settings['quality'])
    for filename in filenames:
        if filename['src'] != '':
            bitrate = int(filename['bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                playpath = filename['src']
    if playpath:
        swfUrl = 'http://livepassdl.conviva.com/ver/2.27.0.42841/LivePassModuleMain.swf'
        finalurl = rtmp + ' playpath=' + playpath + " swfurl=" + swfUrl + " swfvfy=true"
    else:
        plid = re.compile('(PL\d*)').findall(url)[0]
        clipurl = 'http://abc.go.com/vp2/ws/s/contents/1000/videomrss?brand=001&device=001&start=0&limit=100&fk=CATEGORIES&fv=' + plid
        data = common.getURL(clipurl)
        tree = BeautifulStoneSoup(data, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        for video in tree.findAll('item'):
            if video.find('guid').string == vid:
                finalurl = video.find('media:content')['url']
    if finalurl:
        item = xbmcgui.ListItem(path=finalurl)
        xbmcplugin.setResolvedUrl(pluginhandle, True, item)
def search(self, terms):
    torrents = []
    url = self.search_uri % quote(terms, '')
    f = urlopen(url)
    soup = BeautifulStoneSoup(f.read())
    for item in soup.findAll('item'):
        item_quality = item.link.text.rpartition('_')[2]
        item_f = urlopen(item.link.text)
        item_soup = BeautifulStoneSoup(item_f.read())
        qualities = [
            s.text.strip() for s in item_soup.findAll(
                'span', {'class': re.compile('^tech-quality')})
        ]
        q_index = qualities.index(item_quality)
        span = item_soup.findAll('span', {'title': 'Peers and Seeds'})[q_index]
        ps_pos = len(span.parent.contents) - 1
        ps = span.parent.contents[ps_pos].split('/')
        torrents.append({
            'url': item.enclosure['url'],
            'name': item.title.text,
            'seeds': int(ps[1]),
            'leechers': int(ps[0])
        })
    return torrents
def open(self, book_id=None):
    if book_id:
        self.book_id = book_id
    if not self.book_id:
        raise Exception('Book id not set')
    self.f = zipfile.ZipFile(self._FILE % self.book_id, 'r')
    soup = BeautifulStoneSoup(self.f.read('META-INF/container.xml'))
    oebps = soup.findAll('rootfile')[0]['full-path']
    folder = oebps.rfind(os.sep)
    self.oebps_folder = '' if folder == -1 else oebps[:folder + 1]  # find the OEBPS folder name
    oebps_content = self.f.read(oebps)
    self.read_doc_props(oebps_content)
    opf_bs = BeautifulStoneSoup(oebps_content)
    ncx = opf_bs.findAll('item', {'id': 'ncx'})[0]
    ncx = self.oebps_folder + ncx['href']  # build the full path to the ncx file
    ncx_bs = BeautifulStoneSoup(self.f.read(ncx))
    self.chapters = [
        (nav.navlabel.text, nav.content['src'])
        for nav in ncx_bs.findAll('navmap')[0].findAll('navpoint')
    ]
def inlines(value, return_list=False, clear_inlines=False):
    try:
        from BeautifulSoup import BeautifulStoneSoup
    except ImportError:
        from beautifulsoup import BeautifulStoneSoup
    content = BeautifulStoneSoup(value, selfClosingTags=['inline', 'img', 'br',
                                                         'input', 'meta', 'link', 'hr'])
    inline_list = []
    if return_list:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            inline_list.append(rendered_inline['context'])
        return inline_list
    else:
        for inline in content.findAll('inline'):
            if not clear_inlines:
                rendered_inline = render_inline(inline)
            else:
                rendered_inline = None
            if rendered_inline:
                inline.replaceWith(render_to_string(rendered_inline['template'],
                                                    rendered_inline['context']))
            else:
                inline.replaceWith('')
        return mark_safe(content)
def prepare_data(file_name):
    file = open(file_name)
    soup = BeautifulStoneSoup(file.read())
    text_collection = []
    for node in soup.findAll("text"):
        temp = []
        for sentence in node.findAll("sentence"):
            temp.append(Sentence(sentence))
        text_collection.append(temp)
    hypo_collection = []
    for node in soup.findAll("hypothesis"):
        temp = []
        for sentence in node.findAll("sentence"):
            temp.append(Sentence(sentence))
        hypo_collection.append(temp)
    # print len(text_collection[0]), len(hypo_collection[0])
    data_set = []
    for i in range(len(text_collection)):
        H_T_pair = [text_collection[i], hypo_collection[i]]
        data_set.append(H_T_pair)
    return data_set
def initial_read():
    """
    Reads the contents of ElectionEvent.xml. This file should not change
    during the election, so should only need to be read once. In case
    results are not yet available, it also zeroes out data.
    """
    if OFFLINE:
        html = file("ElectionEvent.xml").read()
    else:
        try:
            html = urlopen("%sElectionEvent.xml" % BASE_URL)
        except URLError:
            return None
    soup = BeautifulStoneSoup(html)
    election = soup.find('election')
    elect = {'nm': election['nm'], 'des': election['des'],
             'jd': election['jd'], 'ts': election['ts'], 'pol': 0, 'clpol': 0}
    areatypes = soup.findAll('areatype')
    atyp = soup_to_dict(areatypes, 'id', ['nm', 's', 'id'])
    areas = soup.findAll('area')
    area = soup_to_dict(areas, 'id', ['nm', 'atid', 'el', 's', 'id'])
    contests = soup.findAll('contest')
    contest = soup_to_dict(contests, 'id', ['nm', 'aid', 'el', 's', 'id'])
    parties = soup.findAll('party')
    party = soup_to_dict(parties, 'id', ['nm', 'ab', 's', 'id'])
    choices = soup.findAll('choice')
    choice = soup_to_dict(choices, 'id', ['nm', 'conid', 's', 'id'])
    return {'election': elect, 'areatype': atyp, 'area': area,
            'contest': contest, 'choice': choice, 'party': party}
def parse(cls_, file_handle, fail_fast=True):
    '''
    parse is the main entry point for an OfxParser. It takes a file
    handle and an optional fail_fast flag.

    If fail_fast is True, the parser will fail on any errors.
    If fail_fast is False, the parser will log poor statements in the
    statement class and continue to run. Note: the library does not
    guarantee that no exceptions will be raised to the caller, only
    that statements will include bad transactions (which are marked).
    '''
    cls_.fail_fast = fail_fast

    if isinstance(file_handle, type('')):
        raise RuntimeError(u"parse() takes in a file handle, not a string")

    ofx_obj = Ofx()

    # Store the headers
    ofx_file = OfxFile(file_handle)
    ofx_obj.headers = ofx_file.headers
    ofx_obj.accounts = []

    ofx = BeautifulStoneSoup(ofx_file.fh)
    if len(ofx.contents) == 0:
        raise OfxParserException('The ofx file is empty!')
    stmtrs_ofx = ofx.findAll('stmtrs')
    if stmtrs_ofx:
        ofx_obj.accounts += cls_.parseStmtrs(stmtrs_ofx, AccountType.Bank)
    ccstmtrs_ofx = ofx.findAll('ccstmtrs')
    if ccstmtrs_ofx:
        ofx_obj.accounts += cls_.parseStmtrs(
            ccstmtrs_ofx, AccountType.CreditCard)
    invstmtrs_ofx = ofx.findAll('invstmtrs')
    if invstmtrs_ofx:
        ofx_obj.accounts += cls_.parseInvstmtrs(invstmtrs_ofx)
    seclist_ofx = ofx.find('seclist')
    if seclist_ofx:
        ofx_obj.security_list = cls_.parseSeclist(seclist_ofx)
    else:
        ofx_obj.security_list = None
    acctinfors_ofx = ofx.find('acctinfors')
    if acctinfors_ofx:
        ofx_obj.accounts += cls_.parseAcctinfors(acctinfors_ofx, ofx)
    fi_ofx = ofx.find('fi')
    if fi_ofx:
        for account in ofx_obj.accounts:
            account.institution = cls_.parseOrg(fi_ofx)
    if ofx_obj.accounts:
        ofx_obj.account = ofx_obj.accounts[0]

    return ofx_obj
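# A minimal usage sketch, assuming this is the classmethod OfxParser.parse
# from the ofxparse library; the statement filename is hypothetical.
fh = open('statement.ofx', 'rb')
ofx = OfxParser.parse(fh, fail_fast=False)
print len(ofx.accounts), 'account(s) parsed'
fh.close()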
def getEpisodes(self, combinedShowID, proxy):
    # A bit hacky ...
    # Split into "title" style id and URL
    splitString = re.findall('(.*)__(.*)', combinedShowID)
    titleShowID = str(splitString[0][0])
    showId = str(splitString[0][1]).split('=')[-1]
    # Check if proxy enabled & set
    if proxy['proxy'] == True and proxy['proxy_address'] != "" and proxy['proxy_port'] != 0:
        # Set the proxy information
        if proxy['proxy_type'] == 'HTTP':
            socks.setdefaultproxy(socks.PROXY_TYPE_HTTP, proxy['proxy_address'], proxy['proxy_port'])
        elif proxy['proxy_type'] == 'SOCKS4':
            socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4, proxy['proxy_address'], proxy['proxy_port'])
        elif proxy['proxy_type'] == 'SOCKS5':
            socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, proxy['proxy_address'], proxy['proxy_port'])
        # Wrap urllib2 module
        socks.wrapmodule(urllib2)
    page = urllib2.urlopen(PROGRAMME_URL + titleShowID)
    soup = BeautifulStoneSoup(page, selfClosingTags=['link', 'category', 'media:player', 'media:thumbnail'])
    page.close()
    items = soup.findAll('entry')
    if not items:
        # That works for some things, but not this show ...
        # OK, that didn't work. Try using the ID to search for episodes
        urlShowID = EPISODE % (showId)
        page = urllib2.urlopen(urlShowID)
        soup = BeautifulStoneSoup(page, selfClosingTags=['link', 'category', 'media:player', 'media:thumbnail'])
        page.close()
        items = soup.findAll('entry')
    for item in items:
        # This finds the entire element ... get the bit we want
        linkElement = item.find(attrs={'type': 'application/atom+xml'})
        mymatch = re.findall('href="(.*)"', str(linkElement))
        title = self.getStringFor(item, 'title')
        published = self.getStringFor(item, 'published')
        desc = self.getStringFor(item, 'media:description')
        thumb = self.getStringFor(item, 'media:thumbnail', 'url', LOGOICON)
        # Duration doesn't seem to make any difference ...
        duration = str(int(self.getStringFor(item, 'rte:duration', 'ms')) / 1000 / 60)
        yield {
            'PlotOutline': title,
            'Duration': duration,
            'Studio': CHANNEL,
            'Year': int("%s" % (published[:4])),
            'Date': "%s-%s-%s" % (published[8:10], published[5:7], published[:4]),
            'Thumb': thumb,
            'Channel': CHANNEL,
            'url': mymatch[0],
            'Title': title,
            'mode': MenuConstants.MODE_PLAYVIDEO,
            'Plot': desc
        }
def genAlarmNewsText():
    newstemptext = ''
    CNNrssTop = requests.get('http://rss.cnn.com/rss/cnn_topstories.rss')
    CNNrssTop = BeautifulStoneSoup(CNNrssTop.text)
    newstemptext = newstemptext + "Today's top headlines are \n" + str(CNNrssTop.findAll('title')[2]).split('>')[1].split('<')[0] + '\n'
    newstemptext = newstemptext + 'with the description \n' + str(CNNrssTop.findAll('description')[2]).split('>')[1].split('&')[0] + '\n'
    newstemptext = newstemptext + 'and \n' + str(CNNrssTop.findAll('title')[3]).split('>')[1].split('<')[0] + '\n'
    newstemptext = newstemptext + 'with the description \n' + str(CNNrssTop.findAll('description')[3]).split('>')[1].split('&')[0] + '\n'
    newstemptext = newstemptext + '\n'
    return newstemptext
def handle_noargs(self, **options):
    month_names = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                   'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    for member in Members.objects.all():
        adjustedName = member.full_name.replace(' ', '+')
        page = BS(urllib2.urlopen("http://news.google.com/news?q=%s&output=rss" % adjustedName))
        newsmp = MPNews(link1=page.findAll('link', limit=1))
        for item in page.findAll('item'):
            # Typical RSS date: "Mon, 02 Jan 2006 15:04:05 GMT"
            itemDate = item['pubDate'].split(' ')
            pubMonth = month_names[itemDate[2]]
            pubTime = itemDate[4].split(':')
            newarticle = MPNewsArticle(
                title=item['title'],
                link=item['link'],
                description=item['description'],
                pubDate=datetime.datetime(
                    int(itemDate[3]), pubMonth, int(itemDate[1]),
                    int(pubTime[0]), int(pubTime[1]), int(pubTime[2])  # GMT though
                ))
            newarticle.save()
            newsmp.articles.add(newarticle)
        newsmp.save()
        member.news.add(newsmp)
        member.save()
def get_updates(since, for_series_ids=None):
    """Returns all updates since 'since', optionally filtering on series id"""
    if isinstance(since, datetime):
        since = time.mktime(since.timetuple())
    now = time.time()
    # Choose the smallest update archive that covers the requested window.
    if now - since > ONE_DAY * 30:
        interval = "all"
    elif now - since > ONE_DAY * 7:
        interval = "month"
    elif now - since > ONE_DAY:
        interval = "week"
    else:
        interval = "day"
    url = "%sapi/%s/updates/updates_%s.zip" % (BASE_URL, API_KEY, interval)
    filename, headers = urllib.urlretrieve(url)
    zf = ZipFile(file(filename))
    soup = BeautifulStoneSoup(zf.read("updates_%s.xml" % (interval,)))
    last_update = int(soup.data["time"])
    soup = soup.data

    def _parse_series_update(soup):
        d = dict(id=_g(soup, "id", int), time=_g(soup, "time", int))
        if d["time"] > since and (for_series_ids is None or d["id"] in for_series_ids):
            return d
        return None

    def _parse_episode_update(soup):
        d = dict(id=_g(soup, "id", int), series=_g(soup, "series", int), time=_g(soup, "time", int))
        if d["time"] > since and (for_series_ids is None or d["series"] in for_series_ids):
            return d
        return None

    def _parse_banner_update(soup):
        d = dict(
            series=_g(soup, "series", int),
            format=_g(soup, "format"),
            language=_g(soup, "language"),
            time=_g(soup, "time", int),
            path=_g(soup, "path"),
            type=_g(soup, "type"),
        )
        if d["time"] > since and (for_series_ids is None or d["series"] in for_series_ids):
            return d
        return None

    def _for_series(id):
        return for_series_ids is None or id in for_series_ids

    return dict(
        series=filter(None, [_parse_series_update(s) for s in soup.findAll("series", recursive=False)]),
        banners=filter(None, [_parse_banner_update(b) for b in soup.findAll("banner", recursive=False)]),
        episodes=filter(None, [_parse_episode_update(e) for e in soup.findAll("episode", recursive=False)]),
    )
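# Hypothetical usage sketch: fetch everything changed in the last two days
# for two made-up series ids; get_updates accepts a datetime directly.
from datetime import timedelta
two_days_ago = datetime.now() - timedelta(days=2)
updates = get_updates(two_days_ago, for_series_ids=set([70327, 73739]))
print len(updates['episodes']), 'episode updates'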
def _getCourseListing(self):
    xml = urllib2.urlopen(ebRss)
    soup = BeautifulStoneSoup(xml)
    tags = soup.findAll('link')
    eids = []
    courses = {}
    global venue
    for tag in tags:
        match = re.search(r"(event/)(\d+)(/rss)", str(tag))
        if match:
            print "Found EventBrite ID %s : %s" % (match.group(2), str(tag))
            eids.append(match.group(2))
    for eid in eids:
        print "Querying EventBrite API for %s" % (eid)
        xml = urllib2.urlopen('https://www.eventbrite.com/xml/event_get?app_key=%s&id=%s' % (appkey, eid))
        soup = BeautifulStoneSoup(xml)
        startdate = self._fixText(soup.find('start_date'))
        enddate = self._fixText(soup.find('end_date'))
        title = self._fixText(soup.find('title'))
        #desc = self._fixText(soup.find('description'))
        if not venue:
            venueXML = soup.find('venue')
            name = str(venueXML.find('name'))
            address = str(venueXML.find('address'))
            address2 = str(venueXML.find('address_2'))
            city = str(venueXML.find('city'))
            region = str(venueXML.find('region'))
            zip = str(venueXML.find('postal_code'))
            list = [name, address, address2, city, region]
            venue = self._fixText(", ".join(list) + " " + zip)
            print "Setting Venue: " + venue
        urls = soup.findAll('url')
        url = ""
        for addr in urls:
            m = re.search(r"\d+", str(addr))
            if m:
                url = self._fixText(addr)
        startdate = time.gmtime(time.mktime(time.strptime(startdate, "%Y-%m-%d %H:%M:%S")))
        enddate = time.gmtime(time.mktime(time.strptime(enddate, "%Y-%m-%d %H:%M:%S")))
        desc = '<a href="%s">Click Here</a> for more info.' % (url)
        thisCourse = {'title': title, 'desc': desc, 'startdate': startdate,
                      'enddate': enddate, 'url': url}
        courses[eid] = thisCourse
    return courses
def __init__(self):
    url = 'http://' + g_host + ':32400/servers'
    html = self.getURL(url)
    tree = BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    LibraryTags = tree.findAll('server')
    print tree
    print LibraryTags
    Servers = []
    Sections = []
    count = 0
    for object in LibraryTags:
        name = object.get('name')
        host = object.get('host')
        Servers.append([name, host])
    for server in Servers:
        url = 'http://' + server[1] + ':32400/library/sections'
        html = self.getURL(url)
        tree = BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        SectionTags = tree.findAll('directory')
        for countsections, object in enumerate(SectionTags):
            key = object.get('key')
            name = object.get('title')
            type = object.get('type')
            if type == 'show':
                window = "VideoLibrary"
                mode = 1
            if type == 'movie':
                window = "VideoLibrary"
                mode = 2
            if type == 'artist':
                window = "MusicLibrary"
                mode = 3
            try:
                art = 'http://' + server[1] + ':32400' + object.get('art')
            except:
                art = ""
            s_url = 'http://' + server[1] + ':32400/library/sections/' + key + '/all'
            count = count + 1
            self.WINDOW.setProperty("PlexbmcSection.%d.Title" % (count), name)
            self.WINDOW.setProperty("PlexbmcSection.%d.Path" % (count), s_url)
            self.WINDOW.setProperty("PlexbmcSection.%d.Mode" % (count), str(mode))
            self.WINDOW.setProperty("PlexbmcSection.%d.Window" % (count), window)
            self.WINDOW.setProperty("PlexbmcSection.%d.Fanart" % (count), art)
    try:
        for i in range(count + 1, int(self.WINDOW.getProperty("PlexbmcSection.Count")) + 1):
            self.WINDOW.clearProperty("PlexbmcSection.%d.Title" % (i))
            self.WINDOW.clearProperty("PlexbmcSection.%d.Path" % (i))
            self.WINDOW.clearProperty("PlexbmcSection.%d.Mode" % (i))
            self.WINDOW.clearProperty("PlexbmcSection.%d.Window" % (i))
            self.WINDOW.clearProperty("PlexbmcSection.%d.Fanart" % (i))
    except:
        pass
    self.WINDOW.setProperty("PlexbmcSection.Count", str(count))
def __init__(self, txt):
    if not (txt[:100].find("<?xml") >= 0):
        raise ValueError("Not an XML stream.")
    soup = BeautifulStoneSoup(txt, convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
    self[u'title'] = getStr(soup.find('dc:title'))
    self[u'title sort'] = getAttr(soup.find('meta', attrs={'name': 'calibre:title_sort'}), 'content')
    authors = (soup.findAll('dc:creator', attrs={'opf:role': 'aut'})
               or soup.findAll('dc:creator', attrs={'role': 'aut'}))
    self[u'authors'] = None
    self[u'author sort'] = None
    if authors and len(authors):
        self[u'authors'] = u" & ".join([x for x in [getStr(author) for author in authors] if x is not None])
        self[u'author sort'] = (getAttr(authors[0], 'opf:file-as')
                                or getAttr(authors[0], 'file-as'))
    self[u'publication date'] = formatDate(getStr(soup.find('dc:date')))
    self[u'publisher'] = getStr(soup.find('dc:publisher'))
    self[u'book producer'] = (
        getStr(soup.find('dc:contributor', attrs={'opf:role': 'bkp'}))
        or getStr(soup.find('dc:contributor', attrs={'role': 'bkp'}))
    )
    self[u'isbn'] = (
        getStr(soup.find('dc:identifier', attrs={'opf:scheme': 'ISBN'}))
        or getStr(soup.find('dc:identifier', attrs={'opf:scheme': 'isbn'}))
        or getStr(soup.find('dc:identifier', attrs={'scheme': 'ISBN'}))
        or getStr(soup.find('dc:identifier', attrs={'scheme': 'isbn'}))
    )
    if not self[u'isbn']:
        for bookid in [getStr(x) for x in soup.findAll('dc:identifier')]:
            if bookid and ('isbn' in bookid.lower()):
                self[u'isbn'] = bookid.split(':')[-1]
    self[u'language'] = getStr(soup.find('dc:language'))
    self[u'rating'] = getAttr(soup.find('meta', attrs={'name': 'calibre:rating'}), 'content')
    self[u'series'] = getAttr(soup.find('meta', attrs={'name': 'calibre:series'}), 'content')
    self[u'series index'] = getAttr(soup.find('meta', attrs={'name': 'calibre:series_index'}), 'content')
    self[u'uuid'] = (
        getStr(soup.find('dc:identifier', attrs={'opf:scheme': 'uuid'}))
        or getStr(soup.find('dc:identifier', attrs={'opf:scheme': 'UUID'}))
        or getStr(soup.find('dc:identifier', attrs={'scheme': 'uuid'}))
        or getStr(soup.find('dc:identifier', attrs={'scheme': 'UUID'}))
    )
    tags = soup.findAll('dc:subject')
    self[u'tags'] = []
    if tags:
        self[u'tags'] = [getStr(x) for x in tags]
        #self['tags'] = ", ".join([getStr(x) for x in tags])
    description = getStr(soup.find('dc:description'))
    self[u'description'] = htmlToMarkdown(description)
def rank_entities(tagged_sents, ent_count, ent_names):
    try:
        test = BeautifulStoneSoup.NESTABLE_TAGS['mention']
    except KeyError:
        BeautifulStoneSoup.NESTABLE_TAGS['mention'] = []
    for sent in tagged_sents:
        soup = BeautifulStoneSoup(sent)
        all_ents = [sub("<.*?>", "", str(m)) for m in soup.findAll('mention')]
        ent_attrs = [m.attrs for m in soup.findAll('mention')]
        for i, ent in enumerate(ent_attrs):
            ent_count[ent[1][1]] += 1
            ent_names[ent[1][1]].append(all_ents[i])
class dazzle_doc:
    """
    Parse DAZzle document and provide a method to add lines of text as
    <RubberStamp[stampbase]> ... <RubberStamp[stampbase+n]> elements.
    """

    def __init__(self, fname, stampbase):
        self.doc = BeautifulStoneSoup(open(fname, "r").read())
        self.base = stampbase
        self.last = 20

    def skus(self):
        for p in self.doc.findAll("package"):
            skus = p.findAll("rubberstamp2")
            if (len(skus) != 1):
                raise dazzle_error("expected exactly 1 'rubberstamp2' field in package, found %d" % len(skus))
            sku = skus[0]
            yield sku.contents[0]

    def pkg(self, sku):
        for p in self.doc.findAll("package"):
            skuelem = p.findAll("rubberstamp2")[0]
            if skuelem.contents[0] == sku:
                return p

    def set_condition_notes(self, sku, condition_lines):
        p = self.pkg(sku)
        if p is None:
            raise dazzle_error("package with sku '%s' not found" % sku)
        # make sure we don't have these fields set yet
        for i in xrange(self.base, self.base + len(condition_lines)):
            stamp = p.findAll("rubberstamp%d" % i)
            if len(stamp):
                raise dazzle_error("package with sku '%s' already has field 'rubberstamp%d'" % (sku, i))
        i = self.base
        for cl in condition_lines:
            n = Tag(self.doc, "rubberstamp%d" % i)
            n.append(NavigableString(cl))
            p.append(n)
            i += 1
        while (i < self.last):
            n = Tag(self.doc, "rubberstamp%d" % i)
            n.append(NavigableString("."))
            p.append(n)
            i += 1
        return p

    def save(self, fname):
        f = open(fname, "w")
        f.write(self.doc.renderContents())
        f.close()
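# Hypothetical usage sketch for dazzle_doc; the filenames, SKU, and the two
# condition lines are assumptions, not taken from the original code.
doc = dazzle_doc("batch.xml", stampbase=3)
print list(doc.skus())
doc.set_condition_notes("SKU-123", ["Very good condition.", "Minor shelf wear."])
doc.save("batch_stamped.xml")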
def process(file, tag='Layer'):
    """Take a file and search for a given tag, returning a data structure
    representing it.

    @param file: string containing xml to process.
    @param tag: tagname for lxml to search for.
    @return: list of dictionaries, one per tag found."""
    logger.debug("parse.process: tag=%s" % tag)
    selfClosingTags = ['boundingbox']
    root = BeautifulStoneSoup(file, selfClosingTags=selfClosingTags)
    logger.debug(root.findAll(tag))
    obj = [_process(i) for i in root.findAll(tag)]
    return obj
def get_trailer(self, movie_title, quality_id, trailer_type='trailer'):
    self.__log(('get_trailer started with movie_title: %s '
                'trailer_type: %s quality_id: %s')
               % (movie_title, trailer_type, quality_id))
    movie = self.get_single_movie(movie_title)
    url = self.MOVIE_URL % movie['movie_string']
    try:
        if trailer_type != 'trailer':
            url = url.replace('index', trailer_type)
        cache_filename = '%s-%s.xml' % (
            movie['movie_string'].split('/')[1], trailer_type)
        html = self.__get_url(url, cache_filename=cache_filename)
        r_section = re.compile('<array>(.*?)</array>', re.DOTALL)
        section = re.search(r_section, html).group(1)
        tree = BS(section, convertEntities=BS.XML_ENTITIES)
        trailers = []
        for s in tree.findAll('dict'):
            for k in s.findAll('key'):
                if k.string == 'previewURL':
                    url = k.nextSibling.string
                    if quality_id in url:
                        return ('%s?|User-Agent=%s' % (url, self.UA))
    except:
        url = self.BACKUP_MOVIE_BASE % movie['movie_string']
        tree = None
        if quality_id == 'h480p.mov':
            cache = (movie['movie_string'].split('/')[1]
                     ) + trailer_type + quality_id + '.xml'
            tree = self.__get_tree(url + 'itsxml/25-' + trailer_type + '.xml',
                                   cache_filename=cache)
        if quality_id == 'h720p.mov':
            cache = (movie['movie_string'].split('/')[1]
                     ) + trailer_type + quality_id + '.xml'
            tree = self.__get_tree(url + 'itsxml/26-' + trailer_type + '.xml',
                                   cache_filename=cache)
        if quality_id == 'h1080p.mov':
            cache = (movie['movie_string'].split('/')[1]
                     ) + trailer_type + quality_id + '.xml'
            tree = self.__get_tree(url + 'itsxml/27-' + trailer_type + '.xml',
                                   cache_filename=cache)
        for s in tree.findAll('dict'):
            for k in s.findAll('key'):
                if k.string == 'URL':
                    url = k.nextSibling.string
                    if quality_id in url:
                        return ('%s?|User-Agent=%s' % (url, self.UA))
def GET_RTMP(vid):
    #url = 'http://www.tbs.com/video/cvp/videoData.jsp?oid='+vid
    url = 'http://www.tbs.com/tveverywhere/content/services/cvpXML.do?titleId=' + vid
    html = common.getURL(url)
    tree = BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    #print tree.prettify()
    files = tree.findAll('file')
    if not files:
        url = 'http://www.tbs.com/tveverywhere/content/services/cvpXML.do?titleId=&id=' + vid
        html = common.getURL(url)
        tree = BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        #print tree.prettify()
        files = tree.findAll('file')
    if files:
        html = common.getURL(url)
        tree = BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        print tree.prettify()
        sbitrate = int(common.settings['quality'])
        hbitrate = -1
        files = tree.findAll('file')
        for filenames in files:
            try:
                bitrate = int(filenames['bitrate'])
            except:
                bitrate = 1
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                filename = filenames.string
        serverDetails = tree.find('akamai')
        if serverDetails:
            filename = filename[1:len(filename) - 4]
            serverDetails = tree.find('akamai')
            server = serverDetails.find('src').string.split('://')[1]
            # get auth
            tokentype = serverDetails.find('authtokentype').string
            window = serverDetails.find('window').string
            aifp = serverDetails.find('aifp').string
            auth = getAUTH(aifp, window, tokentype, vid, filename.replace('mp4:', ''))
            swfUrl = 'http://www.tbs.com/cvp/tbs_video.swf'
            link = 'rtmpe://' + server + '?' + auth + " swfurl=" + swfUrl + " swfvfy=true" + ' playpath=' + filename
        elif 'http://' in filename:
            link = filename
        else:
            link = 'http://ht.cdn.turner.com/tbs/big' + filename
    return link
def extract_links(url):
    """
    Scan a web page for all the <a> tags referencing documents (must have
    one of the extensions in reExtensions).

    Returns an array of (fully qualified) urls to documents.
    """
    sock = urllib.urlopen(url)
    htmlSource = sock.read()
    sock.close()

    links = []
    dirs = []
    soup = BeautifulStoneSoup(htmlSource)
    for link in soup.findAll('a'):
        href = urlparse.urljoin(url, link['href'])
        if reExtensions.match(href) is None:
            continue
        links.append(href)
    links = list(set(links))

    mUrls = set()
    # SharePoint directories are not regular href's - pull path info from onclick javascript
    # Example onclick:
    #
    # javascript:ClearSearchTerm("{8EF6AB92-467B-410F-94E3-82048923368B}");
    # javascript:SubmitFormPost("http://old.mit-club.org/WebContent/Forms/AllItems.aspx?
    #     RootFolder=%2fWebContent%2fDummy%20Folder&
    #     View=%7b8EF6AB92%2d467B%2d410F%2d94E3%2d82048923368B%7d");javascript:return false;
    for link in soup.findAll('a', href='javascript:SubmitFormPost()'):
        matchDir = reSPDir.match(link['onclick'])
        if matchDir is None:
            print "Error parsing onclick directory name: %r" % link['onclick']
            continue
        url = matchDir.group(1)
        if url in mUrls:
            continue
        mUrls.add(url)
        print "url: %s" % url
        matchPath = reSPPath.match(url)
        aPath = matchPath.group(1).split("%2f")
        dirs.append((url, aPath[-1]))
    return (links, dirs)
def parseSubs(data):
    subs = []
    if addon.getSetting('subtitles') == 'false' or 'subtitleUrls' not in data:
        return subs
    for sub in data['subtitleUrls']:
        lang = sub['displayName'].split('(')[0].strip()
        Log('Convert %s Subtitle' % lang)
        srtfile = xbmc.translatePath('special://temp/%s.srt' % lang).decode('utf-8')
        srt = codecs.open(srtfile, 'w', encoding='utf-8')
        soup = BeautifulStoneSoup(getURL(sub['url'], retjson=False),
                                  convertEntities=BeautifulStoneSoup.XML_ENTITIES)
        enc = soup.originalEncoding
        num = 0
        for caption in soup.findAll('tt:p'):
            num += 1
            subtext = caption.renderContents().decode(enc).replace('<tt:br>', '\n').replace('</tt:br>', '')
            srt.write(u'%s\n%s --> %s\n%s\n\n' % (num, caption['begin'], caption['end'], subtext))
        srt.close()
        subs.append(srtfile)
    return subs
def parse_response(self, response, limit=None):
    strainer = self.strainer
    soup = BeautifulStoneSoup(response,
                              selfClosingTags=self.self_closing_tags,
                              parseOnlyThese=strainer)
    return self.parse_results(soup.findAll(recursive=False, limit=limit))
def get_mirror(type='xml'):
    """Returns a random mirror for a given type 'xml', 'zip', or 'banner'"""
    global _mirrors
    if not _mirrors.get(type):
        # Get the list of mirrors from tvdb
        page = None
        try:
            page = requests.get(server + api_key + '/mirrors.xml').content
        except RequestException:
            pass
        # If there were problems getting the mirror list we'll just fall back to the main site.
        if page:
            data = BeautifulStoneSoup(page, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            for mirror in data.findAll('mirror'):
                type_mask = int(mirror.typemask.string)
                mirrorpath = mirror.mirrorpath.string
                for t in [(1, 'xml'), (2, 'banner'), (4, 'zip')]:
                    if type_mask & t[0]:
                        _mirrors.setdefault(t[1], set()).add(mirrorpath)
        else:
            log.debug('Unable to get the mirrors list from thetvdb.')
    if _mirrors.get(type):
        return random.sample(_mirrors[type], 1)[0] + ('/banners/' if type == 'banner' else '/api/')
    else:
        # If nothing was populated from the server's mirror list, return the main site as fallback
        return 'http://thetvdb.com' + ('/banners/' if type == 'banner' else '/api/')
def main():
    """Generate a list of all the morphological tags in an XML document."""
    in_file = codecs.open("proiel-GNT.xml", "rU", "utf-8")

    print "Parsing the input file with BeautifulStoneSoup..."
    print
    soup = BeautifulStoneSoup(in_file)

    print "Finding all the tokens..."
    print
    tokens = soup.findAll('token')

    out_file = codecs.open("GNT-morph-list.txt", "w", "utf-8")
    unique_tags = set()
    for token in tokens:
        try:
            stuff = token['morph-features'].split(",")
            proiel_pos = stuff[1]
            proiel_morph = stuff[3]
            tag = proiel_pos + "_" + proiel_morph
            unique_tags.add(tag)
        except KeyError:
            # Token carries no morph-features attribute; skip it.
            pass
    for tag in unique_tags:
        print >> out_file, tag
def parse_netbios(file_to_read):
    parsed = BeautifulStoneSoup(file(file_to_read).read())
    adapters = parsed.findAll('adapter')
    if adapters:
        call_name = parsed.find('callname').string
        if call_name[0].isdigit():
            ip_address = unicode(call_name.strip())
        else:
            ip_address = None
    netbios_list = []
    for adapter in adapters:
        mac_address = adapter['adapter_addr'].replace('.', ':').strip()
        names_list = adapter.findAll('names')
        host_name = None
        domain_name = None
        for names_elements in names_list:
            type = names_elements.find('type')
            name = names_elements.find('name')
            if (type.string == 'Workstation Service'):
                host_name = unicode(name.string.strip()).lower()
            elif (type.string == 'Domain Name'):
                domain_name = unicode(name.string.strip()).lower()
        netbios_list += [{
            'host_name': host_name,
            'domain_name': domain_name,
            'ip_address': ip_address,
            'mac_address': mac_address
        }]
    return netbios_list
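# Hypothetical usage sketch (the dump filename is an assumption): print one
# line per adapter found in the NetBIOS XML dump.
for record in parse_netbios('netbios_scan.xml'):
    print record['ip_address'], record['mac_address'], record['host_name']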
def find_series_id(name):
    """Looks up the tvdb id for a series"""
    url = server + 'GetSeries.php?seriesname=%s&language=%s' % (urllib.quote(name), language)
    try:
        page = requests.get(url).content
    except RequestException as e:
        raise LookupError("Unable to get search results for %s: %s" % (name, e))
    xmldata = BeautifulStoneSoup(page, convertEntities=BeautifulStoneSoup.HTML_ENTITIES).data
    if not xmldata:
        log.error("Didn't get a return from tvdb on the series search for %s" % name)
        return
    # See if there is an exact match
    # TODO: Check if there are multiple exact matches
    firstmatch = xmldata.find('series')
    if firstmatch and firstmatch.seriesname.string.lower() == name.lower():
        return int(firstmatch.seriesid.string)
    # If there is no exact match, sort by airing date and pick the latest
    # TODO: Is there a better way to do this? Maybe weight name similarity and air date
    series_list = [(s.firstaired.string, s.seriesid.string)
                   for s in xmldata.findAll('series', recursive=False) if s.firstaired]
    if series_list:
        series_list.sort(key=lambda s: s[0], reverse=True)
        return int(series_list[0][1])
    else:
        raise LookupError('No results for `%s`' % name)
class Extract():
    def __init__(self, xml, filename, game_name, away_team, home_team):
        self.xml = xml
        self.game_name = game_name
        self.filename = filename
        self.soup = BeautifulStoneSoup(self.xml)
        self.home_team = home_team
        self.away_team = away_team

    def extract(self):
        plays = self.splitRowsIntoPlays()
        row_indexes = self.getPeriodIndexes()
        indexed_plays = self.combinePlaysWithPeriodIndexes(row_indexes, plays)
        self.dumpToFile(indexed_plays)

    def getGameData(self):
        gamedata = self.soup.find("game")
        print gamedata.attrs

    def getPlayByPlayData(self):
        playbyplaydata = self.soup.findAll("event")
        for play in playbyplaydata:
            print dict(play.attrs)

    def dumpToFile(self, list_data):
        writer = csv.writer(open(LOGDIR_EXTRACT + self.filename + '_pbp_nbacom', 'wb'),
                            delimiter=',', lineterminator='\n')
        writer.writerows(list_data)
def Episode(self, stream_name, stream_id, page, totalpage):
    url = self.url_base + stream_id
    data = tools.urlopen(self.app, url, {'cache': 3600})

    if data == "":
        mc.ShowDialogNotification("No episode found for " + str(stream_name))
        return []

    rssfeed = re.compile('</a> <a href="(.*?)">RSS</a>').search(data).group(1)
    url = self.url_base + rssfeed
    data = tools.urlopen(self.app, url, {'cache': 3600})
    soup = BeautifulStoneSoup(data, convertEntities="xml", smartQuotesTo="xml")

    episodelist = list()
    for info in soup.findAll('item'):
        episode = CreateEpisode()
        episode.name = info.title.contents[0]
        episode.id = info.link.contents[0]
        episode.description = info.description.contents[0]
        episode.thumbnails = info.thumbnailimage.contents[0]
        episode.date = info.pubdate.contents[0]
        episode.page = page
        episode.totalpage = totalpage
        episodelist.append(episode)
    return episodelist
def Play(self, stream_name, stream_id, subtitle):
    play = CreatePlay()
    id = re.compile('tv8play.se\/play\/(.*?)\/').search(str(stream_id)).group(1)
    url = 'http://viastream.viasat.tv/PlayProduct/' + id
    data = tools.urlopen(self.app, url)
    soup = BeautifulStoneSoup(data, convertEntities="xml", smartQuotesTo="xml")
    video = soup.findAll('video')[0]
    video = '%r' % video.url.contents[0]
    video = video.replace("u'", "").replace("'", "")
    rtmp = video.split("/")
    rtmpURL = "/".join(rtmp[:4])
    playPath = "/".join(rtmp[4:])
    authPath = ''
    play.rtmpurl = playPath
    play.rtmpdomain = rtmpURL
    play.rtmpauth = authPath
    return play
def show_choices(self, menuw, info):
    items = []
    soup = BeautifulStoneSoup(info, selfClosingTags=['feat'])
    results = soup.findAll('result')
    for result in results[:20]:
        # for performance reasons show the first possibilities only,
        # the more sensible hits are at the beginning of the list
        hid = result['hid']
        title = result.titleTag.string.replace('&amp;', '&')
        artist = result.artistTag.nameTag.string.replace('&amp;', '&')
        items.append(menu.MenuItem('%s - %s' % (title, artist),
                                   action=self.fetch_lyric, arg=hid))
    if len(items) > 0:
        msgtext = _('No exact match. ')
        msgtext += _('Here are some suggestions.')
        box = PopupBox(text=msgtext)
        box.show()
        time.sleep(2)
        box.destroy()
        choices_menu = menu.Menu(_('Choices'), items)
        menuw.pushmenu(choices_menu)
    else:
        box = PopupBox(text=_('Lyrics not found, sorry...'))
        box.show()
        time.sleep(3)
        box.destroy()
def parseBuckets(self, response):
    self.expect(response, httplib.OK)
    result = []
    dom = BeautifulStoneSoup(response.read())
    for element in dom.findAll('bucket'):
        result.append(element.find('name').string)
    return result
def initServerInfoBase(fileName):
    """
    @description: Initializes soup for the BeautifulSoup parser. Reads the
                  existing data from the fileName parameter.
    @todo: None
    @param fileName: String, name of the file to be loaded in soup.
    @return: (soup, Boolean), True if successful, else False
    """
    if os.path.exists(fileName):
        try:
            f = open(fileName, "r")
        except:
            return None, False
        xml = f.read()
        f.close()
        soup = BeautifulStoneSoup(xml)
        serverinfolist = soup.findAll("serverinfo")
    else:
        serverinfolist = []
        soup = BeautifulSoup()
        xml = "null"
    if len(serverinfolist) == 0:
        serverinfo = Tag(soup, "serverinfo")
        soup.insert(0, serverinfo)
    return soup, True
def videosRSS(url=common.args.url):
    link = common.getURL(url)
    mrssData = re.compile('mrssData += +"(.+)"').findall(link)[0]
    mrssData = urllib2.unquote(base64.decodestring(mrssData))
    tree = BeautifulStoneSoup(mrssData, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    print tree.prettify()
    items = tree.findAll('item')
    for item in items:
        title = item.title.contents[0]
        plot = item.description.contents[0]
        thumb = item.findAll('media:thumbnail')[0]['url']
        duration = item.findAll('media:content')[0]['duration']
        smil = item.findAll('media:text')[5].contents[0]
        smil = smil.replace('smilUrl=', '')
        #episode_list.append((title, image, duration, plot, smil))
        u = sys.argv[0]
        u += '?url="' + urllib.quote_plus(smil) + '"'
        u += '&mode="history"'
        u += '&sitemode="play"'
        infoLabels = {
            "Title": title,
            #"Season": season,
            #"Episode": episode,
            "Plot": plot,
            #"premiered": airdate,
            "Duration": duration,
            #"TVShowTitle": common.args.name
        }
        common.addVideo(u, title, thumb, infoLabels=infoLabels)
    common.setView('episodes')