def kw():
    from BeautifulSoup import BeautifulStoneSoup
    from urllib2 import urlopen
    tmp_create_lit()
    disease = request.vars.get('d')
    result = db.executesql('''
        SELECT keyword.kw, counts.count
        FROM (SELECT keycount.kw_id, keycount.count
              FROM (SELECT id FROM disease WHERE d = '%s') AS diseases
              INNER JOIN keycount ON keycount.d_id = diseases.id) AS counts
        INNER JOIN keyword ON keyword.id = counts.kw_id;
        ''' % (disease,))
    rList = []
    capabilities = urlopen(deployment_settings.geoserver.wms_capabilities()).read()
    soup = BeautifulStoneSoup(capabilities)
    keywords = soup.findAll(name='keyword')
    for r in result:
        words = []
        for k in keywords:
            if k.string == r[0]:
                words.append(k)
        mapList = []
        for w in words:
            layer = w.parent.parent
            id = layer.find('name').string
            name = layer.find('title').string
            mapList.append({'filename': id, 'name': name, 'type': 'db'})
        d = dict(kw=r[0], count=r[1], numMaps=len(words), maps=mapList)
        rList.append(d)
    return json.dumps(rList)
def sync_geoserver(path):
    file = urlopen(path + '/wms?SERVICE=WMS&REQUEST=GetCapabilities')
    buffer = file.read()
    soup = BeautifulStoneSoup(buffer)
    layers = soup.findAll(name='layer')
    results = []
    for l in layers:
        name = l.find('title')
        id = l.find('name')
        if name and id:
            text = name.string
            if not text:
                text = id.string
            m = match('^([^\:]+)\:(.+)$', id.string)
            if m:
                p = m.group(1)
                f = m.group(2)
            else:
                p, f = '', id.string
            if dm.query('maps', prefix=p, filename=f, name=text, src=path).first():
                pass
            else:
                id = dm.insert('maps', prefix=p, filename=f, name=text,
                               src=path, public=True)
            keywords = l.findAll(name='keyword')
            kw = []
            for k in keywords:
                kw.append(k.string)
def Episode(self, stream_name, stream_id, page, totalpage):
    url = self.url_base + stream_id
    data = tools.urlopen(self.app, url, {'cache': 3600})
    if data == "":
        mc.ShowDialogNotification("No episode found for " + str(stream_name))
        return []
    rssfeed = re.compile('</a> <a href="(.*?)">RSS</a>').search(data).group(1)
    url = self.url_base + rssfeed
    data = tools.urlopen(self.app, url, {'cache': 3600})
    soup = BeautifulStoneSoup(data, convertEntities="xml", smartQuotesTo="xml")
    episodelist = list()
    for info in soup.findAll('item'):
        episode = CreateEpisode()
        episode.name = info.title.contents[0]
        episode.id = info.link.contents[0]
        episode.description = info.description.contents[0]
        episode.thumbnails = info.thumbnailimage.contents[0]
        episode.date = info.pubdate.contents[0]
        episode.page = page
        episode.totalpage = totalpage
        episodelist.append(episode)
    return episodelist
def geocode(address="", city="", state="CA"):
    address = urllib.quote(address.encode('utf-8'))
    g_url = 'http://local.yahooapis.com/MapsService/V1/geocode?appid='
    g_url += '0MoPk9DV34FH0rumXB_xENjSlf.jdG4woRO9nFqyUcM86nLsFSynUvAwZZo6g--'
    g_url += '&street=%s&city=%s&state=%s' % (address, city, state)
    url = urllib.urlopen(g_url)
    dom = BeautifulStoneSoup(url)
    url.close()
    coords = {
        'address': None,
        'latitude': None,
        'longitude': None,
    }
    result_attr = dom.find('result')
    if result_attr and result_attr['precision'] == 'address':
        dom_fields = ['address', 'latitude', 'longitude']
        for field in dom_fields:
            i = dom.find(field)
            if i:
                if field == 'address':
                    coords[field] = i.string
                else:
                    try:
                        coords[field] = float(i.string)
                    except:
                        pass
    return coords
def handle_noargs(self, **options):
    page = BS(urllib2.urlopen("http://data.openaustralia.org/members/wikipedia-commons.xml"))
    for member in page.findAll("personinfo"):
        m = Member.objects.get(oa_id=member['id'])
        m.wikipedia = member['wikipedia_url']
        m.save()
def search(self, terms):
    torrents = []
    url = self.search_uri % quote_plus(terms)
    try:
        f = requests.get(url)
    except:
        raise Exception("something wrong")
    if f.status_code != requests.codes.ok:
        f.raise_for_status()
    soup = BeautifulStoneSoup(f.text)
    for details in soup.findAll("a", {"href": re.compile("^/torrent/")}):
        div = details.findNext("div")
        seeds = int(div.text)
        div = div.findNext("div")
        try:
            f_link = requests.get(self.uri_prefix + details["href"])
        except:
            raise Exception("something wrong")
        if f_link.status_code != requests.codes.ok:
            f_link.raise_for_status()
        soup_link = BeautifulStoneSoup(f_link.text)
        link = soup_link.find("a", {"href": re.compile("^magnet:")})
        if not link:
            continue
        torrents.append({"url": link["href"],
                         "name": details.text,
                         "seeds": seeds,
                         "leechers": int(div.text)})
    return torrents
def text_to_xml_item(self, filepath):
    """read file and generate xml"""
    pname = os.path.basename(filepath).replace(".txt", "")
    date = os.path.getmtime(filepath)
    (tags, title, content) = self.read_text(filepath)  # TODO: do exception proc.
    categories = self.get_categories(filepath)
    date_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(date))
    date_str_gmt = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(date))
    pubDate_str = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(date))
    tidied = content
    tidied = tidied.replace("\r\n", "\n")
    rex = re.compile(r"<pre>.*?</pre>", re.S)
    tidied = rex.sub(self.escape, tidied)
    tidied = BeautifulStoneSoup(tidied).prettify()
    tidied = tidied.replace("\n", "")
    tidied = tidied.replace(",", ",")
    tidied = self.unescape(tidied)
    # add entry
    post_item = wordpress.Item(
        title=title,
        pubDate=pubDate_str,
        post_date=date_str,
        post_date_gmt=date_str_gmt,
        content=tidied,
        post_name=pname)
    post_item.tags = tags
    post_item.categories = categories
    self._wxr.channel.items.append(post_item)
def parse_complejosxml(data):
    """Return a list of dicts mapping 'id_org' to the complex name."""
    parser = BeautifulStoneSoup(data)
    ids = [{'id_org': complejo.clave.string,
            'nombre': 'Cinemex ' + complejo.nombre.string}
           for complejo in parser.findAll('cine')]
    return ids
def videosRSS(url=common.args.url):
    link = common.getURL(url)
    mrssData = re.compile('mrssData += +"(.+)"').findall(link)[0]
    mrssData = urllib2.unquote(base64.decodestring(mrssData))
    tree = BeautifulStoneSoup(mrssData, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    print tree.prettify()
    items = tree.findAll('item')
    for item in items:
        title = item.title.contents[0]
        plot = item.description.contents[0]
        thumb = item.findAll('media:thumbnail')[0]['url']
        duration = item.findAll('media:content')[0]['duration']
        smil = item.findAll('media:text')[5].contents[0]
        smil = smil.replace('smilUrl=', '')
        #episode_list.append((title, image, duration, plot, smil))
        u = sys.argv[0]
        u += '?url="' + urllib.quote_plus(smil) + '"'
        u += '&mode="history"'
        u += '&sitemode="play"'
        infoLabels = {
            "Title": title,
            #"Season": season,
            #"Episode": episode,
            "Plot": plot,
            #"premiered": airdate,
            "Duration": duration,
            #"TVShowTitle": common.args.name
        }
        common.addVideo(u, title, thumb, infoLabels=infoLabels)
    common.setView('episodes')
def ExtractPubPar(xmldata):
    """Yields successive paragraphs from a Pubmed xml"""
    xmltree = BeautifulStoneSoup(xmldata)
    v = xmltree.find("abstracttext")
    if v:
        yield v.string.strip()
def extract_stats(self, page, list_of_pmids):
    if not page:
        return([])
    (response_header, content) = page
    response = []
    soup = BeautifulStoneSoup(content)
    #print soup.prettify()
    for docsum in soup.findAll("docsum"):
        #print(tag.id.text)
        id = docsum.id.text
        author_list = []
        response_dict = {}
        for item in docsum.findAll("item"):
            if item.get("name") == "DOI":
                doi = item.text
                response_dict.update(doi=doi)
            if item.get("name") == "pmc":
                pmcid = item.text
                share_details_url = "http://www.ncbi.nlm.nih.gov/pmc/articles/%s/citedby/?tool=pubmed" % pmcid
                response_dict.update(pmcid=pmcid, share_details_url=share_details_url)
        response += [(id, response_dict)]
    return(response)
def inlines(value, return_list=False):
    try:
        from BeautifulSoup import BeautifulStoneSoup
    except ImportError:
        from beautifulsoup import BeautifulStoneSoup
    content = BeautifulStoneSoup(value, selfClosingTags=['inline', 'img', 'br',
                                                         'input', 'meta', 'link', 'hr'])
    # Return a list of inline objects found in the value.
    if return_list:
        inline_list = []
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            inline_list.append(rendered_inline['context'])
        return inline_list
    # Replace inline markup in the value with rendered inline templates.
    else:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            if rendered_inline:
                inline_template = render_to_string(rendered_inline['template'],
                                                   rendered_inline['context'])
            else:
                inline_template = ''
            value = value.replace(str(inline), inline_template)
        return mark_safe(unicode(value))
def handle(self, *args, **kwargs):
    resource = urlopen(TRAIL_REPORT_URL)
    soup = BeautifulStoneSoup(resource)
    lift = soup.find("lifts")
    cache.set(TRAIL_REPORT_CACHE_KEY, {
        "total": lift.get("total"),
        "open": lift.get("totalopen"),
    }, 7 * 24 * 60 * 60)
    resource = urlopen(WEATHER_REPORT_URL)
    soup = BeautifulStoneSoup(resource)
    report = soup.findAll("report")[1]
    forecast = []
    weather_data = {
        "temperature": report.get("temp"),
        "surface": report.get("surfaceconditions"),
    }
    for i in range(1, 5):
        day = soup.find("day%d" % i)
        if day:
            forecast.append({
                "day": day.get("day"),
                "status": WEATHER_TYPES[int(day.get("icon"))],
            })
    weather_data["forecast"] = forecast
    cache.set(WEATHER_REPORT_CACHE_KEY, weather_data, 7 * 24 * 60 * 60)
def parse_categories(soup):
    categories_list = []
    """
    <category id="pre-school" genre="true">
        <name>ABC 4 Kids</name>
    </category>
    """
    # This next line is the magic to make recursive=False work (wtf?)
    BeautifulStoneSoup.NESTABLE_TAGS["category"] = []
    xml = BeautifulStoneSoup(soup)
    # Get all the top level categories, except the alphabetical ones
    for cat in xml.find('categories').findAll('category', recursive=False):
        id = cat.get('id')
        if cat.get('index') or id == 'index':
            continue
        item = {}
        item['keyword'] = id
        item['name'] = cat.find('name').string
        categories_list.append(item)
    return categories_list
def xml_to_dict(self, data):
    from BeautifulSoup import BeautifulStoneSoup as BS
    soup = BS(data)
    username = soup.find('db:uid').contents[0]
    uid = soup.find('id').contents[0].split('/')[-1]
    title = soup.find('title').contents[0]
    return {'id': uid, 'username': username, 'title': title}
def search(self, terms, settings={}):
    torrents = []
    f = None
    for url in self.search_uris:
        try:
            final_url = url + '/usearch/' + terms.replace(' ', '%20') + '/?field=seeders&sorder=desc&rss=1'
            request = urlRequest(final_url)
            request.add_header('Accept-encoding', 'gzip')
            response = urlopen(request)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                data = gzip.GzipFile(fileobj=buf)
                f = data.read()
            else:
                f = response.read()
            break
        except:
            pass
    if not f:
        raise Exception('Out of kickass proxies')
    soup = BeautifulStoneSoup(f)
    for item in soup.findAll('item'):
        isVerified = int(item.find('torrent:verified').text)
        if isVerified == 1 or str(settings['trusted_uploaders']).lower() != 'true':
            torrents.append({
                'url': item.find('torrent:magneturi').text,
                'name': item.title.text,
                'seeds': int(item.find('torrent:seeds').text),
                'leechers': int(item.find('torrent:peers').text),
            })
    sorted_torrents = sorted(torrents, key=lambda k: k['seeds'], reverse=True)
    return sorted_torrents
def parse(self):
    """Parses the raw XML of the input file."""
    print "Parsing raw XML file using BeautifulStoneSoup..."
    print
    # initial parse
    soup = BeautifulStoneSoup(self.in_file, selfClosingTags=['milestone', 'ref'])
    print "Finished parsing raw XML using BeautifulStoneSoup."
    print
    out_name = self.to_scrub
    self.open_out(out_name)
    print "Finding major divisions in the XML file..."
    print
    count = 1
    # gets sub-trees for all books
    divisions = soup.findAll(type="book")
    book = raw_input("What number book of Herodotus would you like to scrub? ")
    for division in divisions:
        if count == int(book):
            self.scrub(division, book)
        count += 1
def gettrackinfo(self, track="", artist=""):
    """
    getinfo - Get info from Lastfm about a particular song

    @param track - Track name and other information we may have
    @return song - Return the song information on success else return None on failure
    """
    track = track.replace("&", "and")
    artist = artist.replace("&", "and")
    url = self.trackurl + "&track=" + track.encode('ascii') + "&artist=" + artist.encode('ascii')
    try:
        file = urllib.urlopen(url)
        page = BeautifulStoneSoup(file.read())
        file.close()
        info = page.find("track")
        song = {}
        song["track"] = info.find("name").next
        song["artist"] = info.find("artist").next
        song["albumart"] = info.findAll("image")[0].next
    except Exception as e:
        print e
        return None
    return song
def getXmlCursor(xml):
    """Parse the XML and get the current query's start index, items per page, and total count."""
    soup = BeautifulStoneSoup(xml)
    start = int(soup.find('opensearch:startindex').string)
    count = int(soup.find('opensearch:itemsperpage').string)
    totalCount = int(soup.find('opensearch:totalresults').string)
    return (start, count, totalCount)
def handle(self, *args, **options):
    try:
        resource = urlopen(args[0])
    except IndexError:
        raise CommandError("You have to specify a file path or a URL.")
    soup = BeautifulStoneSoup(resource)
    for event in soup.findAll("event"):
        event_info = {
            "description": event.get("description", u""),
        }
        for attribute in event:
            if not isinstance(attribute, NavigableString):
                event_info[attribute.name] = attribute.string or u""
        identifier = sha_constructor("%(description)s-%(date)s" % {
            "description": slugify(event_info.get("description").encode("utf-8")),
            "date": event_info.get("date", u"").encode("utf-8"),
        }).hexdigest().encode("utf-8")
        obj, created = Event.objects.get_or_create(identifier=identifier, defaults={
            "title": event_info.get("title", u"").encode("utf-8"),
            "content": event_info.get("body", u"").encode("utf-8"),
            "description": event_info.get("description").encode("utf-8"),
            "url": event_info.get("url", u"").encode("utf-8"),
            "order": int(event_info.get("displayorder", 0)),
            "date": date(
                int(event_info.get("year", 0)),
                int(event_info.get("month", 0)),
                int(event_info.get("day", 0)),
            ),
        })
def parse_peliculasxml(data):
    parser = BeautifulStoneSoup(data)
    ids = [
        {"titulo": peli.nombre.string,
         "id_pol": peli.id.string,
         "id_cineticket": peli.idcineticket.string}
        for peli in parser.findAll("pelicula")
    ]
    return ids
def search(self, terms):
    torrents = []
    url = self.search_uri % quote_plus(terms)
    try:
        f = requests.get(url, headers=self.headers)
    except:
        raise Exception("something wrong")
    if f.status_code != requests.codes.ok:
        f.raise_for_status()
    soup = BeautifulStoneSoup(f.text)
    for (c, item) in enumerate(soup.findAll("a", {"class": "magnet"})):
        if c == 30:
            break
        info = item.findPrevious("a")
        link = self.search_uri % quote_plus(info["href"])
        try:
            item_f = requests.get(link, headers=self.headers)
        except:
            raise Exception("something wrong")
        if item_f.status_code != requests.codes.ok:
            item_f.raise_for_status()
        item_soup = BeautifulStoneSoup(item_f.text)
        sp = item_soup.findAll("span", {"class": re.compile("^stat_")})
        if sp:
            sp = [int(i.text.replace(",", "")) for i in sp]
        else:
            sp = [0, 0]
        torrents.append({"url": item["href"],
                         "name": info.text,
                         "seeds": sp[0],
                         "leechers": sp[1]})
    return torrents
def parse(request, doc_id):
    """
    Parse the 'resource_data' xml of a given resource (by id).
    NOTE: only works on dc documents at the moment.
    """
    is_service_avaliable()
    bag = {}
    url = "%sobtain?request_ID=%s&by_doc_ID=true" % (NODE_URL, doc_id)
    req = urllib2.Request(url)
    opener = urllib2.build_opener()
    data = opener.open(req)
    result = simplejson.load(data)
    data = result['documents'][0]['document'][0]['resource_data']
    soup = BeautifulStoneSoup(data)
    parsed_data = {}
    fields = ['title', 'identifier', 'creator', 'publisher', 'date', 'description', 'rights']
    for field in fields:
        try:
            parsed_data[field] = soup.find('dc:' + field).text
        except:
            pass
    parsed_data['subject'] = ''.join([s.text for s in soup.findAll('dc:subject')])
    bag['parsed_data'] = parsed_data
    return shortcuts.render_to_response("parse.html", bag,
                                        context_instance=context.RequestContext(request))
def splitCell(cell):
    """Read the contents of a table cell and build the lecture dicts.

    Gets a BeautifulSoup element of a Cell and splits it into lectures.
    Then it builds the lecture dicts. The returned value is a list of
    lecture dicts (if any, or an empty list otherwise).
    """
    st = BeautifulStoneSoup(unicode(cell.renderContents('utf-8').replace('<br />', '\n'), 'utf-8'),
                            convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    elements = unicode(st.renderContents('utf-8'), 'utf-8').split('\n\n')
    lectures = []
    for elem in elements:
        lines = elem.split('\n')
        if len(lines) != 3:
            continue
        lines = map(unicode.strip, lines)
        if lines[2] != '-':
            (short, typ) = lines[1].split(" ")
            lectures.append({'short': short, 'typ': typ,
                             'name': lines[0], 'room': lines[2]})
    return lectures
def determine_subtype(in_file):
    """Determines the subtype of a genome."""
    hits = defaultdict(int)
    with open(in_file) as handle:
        soup = BeautifulStoneSoup(handle.read())
        for seq in soup.findAll('iteration'):
            try:
                hit = seq.iteration_hits.hit.hit_def.contents[0]
            except:
                hit = None
            if hit:
                hits[hit.split('_')[1]] += 1
    count = sum(hits.values())
    if count < 5:
        return None
    elif all([x < count*0.6 for x in hits.values()]):
        #print 'too heterogeneous %s' % ','.join(map(str, hits.items()))
        return None
    else:
        for key, val in hits.items():
            if val > count*0.6:
                return key
def determine_subtype_short(in_file):
    hits = defaultdict(int)
    strainer = SoupStrainer(re.compile('iteration'))
    with open(in_file) as handle:
        soup = BeautifulStoneSoup(handle.read(), parseOnlyThese=strainer)
        for seq in soup.findAll('iteration'):
            try:
                hit = seq.iteration_hits.hit.hit_def.contents[0]
            except:
                hit = None
            if hit:
                hits[hit.split('_')[1]] += 1
    count = sum(hits.values())
    if count < 5:
        return None
    elif all([x < count*0.6 for x in hits.values()]):
        print 'too heterogeneous %s' % ','.join(map(str, hits.items()))
        return None
    else:
        for key, val in hits.items():
            if val > count*0.6:
                return key
def sitemap_parse(sitemap_option, astring, google_results, website_url):
    not_indexed = []
    not_sitemap = []
    error = ''
    sitemap_results = []
    website_host = urlparse(website_url).scheme
    if website_host != '':
        website_url = urlparse(website_url).scheme + "://" + urlparse(website_url).netloc
    if website_url[-1] != '/':
        website_url += '/'
    if astring != '':
        if sitemap_option == 'sitemap':
            resp = requests.get(astring)
            soup = Soup(resp.content)
        elif sitemap_option == 'upload_sitemap':
            soup = Soup(astring)
        urls = soup.findAll('url')
        for u in urls:
            loc = u.find('loc').string
            sitemap_results.append(loc)
            if loc not in google_results:
                not_indexed.append(loc)
        for loc in google_results:
            if loc not in sitemap_results:
                not_sitemap.append(loc)
    return not_indexed, not_sitemap, error
def get_mirror(type='xml'):
    """Returns a random mirror for a given type 'xml', 'zip', or 'banner'"""
    global _mirrors
    if not _mirrors.get(type):
        # Get the list of mirrors from tvdb
        page = None
        try:
            page = requests.get(server + api_key + '/mirrors.xml').content
        except RequestException:
            pass
        # If there were problems getting the mirror list we'll just fall back to the main site.
        if page:
            data = BeautifulStoneSoup(page, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            for mirror in data.findAll('mirror'):
                type_mask = int(mirror.typemask.string)
                mirrorpath = mirror.mirrorpath.string
                for t in [(1, 'xml'), (2, 'banner'), (4, 'zip')]:
                    if type_mask & t[0]:
                        _mirrors.setdefault(t[1], set()).add(mirrorpath)
        else:
            log.debug('Unable to get the mirrors list from thetvdb.')
    if _mirrors.get(type):
        return random.sample(_mirrors[type], 1)[0] + ('/banners/' if type == 'banner' else '/api/')
    else:
        # If nothing was populated from the server's mirror list, return the main site as fallback
        return 'http://thetvdb.com' + ('/banners/' if type == 'banner' else '/api/')
def create(cls, conn, user_sn, user_id, buddy_id, file_nm):
    '''
    Take a filename, parse the XML, and insert it into the database.
    Stores most of the attributes raw, in order to do other sorts of
    processing later.
    '''
    xml = BeautifulStoneSoup(open(file_nm, 'r'))
    msgs = xml('message')
    if len(msgs) == 0:
        return
    my_msgs = len(xml.findAll({'message': True}, {'sender': user_sn}))
    their_msgs = len(msgs) - my_msgs
    initiated = (msgs[0]['sender'] == user_sn)
    start_time = parser.parse(msgs[0]['time'].replace('.', ':'), fuzzy=True)
    end_time = parser.parse(msgs[-1]['time'].replace('.', ':'), fuzzy=True)
    stats = stat(file_nm)
    cur = conn.cursor()
    try:
        cur.execute(cls.CREATE_NEW_BUDDY_LOG_ENTRY_QUERY,
                    (user_id, buddy_id, stats.st_size, initiated, my_msgs,
                     their_msgs, time.mktime(start_time.timetuple()),
                     time.mktime(end_time.timetuple()), time.time(), file_nm))
        conn.commit()
    except sqlite3.IntegrityError:
        pass
def search(self, terms):
    torrents = []
    url = self.search_uri % quote_plus(terms)
    try:
        f = requests.get(url)
    except:
        raise Exception("something wrong")
    if f.status_code != requests.codes.ok:
        f.raise_for_status()
    soup = BeautifulStoneSoup(f.text)
    for item in soup.findAll("item"):
        item_quality = item.link.text.rpartition("_")[2]
        try:
            item_f = requests.get(item.link.text)
        except:
            raise Exception("something wrong")
        if item_f.status_code != requests.codes.ok:
            item_f.raise_for_status()
        item_soup = BeautifulStoneSoup(item_f.text)
        qualities = [s.text.strip() for s in
                     item_soup.findAll("span", {"class": re.compile("^tech-quality")})]
        q_index = qualities.index(item_quality)
        span = item_soup.findAll("span", {"title": "Peers and Seeds"})[q_index]
        ps_pos = len(span.parent.contents) - 1
        ps = span.parent.contents[ps_pos].split("/")
        torrents.append(
            {"url": item.enclosure["url"],
             "name": item.title.text,
             "seeds": int(ps[1]),
             "leechers": int(ps[0])}
        )
    return torrents
def postDownloadPageDay(self, host=None, postData={}):
    headers = {
        'User-Agent': 'Googlebot/2.1 (+http://www.googlebot.com/bot.html) '
    }
    s = requests.session()
    s.post(host, headers=headers, data={"agree": "OK"})
    r = s.post(host, headers=headers, data=postData)
    decodedstring = BeautifulStoneSoup(
        r.text, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    return decodedstring
def replace_bad_characters(self, str):
    """Handles conversion to utf-8 and converts HTML entities into utf-8 characters."""
    str = unicode(
        BeautifulStoneSoup(str, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    str = unicodedata.normalize('NFKD', str).encode('ascii', 'ignore')
    str = unicode(re.sub('[^\w\s-]', '', str).strip().lower())
    str = unicode(str.replace(' ', '-'))
    return str
def get_list_of_addon_author_names(self, addon_name):
    try:
        addon_xml = self.get_xml_for_single_addon(addon_name)
        name_tags = addon_xml.authors.findAll('name')
        return [
            BeautifulStoneSoup(str(name_tags[i])).find('name').string
            for i in range(len(name_tags))
        ]
    except AttributeError:
        self._print_search_error()
def retrieveVideoInfo(video_id):
    video_info = VideoInfo()
    video_info.set_video_hosting_info(getVideoHostingInfo())
    video_info.set_video_id(video_id)
    try:
        video_link = 'http://cdn.playwire.com/' + str(video_id) + '.xml'
        soup = BeautifulStoneSoup(HttpUtils.HttpClient().getHtmlContent(url=video_link),
                                  convertEntities=BeautifulStoneSoup.XML_ENTITIES)
        cfg = soup.find("config")
        img_link = cfg.findNext("poster").string
        video_link = cfg.findNext("src").string
        video_info.set_video_stopped(False)
        video_info.set_video_image(img_link)
        video_info.set_video_name("PLAYWIRE Video")
        if re.search(r'\Artmp', video_link):
            video_info.add_video_link(VIDEO_QUAL_HD_720, video_link, addUserAgent=False)
        else:
            video_info.add_video_link(VIDEO_QUAL_HD_720, video_link, addUserAgent=True)
    except:
        video_info.set_video_stopped(True)
    return video_info
def convert_regions():
    f = open('raw/regions.xml', 'r')
    data = f.read()
    f.close()
    soup = BeautifulStoneSoup(data)
    tp = Path('//regions')
    objs = tp.apply(soup)
    f = open('refined/regions.csv', 'w')
    f.write('\t'.join(['reg_id', 'name']) + '\n')
    for o in objs:
        rname = unicode(o.find('regname').string).encode('utf8', 'ignore')
        f.write('\t'.join([str(o.find('reg_id').string), rname]) + '\n')
def show_vodcast_videos(rss_file):
    log('get_vodcasts started with rss_file=%s' % rss_file)
    r_media = re.compile('^media')
    url = MAIN_URL + rss_file
    rss = urlopen(url).read()
    e = BeautifulStoneSoup.XML_ENTITIES
    tree = BeautifulStoneSoup(rss, convertEntities=e)
    videos = []
    for item in tree.findAll('item'):
        if item.find(r_media):
            thumbnail = item.find(r_media)['url']
        else:
            thumbnail = 'DefaultVideo.png'
        videos.append({
            'title': item.title.string,
            'thumbnail': thumbnail,
            'url': item.enclosure['url'],
            'description': item.description.string
        })
    log('show_vodcast_videos finished with %d videos' % len(videos))
    return videos
def getDetailsForSerieByID(self, serieName, serieID):
    url = SERIE_DETAILS_URL % (urllib.quote(serieID))
    try:
        # Change the User Agent
        USER_AGENT = 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        req = urllib2.Request(url)
        req.add_header('User-Agent', USER_AGENT)
        resp = opener.open(req)
        soup = BeautifulStoneSoup(resp.read())
        resp.close()
        for banner in soup.banners.findAll('banner'):
            if banner.language.string == 'en':
                if not 'Fanart' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'fanart':
                    KNOWN_SHOWS[serieName]['Fanart'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['FanartThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Poster' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'poster':
                    KNOWN_SHOWS[serieName]['Poster'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['PosterThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Season' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'season':
                    KNOWN_SHOWS[serieName]['Season'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeasonThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))
                elif not 'Series' in KNOWN_SHOWS[serieName].keys() and banner.bannertype.string == 'series':
                    KNOWN_SHOWS[serieName]['Series'] = str(BANNER_URL % (banner.bannerpath.string))
                    if banner.thumbnailpath:
                        KNOWN_SHOWS[serieName]['SeriesThumb'] = str(BANNER_URL % (banner.thumbnailpath.string))
        return KNOWN_SHOWS[serieName]
    except:
        print 'Error: ' + url
        return None
def getVideoUrl(vod_sq, vod_key, selAltMovie=False):
    url = "http://v.nate.com/movie_url.php?mov_id=%s&v_key=%s&type=xml" % (vod_sq, vod_key)
    xml = urllib2.urlopen(url).read()
    #dom = xml.dom.minidom.parseString(xml)  # encoding error?
    soup = BeautifulStoneSoup(xml, fromEncoding='euc-kr')
    if selAltMovie:
        vid_url = urllib.unquote(soup.movie.mov_url_alt.string)
    else:
        vid_url = urllib.unquote(soup.movie.mov_url.string)
    img_url = soup.movie.master_thumbnail.url.string
    return (vid_url, img_url)
def _google_checkout_post(url, params):
    u = urlparse("%s%s" % (url, g.GOOGLE_ID))
    conn = HTTPSConnection(u.hostname, u.port)
    auth = base64.encodestring('%s:%s' % (g.GOOGLE_ID, g.GOOGLE_KEY))[:-1]
    headers = {"Authorization": "Basic %s" % auth,
               "Content-type": "text/xml; charset=\"UTF-8\""}
    conn.request("POST", u.path, params, headers)
    response = conn.getresponse().read()
    conn.close()
    return BeautifulStoneSoup(response)
def calculate_collocations(self, content,
                           collocation_measures=TrigramAssocMeasures,
                           collocation_finder=TrigramCollocationFinder):
    content = re.sub(r'’', '\'', content)
    content = re.sub(r'&amp;', '&', content)
    try:
        content = unicode(
            BeautifulStoneSoup(content,
                               convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
    except ValueError, e:
        print "ValueError, ignoring: %s" % e
def get_routes(self):
    """
    Get all available routes. Return a dictionary with route ids as keys.
    Values are dictionaries of route attributes.
    """
    url = self._build_api_url('getroutes')
    xml = self._grab_url(url)
    soup = BeautifulStoneSoup(xml)
    routes = {}
    for tag in soup.findAll('route'):
        routes[str(tag.rt.string)] = {
            'id': str(tag.rt.string),
            'name': str(tag.rtnm.string)
        }
    return routes
def __init__(self, response_text, status_code):
    self.text = response_text
    self.xml = None
    try:
        self.json = json.loads(response_text, object_pairs_hook=SortedDict)
    except (JSONDecodeError, ValueError):
        if self.text[:5] == "<?xml":
            # perhaps it's XML?
            self.xml = BeautifulStoneSoup(self.text)
        # must be an awful code.
        self.json = None
    self.status_code = status_code
def getChannels(url):
    response = open(url, 'rb')
    link = response.read()
    soup = BeautifulStoneSoup(link, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
    channels = soup('channel')
    for channel in channels:
        name = channel('name')[0].string
        thumbnail = channel('thumbnail')[0].string
        addDir(name, '', 2, thumbnail)
    else:
        INDEX()
def _start_job(self, job_id, time_to_live, priority, conversion_parameters):
    print "starting job"
    params = dict(jobId=job_id, timeToLive=time_to_live, priority=priority,
                  conversionParameters=conversion_parameters)
    start_job_result = requests.get(
        "https://%s/%s/%s" % (self._service_host, self._service_base, self._start_job_uri),
        params=params,
        headers=self._http_header
    ).text
    # Although PHP sample code loops through all startjobresult tags,
    # we only get the first one; we only need to return one anyway.
    tag = BeautifulStoneSoup(start_job_result).find('startjobresult')
    print "starting job done"
    return JobInfo(soup_tag=tag)
def get_series(series_name_search):
    """Return all possible matches for series_name_search in the chosen language"""
    url = "%sapi/GetSeries.php?seriesname=%s&language=%s" % (
        BASE_URL, urllib.quote(series_name_search), _LANGUAGE)
    soup = BeautifulStoneSoup(urllib2.urlopen(url).read())
    matches = []
    for series in soup.data.findAll("series"):
        matches.append(_parse_series(series))
    return matches
def EPISODE(name, cid):
    showname = name
    xbmcplugin.setContent(pluginhandle, 'episodes')
    xbmcplugin.addSortMethod(pluginhandle, xbmcplugin.SORT_METHOD_NONE)
    url = 'http://www.tnt.tv/processors/services/getCollectionByContentId.do?offset=0&sort=&limit=200&id=' + cid
    html = getURL(url)
    tree = BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    episodes = tree.findAll('episode')
    for episode in episodes:
        episodeId = episode['id']
        name = episode.find('title').string
        thumbnail = episode.find('thumbnailurl').string
        plot = episode.find('description').string
        duration = episode.find('duration').string
        try:
            seasonNum = int(episode.find('seasonnumber').string)
            print seasonNum
        except:
            seasonNum = 0
        try:
            episodeNum = int(episode.find('episodenumber').string)
            print episodeNum
        except:
            episodeNum = 0
        if episodeNum == 0 or seasonNum == 0:
            print 'bad season or episode value'
        else:
            name = str(seasonNum) + 'x' + str(episodeNum) + ' - ' + name
            segments = episode.findAll('segment')
            if len(segments) == 0:
                url = episodeId
                mode = 4
                addLink(name, url, mode, thumbnail, plot, seasonNum, episodeNum, showname, duration)
            else:
                url = ''
                for segment in segments:
                    url += segment['id'] + '<segment>'
                mode = 5  # PLAYEPISODE
                addLink(name, url, mode, thumbnail, plot, seasonNum, episodeNum, showname, duration)
    xbmcplugin.endOfDirectory(pluginhandle)
def getFileTypes(url):
    # list filetypes
    p = re.compile('/details/(.*)')
    match = p.findall(url)
    for name in match:
        temp = 'http://www.archive.org/download/' + name + '/' + name + '_files.xml'
        link = getLink(temp)
        tree = BeautifulStoneSoup(link)
        shn = tree.findAll('file', attrs={"name": re.compile('(.+?\.shn$)')})
        m3u = tree.findAll('file', attrs={"name": re.compile('(.+?\.m3u$)')})
        flac = tree.findAll('file', attrs={"name": re.compile('(.+?\.flac$)')})
        mp3 = tree.findAll('file', attrs={"name": re.compile('(.+?64kb\.mp3$)')})
        vbr = tree.findAll('file', attrs={"name": re.compile('(.+?vbr\.mp3$)')})
        if len(m3u) > 0:
            addDir('.m3u Playlists', temp, 7)
        if len(flac) > 0:
            addDir('1. Flac Files', temp, 7)
        if len(mp3) > 0:
            addDir('2. VBR mp3', temp, 7)
        if len(vbr) > 0:
            addDir('3. 64kB mp3', temp, 7)
        if len(shn) > 0:
            addDir('1. Shorten Files', temp, 7)
def parse_config(file_to_read):
    parsed = BeautifulStoneSoup(open(file_to_read).read())
    adapters = parsed.findAll('adapter')
    if not adapters:
        adapters = parsed.findAll('interface')
    host_tag = parsed.find('hostname')
    if host_tag:
        host_name = host_tag.string.lower()
    else:
        host_name = None
    domain_tag = parsed.find('domainname')
    if domain_tag:
        domain_name = domain_tag.string
        if domain_name:
            domain_name = domain_name.lower()
    else:
        domain_name = None
    ip_list = []
    for adapter in adapters:
        mac = (adapter.find('address').string if adapter.find('address') else None)
        if mac:
            mac = mac.replace('-', ':').lower()
        adapter_ips = adapter.findAll('adapterip')
        for adapter_ip_node in adapter_ips:
            if not adapter_ip_node:
                continue
            ip = ''
            for ip_address in adapter_ip_node.find('ip'):
                ip = ip_address.string.strip()
                if not ip:
                    continue
                info = {'host_name': host_name,
                        'domain_name': domain_name,
                        'ip_address': ip,
                        'mac_address': mac}
                if (info not in ip_list) and (ip != '127.0.0.1') and (':' not in ip):
                    ip_list.append(info)
    return ip_list
def response(self):
    """Handle/parse the OnlineNIC API response."""
    soup = BeautifulStoneSoup(self.read())
    response = soup.find('response')
    if response is None:
        raise InvalidResponseError('No <response> container found.')
    contents = {}
    for key in ['code', 'msg', 'value', 'category', 'action', 'cltrid', 'svtrid', 'chksum']:
        value = response.find(key)
        if value is None:
            raise InvalidResponseError('No {} found in response.'.format(key))
        contents[key] = value.string.strip()
    if contents['code'] in ONLINENIC_ERRORS:
        raise ONLINENIC_ERRORS[contents['code']]('{} [{}]'.format(contents['msg'], contents['value']))
    resdata = response.find('resdata')
    if resdata is not None:
        contents['data'] = {}
        for d in resdata.contents:
            if d is not None and d.string.strip():
                key = d.get('name')
                val = d.string.strip()
                if key in contents['data'].keys():
                    if not isinstance(contents['data'][key], list):
                        existing_val = contents['data'][key]
                        contents['data'][key] = []
                        contents['data'][key].append(existing_val)
                    contents['data'][key].append(val)
                else:
                    contents['data'][key] = val
    return contents
def image_path_with_fgdc_to_world_file(image_path, world_file, srs, units="m"):
    image = Image.open(image_path)
    (width, height) = image.size
    xml_path = "%s.xml" % (os.path.splitext(image_path)[0])
    with open(xml_path, "r") as f:
        xml = BeautifulStoneSoup(f)
    north_bound = float(xml.find("northbc").text)
    west_bound = float(xml.find("westbc").text)
    south_bound = float(xml.find("southbc").text)
    east_bound = float(xml.find("eastbc").text)
    srs = "%s" % (srs)
    if not srs.startswith("EPSG:"):
        srs = "EPSG:%s" % (srs)
    (west_bound, north_bound) = latlng_to_srs(north_bound, west_bound, srs, units)
    (east_bound, south_bound) = latlng_to_srs(south_bound, east_bound, srs, units)
    x_pixel_width = (east_bound - west_bound) / width
    y_pixel_width = (south_bound - north_bound) / height
    for l in [x_pixel_width, 0, 0, y_pixel_width, west_bound, north_bound]:
        world_file.write("%s\n" % l)
    return world_file
def search(self, terms):
    torrents = []
    data = {'SearchString': '', 'SearchString1': terms, 'search': 'Search'}
    req = Request(self.search_uri, urlencode(data))
    req.add_header('User-Agent', self.user_agent)
    f = urlopen(req)
    soup = BeautifulStoneSoup(f.read())
    for (c, item) in enumerate(soup.findAll('a', {'class': 'magnet'})):
        if c == 30:
            break
        info = item.findPrevious('a')
        link = self.uri_prefix + info['href']
        item_req = Request(link)
        item_req.add_header('User-Agent', self.user_agent)
        item_f = urlopen(item_req)
        item_soup = BeautifulStoneSoup(item_f.read())
        sp = item_soup.findAll('span', {'class': re.compile('^stat_')})
        if sp:
            sp = [int(i.text.replace(',', '')) for i in sp]
        else:
            sp = [0, 0]
        torrents.append({
            'url': item['href'],
            'name': info.text,
            'seeds': sp[0],
            'leechers': sp[1]
        })
    return torrents
def ask_whatizit(search_sent_list, client=None, pipeline='whatizitSwissprot'):
    """A function which queries the Whatizit tool using the SOAP client.

    Care is taken to ensure that identical sentences are not queried
    multiple times.

    Arguments:
    search_sent_list -- A LIST of sentences to search.
    client = None -- A SOAP client ... If None then one is created on the fly.
    pipeline = 'whatizitSwissprot' -- The pipeline to search.
    """
    if client is None:
        client = generate_whatizit_client()
    resdict = {}
    for sent in search_sent_list:
        if sent in resdict:
            yield resdict[sent]
            continue
        resp = client.service.contact(pipelineName=pipeline, text=sent, convertToHtml=False)
        soup = BeautifulStoneSoup(de_safe_xml(resp))
        if pipeline == 'whatizitSwissprot':
            groups = soup.findAll('z:uniprot')
            if groups:
                res = [(p.contents[0], p['ids'].split(',')) for p in groups]
            else:
                res = None
        elif pipeline == 'whatizitMeshUp':
            groups = soup.findAll('concepts')
            if groups:
                tmp = [x.contents[0].strip() for x in groups]
                ntmp = [x.split(';') for x in tmp]
                meshids = set(x.split(':')[0] for x in chain.from_iterable(ntmp))
                res = [(None, x) for x in sorted(meshids)]
        else:
            raise KeyError, 'Unknown pipeline: %s' % pipeline
        resdict[sent] = res
        yield res
def GET_RTMP(vid):
    url = 'http://www.adultswim.com/astv/mvpd/services/cvpXML.do?id=' + vid
    html = common.getURL(url)
    tree = BeautifulStoneSoup(html, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    print tree.prettify()
    sbitrate = int(common.settings['quality'])
    hbitrate = -1
    files = tree.findAll('file')
    for filenames in files:
        try:
            bitrate = int(filenames['bitrate'])
        except:
            bitrate = 1
        if bitrate > hbitrate and bitrate <= sbitrate:
            hbitrate = bitrate
            filename = filenames.string
    if 'http://' in filename:
        return filename
    else:
        filename = filename[1:len(filename) - 4]
    serverDetails = tree.find('akamai')
    server = serverDetails.find('src').string.split('://')[1]
    # get auth
    tokentype = serverDetails.find('authtokentype').string
    window = serverDetails.find('window').string
    aifp = serverDetails.find('aifp').string
    auth = getAUTH(aifp, window, tokentype, vid, filename.replace('mp4:', ''))
    #swfUrl = 'http://www.tbs.com/cvp/tbs_video.swf swfvfy=true'
    rtmp = 'rtmpe://' + server + '?' + auth + ' playpath=' + filename  #+ " swfurl=" + swfUrl
    return rtmp
def message_cb(word, word_eol, userdata):
    message = word[1]
    # Check whether the message equals !news
    if(message == "!news"):
        # List used to index the server names
        rss_servers_names = ['lifehacker', 'linux-journal', 'revista-info', 'gizmodo', 'lol-cats']
        # Dictionary of servers and links
        rss_servers = {
            'lifehacker': 'http://feeds.gawker.com/lifehacker/full.xml',
            'linux-journal': 'http://feeds.feedburner.com/LinuxJournal-BreakingNews',
            'revista-info': 'http://feeds.feedburner.com/Plantao-INFO',
            'gizmodo': 'http://feeds.gawker.com/gizmodo/full',
            'lol-cats': 'http://feeds.feedburner.com/lolcats/rss',
        }
        # Initialize the http library
        http = httplib2.Http()
        # Make the request to a server chosen at random from the server list
        # - status -> request header
        # - response -> body of the XML file
        status, response = http.request(rss_servers[rss_servers_names[randint(0, len(rss_servers_names) - 1)]])
        # Initialize soup with the XML content
        soup = BeautifulStoneSoup(response)
        # Find all items inside the XML, where each item represents a news entry
        all_news = soup.findAll("item")
        # Get the start of the message up to the first blank space
        message = word[1]
        # Randomly select one news entry from the list
        selected = randint(0, len(all_news) - 1)
        # Send the message on IRC
        xchat.command("ME " + all_news[selected].title.string + " - " + all_news[selected].link.string)
def playitems(self, params):
    print params
    print "@1"
    soup = BeautifulSoup(geturl(params['url']))
    id = dict(it.split('=', 1)
              for it in urllib.unquote(soup.find("embed")['flashvars']).split('&'))['vid']
    if 0:
        soup = BeautifulStoneSoup(
            geturl("http://cosmos.bcst.yahoo.com/rest/v2/pops;id=%s;lmsoverride=1" % id))
        val = {
            "title": soup.channel.item.title,
            "descr": soup.channel.item.description,
            "date": soup.channel.item.find("media:pubStart"),
        }
    print "@@"
    soup = BeautifulStoneSoup(
        geturl("http://cosmos.bcst.yahoo.com/rest/v2/pops;id=%s;lmsoverride=1;element=stream;bw=1200" % id))
    print soup
    item = soup.channel.item.find('media:content')
    val = {
        "url": "%s playpath=%s swfurl=%s swfvfy=true" % (
            item['url'], item['path'], "http://d.yimg.com/m/up/ypp/au/player.swf"),
        "duration": item['duration'],
        "name": re.sub(r'<!\[CDATA\[([^\]+])\]\]', '', soup.channel.item.title.contents[0])
    }
    print("@2", val)
    if "record" in params:
        self.record(val)
    else:
        self.play(val)
def inlines(value, return_list=False):
    try:
        from BeautifulSoup import BeautifulStoneSoup
    except ImportError:
        from beautifulsoup import BeautifulStoneSoup
    content = BeautifulStoneSoup(
        value,
        selfClosingTags=['inline', 'img', 'br', 'input', 'meta', 'link', 'hr'])
    inline_list = []
    if return_list:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            inline_list.append(rendered_inline['context'])
        return inline_list
    else:
        for inline in content.findAll('inline'):
            rendered_inline = render_inline(inline)
            if rendered_inline:
                inline.replaceWith(
                    BeautifulStoneSoup(
                        render_to_string(rendered_inline['template'],
                                         rendered_inline['context'])))
            else:
                inline.replaceWith(BeautifulStoneSoup(''))
        return mark_safe(content)
def popular(self, terms):
    torrents = []
    url = "http://1337x.to/popular-" + terms
    xbmc.log("url: %s" % (url), xbmc.LOGSEVERE)
    f = urlopen(url)
    soup = BeautifulStoneSoup(f.read())
    for table in soup.findAll('table', {'class': 'table-list table table-responsive table-striped'}):
        for row in table.find('tbody').findAll('tr'):
            xbmc.log("row: %s" % (row), xbmc.LOGSEVERE)
            details = row.find('td', {"class": "coll-1 name"})
            size = row.find('td', {"class": re.compile("coll-4.*")})
            # details = row.find('td[class*="coll-1 name"]')
            # size = row.find('td[class*="coll-4"]')
            # details = row.find('td', class_='coll-1 name')
            # size = row.find('td', class_=re.compile('coll-4 .*'))
            name = details.text
            test = 'http://1337x.to' + details.find('a', {"class": None})['href']
            magnet = ""
            seeds = 0
            leechers = 0
            torrents.append({
                'url': test,
                'name': name.encode('ascii', 'ignore').decode('ascii'),
                'size': size.text,
                'seeds': seeds,
                'leechers': leechers,
                'magnet': magnet,
            })
    return torrents
def initServerInfoBase(fileName):
    """
    @description: Initializes soup for the Beautiful Soup parser. Reads the existing
                  data from the fileName parameter.
    @todo: None
    @param xml: String, Name of file to be loaded in soup.
    @return: Boolean, True if successful, else False
    """
    if os.path.exists(fileName):
        try:
            f = open(fileName, "r")
        except:
            return None, False
        xml = f.read()
        f.close()
        soup = BeautifulStoneSoup(xml)
        serverinfolist = soup.findAll("serverinfo")
    else:
        serverinfolist = []
        soup = BeautifulSoup()
        xml = "null"
    if len(serverinfolist) == 0:
        serverinfo = Tag(soup, "serverinfo")
        soup.insert(0, serverinfo)
    return soup, True
def getEpsLegendados(url):
    link = openURL(url)
    soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
    eps = soup.findAll("div", {"class": "well well-sm"})
    plotE = re.findall('<span itemprop="description">\s*(.*?)</span>', link,
                       re.DOTALL | re.MULTILINE)[0]
    plotE = unicode(
        BeautifulStoneSoup(plotE,
                           convertEntities=BeautifulStoneSoup.HTML_ENTITIES)).encode('utf-8')
    plotE = BeautifulSoup(plotE.replace("<br>", " ")).text
    totE = len(eps)
    try:
        anterior = re.findall('href="(.*?)">Voltar</a></li>', link)[0]
        primeira = re.findall('href="(.*?)">Primeiro</a></li>', link)[0]
        proxima = re.findall('href="(.*?)">Avançar</a></li>', link)[0]
        pa = re.findall('([0-9]+?)$', anterior)[0]
        pd = re.findall('([0-9]+?)$', primeira)[0]
        pp = re.findall('([0-9]+?)$', proxima)[0]
        if (pp != '2'):
            addDir('. Primeira Página', base + primeira, 31, artfolder + 'pagantr.jpg')
        if (pp != '2'):
            addDir('<< Página Anterior ' + pa, base + anterior, 31, artfolder + 'pagantr.jpg')
    except:
        pass
    for ep in eps:
        try:
            titE = ep.img["title"].encode('ascii', 'ignore')
            urlE = base + ep.a["href"]
            if ep.a.img.has_key("src"):
                imgE = ep.a.img["src"]
            else:
                imgE = ep.a.img["data-cfsrc"]
            addDir(titE, urlE, 100, imgE, False, totE, plotE)
        except:
            pass
    try:
        ultima = re.findall('href="(.*?)">Último</a></li>', link)[0]
        pu = re.findall('([0-9]+?)$', ultima)[0]
        if (pu != '1'):
            addDir('Página Seguinte ' + pp + ' >>', base + proxima, 31, artfolder + 'proxpag.jpg')
        if (pu != '1'):
            addDir('Última Página ' + pu + ' >>', base + ultima, 31, artfolder + 'proxpag.jpg')
    except:
        pass
def main():
    """Generate a list of all the morphological tags in an XML document."""
    in_file = codecs.open("herodotus.xml", "rU", "utf-8")
    print "Parsing the input file with BeautifulStoneSoup..."
    print
    soup = BeautifulStoneSoup(in_file)
    print "Finding all the tokens..."
    print
    tokens = soup.findAll('w')
    out_file = codecs.open("HDT-morph-list.txt", "w", "utf-8")
    out_file2 = codecs.open("HDT-pos-list.txt", "w", "utf-8")
    unique_tags = Set([])
    short_tags = Set([])
    for token in tokens:
        try:
            tag = token['pos']
            if tag != "":
                unique_tags.add(tag)
                short_tag = tag[:2]
                short_tags.add(short_tag)
        except KeyError:
            pass
    for tag in unique_tags:
        print >> out_file, tag
    for tag in short_tags:
        print >> out_file2, tag