def csv2po(csv_file):
    """Convert a file-like object `csv_file` to a polib.POFile object"""
    po = polib.POFile()

    # Reset to reading from the beginning of the file
    csv_file.seek(0)
    csv_reader = csv.reader(csv_file)

    for count, row in enumerate(csv_reader):
        # Skip the first two header rows
        if count < len(csv_header_rows):
            continue

        msgid = unescape(row[0])
        msgid_plural = unescape(row[1])
        msgctxt = row[2]
        msgstr, msgstr_plural = undo_plurals(msgid_plural, row[3])

        entry = polib.POEntry()
        entry.msgid = msgid
        if msgid_plural:
            entry.msgid_plural = msgid_plural
        if msgctxt:
            entry.msgctxt = msgctxt
        if msgstr:
            entry.msgstr = msgstr
        if msgstr_plural:
            entry.msgstr_plural = msgstr_plural
        po.append(entry)

    return po
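# Usage sketch (not part of the original module): assumes the CSV was produced
# by the matching PO-to-CSV exporter and that polib is installed; the file
# names below are hypothetical.
if __name__ == '__main__':
    with open('translations.csv', 'rb') as csv_file:
        po = csv2po(csv_file)
    po.save('translations.po')  # polib.POFile.save() writes the catalog to disk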
def parse_video_response(self, response):
    data = simplejson.loads(response.read())
    items = list()
    count = 0
    for video in data['videos']:
        vid_url, thumb_url, category_id, dur = self.get_video_urls(video)
        count += 1
        if vid_url is None:
            continue
        meta = video.get('meta')
        items.append({
            'label': unescape(meta.get('title')),
            'thumbnail': thumb_url,
            'info': {
                'plot': unescape(meta.get('preamble') or ''),
                'originaltitle': unescape(meta.get('title') or '???'),
                'tagline': unescape(meta.get('preamble') or ''),
                'aired': self.get_date(meta.get('timePublished')),
                'duration': self.get_duration(meta.get('duration'))
            },
            'stream_info': {
                'video': {
                    'duration': meta.get('duration', 0)
                }
            },
            'path': vid_url,
            'is_playable': True,
        })
    return items, (count < self.PER_PAGE)
def AddDir(name, mode, url=None, image=None, fanart=None, isFolder=False,
           isPlayable=False, desc='', plot='', contextMenu=None,
           replaceItems=False, infoLabels=None):
    try:
        name = name.encode('utf-8')
        url = utils.fixUnicode(utils.unescape(url))
    except:
        pass

    try:
        if not validateMode(mode, name):
            return

        if not fanart:
            fanart = FANART

        name = name.replace('_', ' ')
        infoLabels = {'title': name, 'fanart': fanart, 'description': desc, 'plot': plot}
        image = utils.patchImage(mode, image, url, infoLabels)

        u = ''
        u += '?mode=' + str(mode)
        u += '&title=' + urllib.quote_plus(name)
        if image:
            u += '&image=' + urllib.quote_plus(image)
        if url:
            u += '&url=' + urllib.quote_plus(url).replace('%25LB%25', '%')

        APPLICATION.addDir(utils.unescape(name), mode, u, image, isFolder,
                           isPlayable, contextMenu=contextMenu,
                           replaceItems=replaceItems, infoLabels=infoLabels)
    except Exception, e:
        raise
def load(request):
    if not request.user.is_authenticated():
        return HttpResponseRedirect("/accounts/login")
    if request.method == 'POST':
        form = ImportDeliciousForm(request.POST, request.FILES)
        if form.is_valid():
            db = get_database()[Bookmark.collection_name]
            html = request.FILES['exported'].read().decode('utf8')
            soup = BeautifulSoup(html)
            for item in soup.findAll('dt'):
                desc = ''
                next = item.findNextSiblings()
                if next:
                    next = next[0]
                    if 'name' in dir(next) and next.name == 'dd':
                        desc = unescape(u''.join(imap(unicode, next.contents)))
                db.Bookmark({'url': urlSanitize(item.a['href']),
                             'seq': getNextVal('seq'),
                             'tags': item.a['tags'].split(','),
                             'user': unicode(request.user),
                             'created': datetime.fromtimestamp(float(item.a['add_date'])),
                             'private': item.a['private'] == '1',
                             'title': unescape(unicode(item.a.string)),
                             'notes': unicode(desc)}).save()
            return HttpResponseRedirect('/u/%s/' % request.user)
    else:
        form = ImportDeliciousForm()
    return render_to_response('import.html', {
        'form': form,
    }, context_instance=RequestContext(request))
def __reload(self, values):
    self.__raw.__dict__.update(values)
    self.firstname = unescape(self.__raw.firstname)
    self.lastname = unescape(self.__raw.lastname)
    self.company = unescape(self.__raw.company)
    self.colleagues = self.__raw.colleagues
    self.id = int(self.__raw.id_user)
    self.lang = LANG_ID[int(self.__raw.lang) + 1]
def from_text(cls, text):
    match = cls.token_re.match(text)
    assert match, 'cannot parse Token from {}'.format(text)
    groups = match.groupdict()
    word = unescape(groups['word'])
    lemma = unescape(groups['lemma'])
    pos = unescape(groups['pos'])
    return cls(word, lemma, pos)
def from_text(cls, text):
    match = cls.pred_re.match(text)
    assert match, 'cannot parse Predicate from {}'.format(text)
    groups = match.groupdict()
    word = unescape(groups['word'])
    lemma = unescape(groups['lemma'])
    pos = unescape(groups['pos'])
    neg = True if groups['neg'] is not None else False
    prt = unescape(groups['prt']) if groups['prt'] is not None else ''
    return cls(word, lemma, pos, neg, prt)
def from_text(cls, text):
    match = cls.arg_re.match(text)
    assert match, 'cannot parse Argument from {}'.format(text)
    groups = match.groupdict()
    word = unescape(groups['word'])
    lemma = unescape(groups['lemma'])
    pos = unescape(groups['pos'])
    ner = groups['ner'] if groups['ner'] != 'NONE' else ''
    entity_idx = int(groups['entity_idx']) if groups['entity_idx'] else -1
    mention_idx = \
        int(groups['mention_idx']) if groups['mention_idx'] else -1
    return cls(word, lemma, pos, ner, entity_idx, mention_idx)
def collect_album_info(album_soup):
    url = 'http://tut-audio.su'
    album_dict = {}
    album_dict['name'] = unescape(
        album_soup.find(id="titlealb").get_text()[:-14])
    album_dict['year'] = album_soup.find(
        id="dopinfoalb").find('p').find('b').get_text()
    if album_dict['year']:
        album_dict['year'] = int(album_dict['year'])
    album_dict['cover_url'] = url + album_soup.find(id="imagesalb").get('src')
    t = album_soup.find_all("div", "player")[0]
    artist, _ = t['data-title'].split(' — ')
    artist = unescape(artist)
    album_dict['url'] = url + album_url
    return album_dict, artist
def __init__(self, uid, summary, dtstamp=None, created=None, last_modified=None,
             related_to=None, completed=None, percent_complete=None,
             x_kde_ktimetracker_totalsessiontime=None,
             x_kde_ktimetracker_totaltasktime=None,
             x_kde_ktimetracker_bctype=None):
    self.uid = uid
    self.summary = unescape(summary)
    self.dtstamp = dtstamp
    self.created = created
    self.last_modified = last_modified
    self.related_to = related_to
    self.completed = completed
    self.percent_complete = percent_complete
    self.x_kde_ktimetracker_totalsessiontime = x_kde_ktimetracker_totalsessiontime
    self.x_kde_ktimetracker_totaltasktime = x_kde_ktimetracker_totaltasktime
    self.x_kde_ktimetracker_bctype = x_kde_ktimetracker_bctype
    self.todos = {}
def gen_solution(cur, td, num, p_id):
    # import pdb
    # pdb.set_trace()
    global testcase_id
    global testcase_crawled
    if num == 0:
        column_name = 'java'
    elif num == 1:
        column_name = 'cpp'
    elif num == 2:
        column_name = 'csharp'
    else:
        column_name = 'VB'
    cur.execute('select %s from problem where id = %d' % (column_name, p_id))
    if cur.fetchall()[0][0] != None:
        return
    p = compile('"/stat\?c=problem_solution.*?"')
    l = p.findall(td)
    if len(l) == 1:
        url = topcoder_site_url + unescape(l[0][1:-1])
        try:
            page = topcoder.get_page(url)
        except Exception, e:
            print url, e
            return
        p = compile('<TD CLASS="problemText" COLSPAN="8" VALIGN="middle" ALIGN="left">[\d\D]*?</TD>')
        try:
            code = escape_string(p.findall(page)[0])
        except Exception, e:
            print 'No code found:', url, e
            return
def fetch_bioguide_page(bioguide, force):
    url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide
    cache = "legislators/bioguide/%s.html" % bioguide
    try:
        body = download(url, cache, force)

        # Fix a problem?
        body = body.replace("Á\xc2\x81", "Á")

        # Entities like &#146; are in Windows-1252 encoding. Normally lxml
        # handles that for us, but we're also parsing HTML. The lxml.html.HTMLParser
        # doesn't support specifying an encoding, and the lxml.etree.HTMLParser doesn't
        # provide a cssselect method on element objects. So we'll just decode ourselves.
        body = utils.unescape(body, "Windows-1252")

        dom = lxml.html.parse(io.StringIO(body)).getroot()
    except lxml.etree.XMLSyntaxError:
        raise Exception("Error parsing: " + url)

    # Sanity check.
    if len(dom.cssselect("title")) == 0:
        raise Exception("No page for bioguide %s!" % bioguide)

    return dom
def insertPicDetail(self, picDetailModel):
    cur = self.con.cursor()
    try:
        sql = '''INSERT INTO admin_picdetail (`pid`, `pic_path`, `height`, `width`, `pic_desc`, `categoary_id`, `albunm_name`, `albunm_id`, `user_id`, `time`, `taoke_num_iid`, `taoke_title`, `taoke_price`) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')''' % (
            picDetailModel.pId, picDetailModel.picPath, picDetailModel.height,
            picDetailModel.width, picDetailModel.desc, picDetailModel.cateId,
            picDetailModel.albunmName, picDetailModel.albunmId, picDetailModel.userId,
            picDetailModel.time, picDetailModel.taokeNumIID, picDetailModel.title,
            picDetailModel.price)
        sql = utils.unescape(sql).encode('utf-8')
        cur.execute(sql)
        self.con.commit()
    except Exception, what:
        print '========-------=======', what
        # print sql
        pass
def get_categories(self, root_id=0, only_series=False):
    categories = self.get_category_tree()
    root = int(root_id)
    matches = []
    for category in categories:
        id = category.get('id')
        if category.get('showCategory') is False:
            continue
        if only_series is True and category.get('isSeries') is not True:
            continue
        if only_series is False and category.get('parentId') != root:
            continue
        matches.append({
            'label': unescape(category.get('title')),
            'path': self.plugin.url_for(
                'show_category', id=str(id), mode='all'
            ),
            'id': id
        })
    return matches
def view(request, shurl):
    item = getItemByUrl(shurl)
    item['shurl'] = base62.from_decimal(item['seq'])
    if request.GET.get('format', '') == 'json':
        del item['user']
        res = {
            'url': unicode(item['url']),
            'title': unicode(item['title']),
            'created': tuple(item['created'].timetuple()),
            'private': item['private'],
            'notes': unicode(unescape(item['notes'])),
            'tags': item['tags'],
        }
        return HttpResponse(json.dumps(res), mimetype="application/json")
    else:
        item['snapshot'] = '' if not item.get('snapshot') else item.get('snapshot')[0]
        tpl = 'view.html'
        if request.GET.get('raw', None):
            tpl = 'view-bare.html'
        return render_to_response(tpl, {
            'item': item,
        }, context_instance=RequestContext(request))
def fetch_albums(url):
    html = urlopen(url)
    found = re.findall(r'<td class="Title".*?<a href="/music/url\?q=(/music/album\?id%3D.*?)".*?>(.*?)</a>', html)
    print '# albums:', len(found), urllib.unquote(url)
    for link, title in found:
        link = 'http://www.google.cn' + link.split('&')[0]
        title = unescape(title)
        print urllib.unquote(link), '|', title
    found = re.findall(r'<td>.*?<a class="imglink" href="/music/url\?q=(.*?)"', html)
    pages = ['http://www.google.cn' + urllib.unquote(i.split('&')[0]) for i in found]
    cache[url] = True
    for page in pages:
        if page not in cache:
            cache[page] = False
    another_page = None
    for page, done in cache.iteritems():
        if not done:
            another_page = page
            break
    if another_page:
        fetch_albums(another_page)
def _parse_result(self, buf, properties={}):
    count = 0
    result = ""
    try:
        self.redis_conn.send(buf)
    except Exception as e:
        if e.args[0] == errno.EPIPE and count < 3:
            self.setup()
            count += 1
            time.sleep(1)
        else:
            raise
    while True:
        recv = self.redis_conn.recv(1024000)
        if recv:
            result += recv
        if not recv or recv.endswith("\r\n\r\n"):
            break
    a = result.split("#-*-#")
    code, info, data = a
    data = data[:-4]
    if code == "200":
        return handle_safely(properties.get("recv", default_recv))(unescape(data))
    elif code == "502":
        return properties.get("result", data)
    else:
        raise RedisError("%s:%s, data: %s" % (code, info, data))
def settings_to_log( self ):
    try:
        utils.log( "Settings" )
        setting_values = self.read_settings_xml()
        for k, v in sorted( setting_values.items() ):
            utils.log( "%30s: %s" % ( k, str( utils.unescape( v.decode('utf-8', 'ignore') ) ) ) )
    except:
        traceback.print_exc()
def get_musicbrainz_artist_id(artist_search, limit=1, alias=False):
    name = ""
    id = ""
    sortname = ""
    artist_name = smart_unicode(
        (artist_search.replace('"', '?').replace('&', 'and')))
    if not alias:
        url = artist_url % (server, quote_plus(artist_name.encode("utf-8")), limit)
    else:
        url = alias_url % (server, quote_plus(artist_name.encode("utf-8")), limit)
    htmlsource = get_html_source(url, "", save_file=False)
    match = re.search('''<artist(.*?)</artist>''', htmlsource)
    if match:
        score_match = re.search('''score="(.*?)"''', htmlsource)
        name_match = re.search('''<name>(.*?)</name>''', htmlsource)
        id_match = re.search('''<artist id="(.*?)"(?:.*?)>''', htmlsource)
        if not id_match:
            id_match = re.search('''<artist (?:.*?)id="(.*?)">''', htmlsource)
        sort_name_match = re.search('''<sort-name>(.*?)</sort-name>''', htmlsource)
        if score_match:
            score = score_match.group(1)
        if name_match:
            name = unescape(smart_unicode(name_match.group(1)))
        if id_match:
            id = id_match.group(1)
        if sort_name_match:
            sortname = unescape(smart_unicode(sort_name_match.group(1)))
        log("Score : %s" % score, xbmc.LOGDEBUG)
        log("Id : %s" % id, xbmc.LOGDEBUG)
        log("Name : %s" % name, xbmc.LOGDEBUG)
        log("Sort Name : %s" % sortname, xbmc.LOGDEBUG)
    else:
        if not alias:
            log("No Artist ID found trying aliases: %s" % artist_search, xbmc.LOGDEBUG)
            name, id, sortname = get_musicbrainz_artist_id(artist_search, limit, True)
        else:
            log("No Artist ID found for Artist: %s" % artist_search, xbmc.LOGDEBUG)
    xbmc.sleep(mb_delay)
    return name, id, sortname
def get_musicbrainz_artists(artist_search, limit=1):
    log("Artist: %s" % artist_search, xbmc.LOGDEBUG)
    score = ""
    name = ""
    id = ""
    sortname = ""
    artists = []
    artist_name = smart_unicode(
        (artist_search.replace('"', '?').replace('&', 'and')))
    url = artist_url % (server, quote_plus(artist_name.encode("utf-8")), limit)
    htmlsource = get_html_source(url, "", save_file=False, overwrite=False)
    match = re.findall('''<artist(.*?)</artist>''', htmlsource)
    if match:
        for item in match:
            artist = {}
            artist["score"] = ""
            artist["name"] = ""
            artist["id"] = ""
            artist["sortname"] = ""
            score_match = re.search('''score="(.*?)"''', item)
            name_match = re.search('''<name>(.*?)</name>''', item)
            id_match = re.search('''id="(.*?)"(?:.*?)>''', item)
            if not id_match:
                id_match = re.search('''id="(.*?)">''', item)
            sort_name_match = re.search('''<sort-name>(.*?)</sort-name>''', item)
            if score_match:
                artist["score"] = score_match.group(1)
            if name_match:
                artist["name"] = unescape(smart_unicode(name_match.group(1)))
            if id_match:
                artist["id"] = id_match.group(1)
            if sort_name_match:
                artist["sortname"] = unescape(
                    smart_unicode(sort_name_match.group(1)))
            log("Score : %s" % artist["score"], xbmc.LOGDEBUG)
            log("Id : %s" % artist["id"], xbmc.LOGDEBUG)
            log("Name : %s" % artist["name"], xbmc.LOGDEBUG)
            log("Sort Name : %s" % artist["sortname"], xbmc.LOGDEBUG)
            artists.append(artist)
    else:
        log("No Artist ID found for Artist: %s" % repr(artist_search), xbmc.LOGDEBUG)
    xbmc.sleep(mb_delay)
    return artists
def _locate(self, town_name):
    town_name = utils.unescape(town_name.strip())
    if town_name not in self.location_cache:
        try:
            self.location_cache[town_name] = self.geo_locator.geocode(town_name)
        except geopy.exc.GeocoderTimedOut:
            print "Geocoder Timeout."
            return None
    return self.location_cache[town_name]
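# Standalone sketch of the same cache-then-geocode pattern, with geopy wired up
# explicitly (the Nominatim user_agent and the module-level names below are
# illustrative, not taken from the original spider):
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

_geo_locator = Nominatim(user_agent="jtr-ranking-crawler")
_location_cache = {}

def locate(town_name):
    town_name = town_name.strip()
    if town_name not in _location_cache:
        try:
            # cache the result (including None for unknown towns) so every
            # town is geocoded at most once per run
            _location_cache[town_name] = _geo_locator.geocode(town_name)
        except GeocoderTimedOut:
            print "Geocoder Timeout."
            return None
    return _location_cache[town_name]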
def getFavourites(file, limit=10000, validate=True, superSearch=False):
    import xbmcgui

    file = xbmc.translatePath(file)

    xml = '<favourites></favourites>'
    if os.path.exists(file):
        fav = open(file, 'r')
        xml = fav.read()
        fav.close()

    items = []
    faves = re.compile('<favourite(.+?)</favourite>').findall(xml)

    for fave in faves:
        fave = fave.replace('&quot;', '&_quot_;')
        fave = fave.replace('\'', '"')
        fave = utils.unescape(fave)
        fave = fave.replace('name=""', '')
        try:
            name = re.compile('name="(.+?)"').findall(fave)[0]
        except:
            name = ''
        try:
            thumb = re.compile('thumb="(.+?)"').findall(fave)[0]
        except:
            thumb = ''
        try:
            cmd = fave.split('>', 1)[-1]
        except:
            cmd = ''

        #name = utils.Clean(name.replace('&_quot_;', '"'))
        name = name.replace('&_quot_;', '"')
        thumb = thumb.replace('&_quot_;', '"')
        cmd = cmd.replace('&_quot_;', '"')

        add = False
        if superSearch:
            add = isValid(cmd)
        elif (SHOWUNAVAIL) or (not validate) or isValid(cmd):
            add = True

        if add:
            cmd = upgradeCmd(cmd)
            if cmd.startswith('PlayMedia'):
                option = 'mode'
                try:
                    mode = int(favourite.getOption(cmd, option))
                except:
                    win = xbmcgui.getCurrentWindowId()
                    cmd = updateSFOption(cmd, 'winID', win)
            items.append([name, thumb, cmd])

        if len(items) > limit:
            return items

    return items
def check_url(url, geturl=False):
    send = []
    try:
        o = urllib.urlopen(url)
        ctype, clength = o.info().get("Content-Type"), o.info().get("Content-Length")
        if o.info().gettype() == "text/html":
            title = 'Pas de titre'
            html = o.read(1000000)
            try:
                SoupList = BeautifulSoup(utils.unescape(html),
                                         parseOnlyThese=SoupStrainer('title'))
            except UnicodeDecodeError:
                SoupList = BeautifulSoup(utils.unescape(html.decode("latin1", "ignore")),
                                         parseOnlyThese=SoupStrainer('title'))
            try:
                titles = [title for title in SoupList]
                title = utils.xhtml2text(titles[0].renderContents())
            except IndexError:
                title = "Pas de titre"
            except HTMLParseError:
                pass
            if geturl:
                send.append("%s : [Lien] Titre : %s" % (o.geturl(), " ".join(title.split())))
            else:
                send.append("[Lien] Titre : %s" % " ".join(title.split()))
        else:
            send.append("[Lien] Type: %s, Taille : %s octets" % (ctype, clength))
        o.close()
    except IOError as error:
        if error[1] == 401:
            send.append("Je ne peux pas m'authentifier sur %s :'(" % url)
        elif error[1] == 404:
            send.append("%s n'existe pas !" % url)
        elif error[1] == 403:
            send.append("Il est interdit d'accéder à %s !" % url)
        else:
            send.append("Erreur %s sur %s" % (error[1], url))
    except httplib.InvalidURL:
        send.append("L'URL %s n'est pas valide !" % url)
    return send
def to_str(self, i=1):
    s = "{}:\n".format(type(self).__name__)
    for field in list(self._fields) + self.extra_fields:
        value = self.__getattribute__(field)
        if isinstance(value, Tree):
            s += "{}{}:{}".format(self.unit * i, field, value.to_str(i + 1))
        elif isinstance(value, list):
            s += "{}{}:\n".format(self.unit * i, field)
            for v in value:
                if isinstance(v, Tree):
                    s += "{}- {}".format(self.unit * i, v.to_str(i + 1))
                else:
                    s += "{}- {}\n".format(self.unit * i, utils.unescape(v))
        else:
            s += "{}{}: {}\n".format(self.unit * i, field, utils.unescape(value))
    return s
def post(self):
    text = self.request.get('text')
    # fetch the URL up front so the error log below can reference it even
    # when no text was posted
    conver_url = utils.unescape(self.request.get('url'))
    if text:
        conver = Conver.get_for_url(conver_url)
        message = Message(author=PermaUser.get_current_permauser(),
                          text=text, conver=conver)
        message.put()
        self.distribute_message(message)
    else:
        logging.error("No message '%s' saved for %s", text, conver_url)
def parse_starting_page(self, response):
    ranking = 0
    for sel in response.xpath('//div[@class="content"]/table/tr'):
        team_link = sel.xpath('td/a/@href').extract_first()
        if team_link is not None:
            team_name = sel.xpath('td/a/text()').extract_first()
            data = sel.xpath('td/text()').extract()
            ranking_item = JtrTeamRankingItem()
            ranking_item['team_name'] = utils.unescape(team_name)
            if len(data) == 4:
                ranking, city, tournaments, points = data
            else:
                city, tournaments, points = data
            ranking_item['ranking'] = int(ranking.split("/")[0].strip().strip("."))
            ranking_item['hometown'] = utils.unescape(city)
            ranking_item['points'] = float(points)
            ranking_item['number_of_tournaments'] = utils.unescape(tournaments)
            ranking_item['crawl_date'] = datetime.datetime.now()
            yield ranking_item
            yield scrapy.Request(response.urljoin(team_link), callback=self.parse_team_site)
def addtoKodiFavorites(command, name, thumbnail):
    import xml.etree.ElementTree
    from utils import unescape
    #adding to favorites involves 3 steps:
    #  1.) add the favorite via jsonrpc (script params not included)
    #  2.) modify the favourites.xml to include script params <-- (kodi18 leia alpha1) i think there is another favourites file or this file is cached until another favorite is added
    #  3.) ??? <-- adding another favorite will delete the first one (until kodi is restarted) need to find a way for kodi to reload the modified favourite.xml

    #http://kodi.wiki/view/JSON-RPC_API/v8#Favourites
    #schema=xbmc.executeJSONRPC('{"jsonrpc": "2.0", "method": "JSONRPC.Introspect", "id": 1}')
    #log(repr(schema))
    favorite_was_found=False
    #add_dummy_favorite()

    temp_command='script.reddit.reader'   #can't add script favorites with parameter using jsonrpc
    saved_command='RunScript("script.reddit.reader")'

    json_rpc_command={"jsonrpc": "2.0",
                      "method": "Favourites.AddFavourite",
                      'params': {'title': name,
                                 'type': 'script',
                                 'path': temp_command,
                                 'thumbnail': thumbnail,
                                 },
                      'id': '1'
                      }

    a=xbmc.executeJSONRPC(json.dumps(json_rpc_command))
    #log(repr(a))
    a=json.loads(a)
    if a.get('result','')=="OK":
        log('Favourite added')

        #now that we've created the favorite, we edit it to add parameters
        favorites_xml = xbmc.translatePath(os.path.join(addon.getAddonInfo('profile'), '..','..','favourites.xml'))
        if os.path.exists(favorites_xml):
            #log('{0} exists'.format(favorites_xml) )
            et = xml.etree.ElementTree.parse(favorites_xml)
            root=et.getroot()
            for f in root.findall('favourite'):
                #the name attribute is escape encoded in the xml file.
                fav_name=unescape( f.get('name') )  #replaces &amp; to & etc.
                fav_cmd=f.text
                #log('*a*'+repr(name) + ' ' + saved_command)
                #log('*b*'+repr(fav_name) + ' ' + fav_cmd )
                #log('---')
                if (fav_name==name) and (fav_cmd==saved_command):
                    log('Favourite entry found {0}'.format(fav_name) )
                    favorite_was_found=True
                    f.text=command

            if favorite_was_found:
                et.write(favorites_xml)
                xbmc_notify(translation(32028), fav_name, icon=thumbnail)
def extract_tweets(tweets, cmd_line=False):
    """
    prints the tweets from
    tweets: list of tweet dicts
    """
    tweet_texts = []
    for tweet in tweets:
        text = get_tweet(tweet)
        if cmd_line:
            text = text.encode('unicode-escape')
        text = ununicode(text)
        text = unescape(text)
        tweet_texts.append(parser(text))
    return tweet_texts
def _build_show_summary(self, data, show_status=False, pre_rating='',
                        post_rating='', ratings_pos='front',
                        preserve_rating=False):
    out = []
    star = unescape("&#9733;")
    sep = " | "

    status = _get(data, 'status')
    plot = _get(data, 'plot')
    alt_ratings = _get(data, 'alt_ratings')
    rating = _get(data, 'rating', 0.0)

    if show_status and status:
        out.append('Status: {}'.format(status))
    if plot:
        out.append(plot)
    if alt_ratings:
        buf = []
        for source, _rating in alt_ratings:
            buf.append("{}: {}".format(source, _rating))
        piece = sep.join(buf)
        if ratings_pos == 'front':
            out.insert(0, star + " " + piece + " " + star + "\n\n")
        else:
            out.append("\n\n" + star + " " + piece + " " + star)
    if preserve_rating:
        tmp = unescape("{}{:.1f}{}".format(pre_rating, rating, post_rating))
        out.insert(0, tmp)
    return sep.join(out)
def get_musicbrainz_artists( artist_search, limit=1 ):
    log( "Artist: %s" % artist_search, xbmc.LOGDEBUG )
    score = ""
    name = ""
    id = ""
    sortname = ""
    artists = []
    artist_name = smart_unicode( artist_search.replace( '"', '?' ) )
    url = artist_url % ( server, quote_plus( artist_name.encode("utf-8") ), limit )
    htmlsource = get_html_source( url, "", save_file = False, overwrite = False )
    match = re.findall( '''<artist(.*?)</artist>''', htmlsource )
    if match:
        for item in match:
            artist = {}
            artist["score"] = ""
            artist["name"] = ""
            artist["id"] = ""
            artist["sortname"] = ""
            score_match = re.search( '''score="(.*?)"''', item )
            name_match = re.search( '''<name>(.*?)</name>''', item )
            id_match = re.search( '''id="(.*?)"(?:.*?)>''', item )
            if not id_match:
                id_match = re.search( '''id="(.*?)">''', item )
            sort_name_match = re.search( '''<sort-name>(.*?)</sort-name>''', item )
            if score_match:
                artist["score"] = score_match.group(1)
            if name_match:
                artist["name"] = unescape( smart_unicode( name_match.group(1) ) )
            if id_match:
                artist["id"] = id_match.group(1)
            if sort_name_match:
                artist["sortname"] = unescape( smart_unicode( sort_name_match.group(1) ) )
            log( "Score : %s" % artist["score"], xbmc.LOGDEBUG )
            log( "Id : %s" % artist["id"], xbmc.LOGDEBUG )
            log( "Name : %s" % artist["name"], xbmc.LOGDEBUG )
            log( "Sort Name : %s" % artist["sortname"], xbmc.LOGDEBUG )
            artists.append(artist)
    else:
        log( "No Artist ID found for Artist: %s" % repr( artist_search ), xbmc.LOGDEBUG )
    xbmc.sleep( mb_delay )
    return artists
def artist(url):
    html = urlopen(url)
    found = re.findall(r'<a href="/music/url\?q=(/music/album\?.*?)&.*?>(.*?)</a>',
                       html.split('所有专辑', 1)[1])
    albums = dict(found)
    artist = trim_title(html)
    print artist, 'albums', len(albums)
    for href, title in sorted(albums.items(), lambda i, j: cmp(i[1], j[1])):
        url = 'http://www.google.cn%s' % urllib.unquote(href)
        print '%s |%s' % (url, unescape(title))
def _message_handler(message):
    mention = '@gvobot'
    print(message)

    # Strip mention if it's at the beginning
    if message == mention:
        message = ''
    elif message.startswith(mention):
        # Remove the extra space added after the mention, too
        message = message[len(mention) + 1:]

    # Unescape message (skype encodes &, <, >, ', and ")
    message = unescape(message)

    # Be snarky when no message is sent; otherwise, S.C.I.E.N.C.E.
    if len(message) == 0:
        response = 'Has anyone really been far even as decided to use ' \
                   'even go want to do look more like?'
    elif FIXED_RESPONSES.get(message.lower(), None) is not None:
        response = FIXED_RESPONSES[message.lower()]
    elif message.startswith('!number'):
        usage = 'Usage: !number [<start num> <end num>]'
        args = message.split()
        if len(args) == 1:
            response = str(random.randint(1, 6))
        elif len(args) == 3:
            try:
                start = int(args[1])
                end = int(args[2])
                response = str(random.randint(start, end))
            except ValueError:
                response = usage
        else:
            response = usage
    elif message.startswith('!song'):
        args = message.split()
        if len(args) == 1:
            song = get_random_song()
            response = song.to_message()
        else:
            response = 'Usage: !song'
    else:
        response = sciencify(message)

    # Allow bot to do actions with /me
    if response.startswith('/M.E. '):
        response = response.replace('/M.E.', '/me', 1)

    # The bot's name is unscienceable.
    response = response.replace('@G.V.O.B.O.T.', '@gvobot')

    print(response)
    return response
def twitter_status(twitter_username):
    status = cache.get('feds-%s-status' % twitter_username)
    if status is None:
        try:
            import twitter
            user = twitter.Api().GetUser(twitter_username)
            status = user.status
            text = unescape(status.text)
            status = render_to_string('feds/twitter_status.html', locals())
        except:
            status = ''
        cache.set('feds-%s-status' % twitter_username, status)
    return status
def get_musicbrainz_artist_id( artist, limit=1, alias = False ):
    name = ""
    id = ""
    sortname = ""
    artist_name = smart_unicode( artist.replace( '"', '?' ) )
    if not alias:
        url = artist_url % ( server, quote_plus( artist_name.encode("utf-8") ), limit )
    else:
        url = alias_url % ( server, quote_plus( artist_name.encode("utf-8") ), limit )
    htmlsource = get_html_source( url, "", save_file = False)
    match = re.search( '''<artist(.*?)</artist>''', htmlsource )
    if match:
        score_match = re.search( '''score="(.*?)"''', htmlsource )
        name_match = re.search( '''<name>(.*?)</name>''', htmlsource )
        id_match = re.search( '''<artist id="(.*?)"(?:.*?)>''', htmlsource )
        if not id_match:
            id_match = re.search( '''<artist (?:.*?)id="(.*?)">''', htmlsource )
        sort_name_match = re.search( '''<sort-name>(.*?)</sort-name>''', htmlsource )
        if score_match:
            score = score_match.group(1)
        if name_match:
            name = unescape( smart_unicode( name_match.group(1) ) )
        if id_match:
            id = id_match.group(1)
        if sort_name_match:
            sortname = unescape( smart_unicode( sort_name_match.group(1) ) )
        log( "Score : %s" % score, xbmc.LOGDEBUG )
        log( "Id : %s" % id, xbmc.LOGDEBUG )
        log( "Name : %s" % name, xbmc.LOGDEBUG )
        log( "Sort Name : %s" % sortname, xbmc.LOGDEBUG )
    else:
        if not alias:
            log( "No Artist ID found trying aliases: %s" % artist, xbmc.LOGDEBUG )
            name, id, sortname = get_musicbrainz_artist_id( artist, limit, True )
        else:
            log( "No Artist ID found for Artist: %s" % artist, xbmc.LOGDEBUG )
    xbmc.sleep( mb_delay )
    return name, id, sortname
def article_list(request, blog_id=None):
    ret = {'status': 'error', "data": []}
    if blog_id:
        try:
            blog = BlogModel.objects.get(pk=blog_id)
            articles = ArticleModel.objects.filter(niche=blog.niche)
            for article in articles:
                ret["data"].append({'id': article.id,
                                    'title': article.title,
                                    'text': mark_safe(unescape(article.text))})
            ret['status'] = 'ok'
        except Exception, e:
            print(e)
            pass
def undo_plurals(has_plural, plurals):
    """Undo what `force_plurals` does in order to figure out if just `msgstr`
    or `msgstr[x]` should be set.

    Returns `(singular_msgstr, plural_msgstr_map)`
    """
    plurals_list = plurals.split(PLURAL_SEPARATOR)
    plurals_dict = {}
    for i, p in enumerate(plurals_list):
        plurals_dict[unicode(i)] = unescape(p)

    if has_plural:
        return '', plurals_dict
    return plurals_dict.get('0', ''), {}
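# Hypothetical round trip (the real PLURAL_SEPARATOR and the exact behaviour of
# unescape() are defined elsewhere in this module; '|||' below is only an
# illustration):
#
#   undo_plurals(True,  'one file|||%d files')  ->  ('', {u'0': 'one file', u'1': '%d files'})
#   undo_plurals(False, 'one file')             ->  ('one file', {})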
def strip_tags(text):
    # Preserve paragraph breaks. Convert closing p tags (and surrounding
    # whitespace) into two newlines. Strip trailing whitespace.
    text = re.sub("\s*</\s*p\s*>\s*", "\n\n", text).strip()

    # naive stripping of tags, should work okay in this limited context
    text = re.sub("<[^>]+>", "", text)

    # compress and strip whitespace artifacts, except for the paragraph breaks
    text = re.sub("[ \t\r\f\v]{2,}", " ", text).strip()

    # Replace HTML entities with characters.
    text = utils.unescape(text)

    return text
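# Illustration only (not from the original module); assumes utils.unescape
# decodes standard HTML entities such as &amp;.
if __name__ == '__main__':
    sample = '<p>Ben &amp; Jerry</p>  <p>Second   paragraph</p>'
    print(strip_tags(sample))
    # Expected output (roughly):
    # Ben & Jerry
    #
    # Second paragraph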
def parse_team_site(self, response):
    team = response.xpath('//div[@class="title"]/text()').extract_first()
    for sel in response.xpath('//div[@class="content"]/table/tr'):
        tournament_link = sel.xpath('td/a/@href').extract_first()
        if tournament_link is not None:
            data = sel.xpath('td/text()').extract()
            tournament_name = sel.xpath('td/a/text()').extract_first()
            if len(data) == 6:
                date, tournament_town, ranking, zf, tw, points = data
                item = JtrTournamentPartition()
                item['tournament_date'] = date
                item['crawl_date'] = datetime.datetime.now()
                item['ranking'] = int(ranking.split("/")[0].strip().strip("."))
                home_town, team_name = team.split("-", 1)
                item['team_name'] = utils.unescape(team_name.strip())
                item['team_hometown'] = utils.unescape(home_town.strip())
                item['tournament_town'] = utils.unescape(tournament_town)
                item['tournament_name'] = utils.unescape(tournament_name)
                home_town = self._locate(home_town)
                tournament_town = self._locate(tournament_town)
                item["team_hometown_position"] = self._get_geohash(home_town)
                item["tournament_town_position"] = self._get_geohash(tournament_town)
                item["distance"] = self._get_distance(home_town, tournament_town)
                yield item
def get_tracks(album, links):
    track_num = 1
    for link in links:
        track = {}
        _, track['name'] = link['data-title'].split(' — ')
        track['year'] = album.year
        track['album'] = album
        track['name'] = unescape(track['name'])
        track['number'] = track_num
        track['url'] = url + link['data-mp3url']
        new_track = Track(**track)
        session.add(new_track)
        track_num += 1
    session.commit()
    return track_num
def get(self):
    permauser = PermaUser.get_current_permauser()
    conver_url = utils.unescape(self.request.get('url'))
    conver = Conver.get_for_url(conver_url)
    messages = Message.all().filter('conver =', conver).order('created').fetch(1000)
    self.response.out.write(template.render(
        os.path.join(os.path.dirname(__file__), 'templates/conver.html'),
        {
            'token': channel.create_channel(permauser.user_id() +
                                            str(conver.key().id_or_name())),
            'conver_url': conver_url,
            'messages': [{'author': message.author.display_name(),
                          'text': message.text} for message in messages],
            'loginorout_text': 'Log out',
            'loginorout_url': users.create_logout_url(self.request.uri)
        }
    ))
def get_categories(self, root_id=0):
    categories = self.get_category_tree()
    matches = []
    for id in categories:
        if int(id) < 0:
            continue
        category = categories.get(id)
        if (int(category.get('parentId')) == int(root_id)):
            matches.append({
                'label': unescape(category.get('name')),
                'path': self.plugin.url_for('show_category', id=str(id)),
                'id': id
            })
    return matches
def getTVGuide(tvchannel):
    url = getChannelGuideUrl(tvchannel)
    if not url:
        return None
    try:
        req = urllib2.Request(url)
        req.add_header('User-Agent', common.HEADERS['User-Agent'])
        conn = urllib2.urlopen(req, timeout=5)
        html = conn.read()
        conn.close()

        soup = BeautifulSoup(html, 'html5lib')
        tds = soup.findAll('td', attrs={'class': 'container_events'})
        tds = [tds[i] for i in xrange(len(tds)) if divmod(i, 4)[1] == 0]

        hours = []
        titles = []
        for td in tds:
            hours.extend(td.findAll('td', attrs={'class': 'ora'}))
            titles.extend(td.findAll('div', attrs={'class': 'title'}))

        if not hours or not titles or len(hours) != len(titles):
            return None

        items = []
        for i in xrange(len(titles)):
            current = 'current' in str(hours[i])
            hour = re.search(r'<div>(\d+:\d+)<\/div>', str(hours[i])).group(1)
            title = titles[i].getText().strip()
            title = ' '.join(title.split())
            title = utils.unescape(title, True)
            item = (hour, title, current)
            items.append(item)

        return items
    except:
        log_utils.log(traceback.print_exc())
        return None
def find_links(doc_id):
    if doc_id is None:
        return
    doc = Page.load(settings.db, doc_id)
    if doc.content is None:
        print "Got None for the content of %s -> %s." % (doc_id, doc.url)
        return
    raw_links = []
    for match in link_single_re.finditer(doc.content):
        raw_links.append(match.group(1))
    for match in link_double_re.finditer(doc.content):
        raw_links.append(match.group(1))
    doc.links = []
    for link in raw_links:
        if link.startswith("#"):
            continue
        elif link.startswith("http://") or link.startswith("https://"):
            pass
        elif link.startswith("/"):
            parse = urlparse(doc["url"])
            link = parse.scheme + "://" + parse.netloc + link
        else:
            link = "/".join(doc["url"].split("/")[:-1]) + "/" + link
        doc.links.append(unescape(link.split("#")[0]))
    print "find_links %s -> %i" % (doc.url, len(doc.links))
    doc.store(settings.db)
    calculate_rank.delay(doc.id)
    for link in doc.links:
        p = Page.get_id_by_url(link, update=False)
        if p is not None:
            calculate_rank.delay(p)
        else:
            retrieve_page.delay(link)
res = db.find(query, sort=order)
total = res.count()
paginator = Paginator(res, limit)
try:
    res = paginator.page(page)
except (EmptyPage, InvalidPage):
    res = paginator.page(paginator.num_pages)
if request.GET.get('format', '') == 'json':
    res = [{'url': unicode(obj['url']),
            'title': unicode(obj['title']),
            'created': tuple(obj['created'].timetuple()),
            'private': obj['private'],
            'notes': unicode(unescape(obj['notes'])),
            'tags': obj['tags']} for obj in res.object_list]
    if request.GET.get('j') == None:
        return HttpResponse(json.dumps(res), mimetype="application/json")
    return HttpResponse("var omnom_posts = " + json.dumps(res) + ";",
                        mimetype="text/javascript")
if request.GET.get('format', '') == 'atom':
    tpl = 'atom.xml'
else:
    tpl = 'list.html'
res.object_list = [{'seq': obj['seq'],
def latex2png(picture_element, preamble, return_eps=False, page_width_px=None,
              dpi=150, included_files={}, pdflatexpath=None):
    """
    Create a PNG image from latex.

    Inputs:

      pspicture_element - etree.Element

      preamble - which preamble to use, one of PsPicture_preamble,
      tikzpicture_preamble or equation_preamble

      return_eps - whether to also return the intermediate EPS file

      page_width_px - page width in pixels, used to scale the style:width
      attribute in the element.

      dpi - Will be used only if the width of the figure relative to the page
      width was not set (or the page width in pixels was not passed as an
      argument).

    Outputs:

      One or two paths, the first to the PNG, the second to the EPS.
    """
    temp_dir = tempfile.mkdtemp()
    latex_path = os.path.join(temp_dir, 'figure.tex')
    png_path = os.path.join(temp_dir, 'figure.png')
    pdf_path = os.path.join(temp_dir, 'figure.pdf')

    # can send the raw string code or a <pre> element with <code> child
    if isinstance(picture_element, (str, unicode)):
        code = picture_element
        code = cleanup_code(code)
    else:
        code = picture_element.find('.//code').text.encode('utf-8')
        code = code.replace(r'&amp;', '&').replace(r'&gt;', '>').replace(r'&lt;', '<')

    if not code:
        raise ValueError("Code cannot be empty.")

    with open(latex_path, 'wt') as fp:
        temp = unescape(preamble.replace('__CODE__', code.strip()))
        try:
            fp.write(temp)
        except UnicodeEncodeError:
            fp.write(temp.encode('utf-8'))

    for path, path_file in included_files.iteritems():
        try:
            os.makedirs(os.path.join(temp_dir, os.path.dirname(path)))
        except OSError:
            # Catch exception if path already exists
            pass
        with open(os.path.join(temp_dir, path), 'wb') as fp:
            fp.write(path_file.read())

    if not pdflatexpath:
        raise ValueError("pdflatexpath cannot be None")

    errorLog, temp = execute([pdflatexpath, "-shell-escape", "-halt-on-error",
                              "-output-directory", temp_dir, latex_path])
    try:
        open(pdf_path, "rb")
    except IOError:
        raise LatexPictureError(
            "LaTeX failed to compile the image. %s \n%s" % (
                latex_path, preamble.replace('__CODE__', code.strip())))

    # crop the pdf image too
    # execute(['pdfcrop', '--margins', '1', pdfPath, pdfPath])

    execute(['convert', '-density', '%i' % dpi, pdf_path, png_path])

    return png_path
def listLinksInComment(url, name, type_):
    from domains import parse_reddit_link, build_DirectoryItem_url_based_on_media_type
    from utils import markdown_to_bbcode, unescape
    from guis import progressBG
    #from resources.domains import make_addon_url_from
    #called from context menu
    log('listLinksInComment:%s:%s' % (type_, url))

    #does not work for list comments coz key is the playable url (not reddit comments url)
    #msg=WINDOW.getProperty(url)
    #WINDOW.clearProperty( url )
    #log( '  msg=' + msg )

    directory_items = []
    author = ""
    ShowOnlyCommentsWithlink = False

    if type_ == 'linksOnly':
        ShowOnlyCommentsWithlink = True

    #url='https://www.reddit.com/r/Music/comments/4k02t1/bonnie_tyler_total_eclipse_of_the_heart_80s_pop/' + '.json'
    #only get up to "https://www.reddit.com/r/Music/comments/4k02t1".
    #   do not include "/bonnie_tyler_total_eclipse_of_the_heart_80s_pop/"
    #   because we'll have problem when it looks like this: "https://www.reddit.com/r/Overwatch/comments/4nx91h/ever_get_that_feeling_déjà_vu/"
    #url=re.findall(r'(.*/comments/[A-Za-z0-9]+)',url)[0]

    #use safe='' argument in quoteplus to encode only the weird chars part
    url = urllib.quote_plus(url, safe=':/?&')
    if '?' in url:
        url = url.split('?', 1)[0] + '.json?' + url.split('?', 1)[1]
    else:
        url += '.json'

    loading_indicator = progressBG(translation(30024))
    loading_indicator.update(0, 'Retrieving comments')

    content = reddit_request(url)
    if not content:
        loading_indicator.end()
        return

    loading_indicator.update(10, 'Parsing')
    content = json.loads(content)

    del harvest[:]
    #harvest links in the post text (just 1)
    r_linkHunter(content[0]['data']['children'])

    try:
        submitter = content[0]['data']['children'][0]['data']['author']
    except:
        submitter = ''

    #the post title is provided in json, we'll just use that instead of messages from addLink()
    try:
        post_title = content[0]['data']['children'][0]['data']['title']
    except:
        post_title = ''
    #for i, h in enumerate(harvest):
    #    log("aaaaa first harvest "+h[2])

    #harvest links in the post itself
    r_linkHunter(content[1]['data']['children'])

    comment_score = 0

    loading_indicator.set_tick_total(len(harvest))

    for i, h in enumerate(harvest):
        try:
            #log(str(i)+"  score:"+ str(h[0]).zfill(5)+" "+ h[1] +'|'+ h[3] )
            comment_score = h[0]
            #log("score %d < %d (%s)" %(comment_score,int_CommentTreshold, CommentTreshold) )
            link_url = h[2]
            desc100 = h[3].replace('\n', ' ')[0:100]  #first 100 characters of description

            kind = h[6]  #reddit uses t1 for user comments and t3 for OP text of the post. like a poster describing the post.
            d = h[5]     #depth of the comment

            tab = " " * d if d > 0 else "-"

            from urlparse import urlparse
            domain = '{uri.netloc}'.format(uri=urlparse(link_url))

            author = h[7]
            DirectoryItem_url = ''

            if comment_score < int_CommentTreshold:
                continue

            #hoster, DirectoryItem_url, videoID, mode_type, thumb_url,poster_url, isFolder,setInfo_type, setProperty_IsPlayable =make_addon_url_from(h[2])
            #if link_url:
            #    log( '  comment %s TITLE:%s... link[%s]' % ( str(d).zfill(3), desc100.ljust(20)[:20],link_url ) )

            ld = parse_reddit_link(link_url=link_url, assume_is_video=False,
                                   needs_preview=True, get_playable_url=True)

            if kind == 't1':
                list_title = r"[COLOR cadetblue]%3d[/COLOR] %s" % (h[0], tab)
            elif kind == 't3':
                list_title = r"[COLOR cadetblue]Title [/COLOR] %s" % (tab)

            #helps the textbox control treat [url description] and (url) as separate words
            #so that they can be separated into 2 lines
            plot = h[3].replace('](', '] (')
            plot = markdown_to_bbcode(plot)
            plot = unescape(plot)  #convert html entities e.g.:(&#39;)

            liz = xbmcgui.ListItem(label=list_title + ': ' + desc100)

            liz.setInfo(type="Video",
                        infoLabels={"Title": h[1], "plot": plot, "studio": domain,
                                    "votes": str(comment_score), "director": author})
            isFolder = False

            #force all links to ytdl to see if it can be played
            if link_url:
                DirectoryItem_url, setProperty_IsPlayable, isFolder, title_prefix = build_DirectoryItem_url_based_on_media_type(ld, link_url)

                liz.setProperty('IsPlayable', setProperty_IsPlayable)
                liz.setProperty('url', DirectoryItem_url)  #<-- needed by the xml gui skin
                liz.setPath(DirectoryItem_url)

                if domain:
                    plot = "  [COLOR greenyellow][%s] %s" % (domain, plot) + "[/COLOR]"
                else:
                    plot = "  [COLOR greenyellow][%s]" % (plot) + "[/COLOR]"

                liz.setLabel(list_title + plot)

                if ld:
                    liz.setArt({"thumb": ld.poster, "poster": ld.poster,
                                "banner": ld.poster, "fanart": ld.poster,
                                "landscape": ld.poster})

                if DirectoryItem_url:
                    #log( 'IsPlayable:'+setProperty_IsPlayable )
                    directory_items.append((DirectoryItem_url, liz, isFolder,))
                    #xbmcplugin.addDirectoryItem(handle=pluginhandle,url=DirectoryItem_url,listitem=liz,isFolder=isFolder)
            else:
                #this section is for comments that have no links
                if not ShowOnlyCommentsWithlink:
                    result = h[3].replace('](', '] (')
                    result = markdown_to_bbcode(result)

                    liz = xbmcgui.ListItem(label=list_title + desc100)
                    liz.setInfo(type="Video",
                                infoLabels={"Title": h[1], "plot": result, "studio": domain,
                                            "votes": str(h[0]), "director": author})
                    liz.setProperty('IsPlayable', 'false')

                    directory_items.append(("", liz, False,))
                    #xbmcplugin.addDirectoryItem(handle=pluginhandle,url="",listitem=liz,isFolder=False)

                #END section for comments that have no links or unsupported links
        except Exception as e:
            log('  EXCEPTION:' + str(e))

        #for di in directory_items:
        #    log( str(di) )

        loading_indicator.tick(1, desc100)
    loading_indicator.end()

    #log('  comments_view id=%s' %comments_viewMode)

    #xbmcplugin.setContent(pluginhandle, "mixed")  #in estuary, mixed have limited view id's available. it has widelist which is nice for comments but we'll just stick with 'movies'
    xbmcplugin.setContent(pluginhandle, "episodes")  #files, songs, artists, albums, movies, tvshows, episodes, musicvideos
    xbmcplugin.setPluginCategory(pluginhandle, 'Comments')

    xbmcplugin.addDirectoryItems(handle=pluginhandle, items=directory_items)
    xbmcplugin.endOfDirectory(pluginhandle)

    if comments_viewMode:
        xbmc.executebuiltin('Container.SetViewMode(%s)' % comments_viewMode)
def getFavourites(file, limit=10000, validate=True, superSearch=False, chooser=False):
    import xbmcgui

    prefix = ''
    if not chooser:
        prefix = 'HOME:' if xbmcgui.getCurrentWindowId() == 10000 else ''

    xml = '<favourites></favourites>'
    if sfile.exists(file):
        xml = sfile.read(file)

    items = []
    faves = re.compile('<favourite(.+?)</favourite>').findall(xml)

    for fave in faves:
        fave = fave.replace('&quot;', '&_quot_;')
        fave = fave.replace('\'', '"')
        fave = utils.unescape(fave)
        fave = fave.replace('name=""', '')
        try:
            name = re.compile('name="(.+?)"').findall(fave)[0]
        except:
            name = ''
        try:
            thumb = re.compile('thumb="(.+?)"').findall(fave)[0]
        except:
            thumb = ''
        try:
            cmd = fave.split('>', 1)[-1]
        except:
            cmd = ''

        #name = utils.Clean(name.replace('&_quot_;', '"'))
        name = name.replace('&_quot_;', '"')
        thumb = thumb.replace('&_quot_;', '"')
        cmd = cmd.replace('&_quot_;', '"')

        add = False
        if superSearch:
            add = isValid(cmd)
        elif (SHOWUNAVAIL) or (not validate) or isValid(cmd):
            add = True

        if add:
            cmd = upgradeCmd(cmd)
            if cmd.startswith('PlayMedia'):
                option = 'mode'
                try:
                    mode = int(favourite.getOption(cmd, option))
                except:
                    win = xbmcgui.getCurrentWindowId()
                    cmd = updateSFOption(cmd, 'winID', win)

            name = resolve(name)
            cmd = patch(cmd)
            cmd = resolve(cmd)
            cmd = prefix + cmd

            items.append([name, thumb, cmd])

        if len(items) > limit:
            return items

    return items
    elif _command.startswith("movie_title="):
        titles = re.split("=", _command, maxsplit=1)[1]
        movie_titles = titles.split(";")
        if not movie_titles == "":
            _build_playlist(movie_titles)
            exit = Script().start_script("oldway")
        else:
            exit = False

    elif _command.startswith("open_settings"):  # Open Settings
        __addon__.openSettings()
        exit = False

    elif sys.argv[1].startswith("jsonquery="):  # JSON RPC Query
        _clear_playlists()
        jsonquery = utils.unescape(re.split("=", sys.argv[1], maxsplit=1)[1])
        jsonquery = (jsonquery.replace("<li>", ":")).replace("<lic>", ",")
        #print jsonquery
        movie_ids = Script()._jsonrpc_query(jsonquery)
        if movie_ids:
            _build_playlist(movie_ids)
            exit = Script().start_script("oldway")
        else:
            exit = False

    elif sys.argv[1].startswith("movieid="):
        _clear_playlists()
        movie_id = sys.argv[1].split("=")[1]
        movie_ids = movie_id.split(";")
        if movie_ids:
            _build_playlist(movie_ids, mode="movie_ids")
def run():

    def update_birthday(bioguide, person, main):
        birthday = birthday_for(main)
        if not birthday:
            print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main.encode("utf8")))
            warnings.append(bioguide)
            return
        if birthday == "UNKNOWN":
            return

        try:
            birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y")
        except ValueError:
            print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main.encode("utf8")))
            warnings.append(bioguide)
            return

        birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
        person.setdefault("bio", {})["birthday"] = birthday

    def birthday_for(string):
        # exceptions for not-nicely-placed semicolons
        string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April")
        string = string.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
        string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
        string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
        string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
        string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")

        # look for a date
        pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
        match = re.search(pattern, string, re.I)
        if not match or not match.group(1):
            # specifically detect cases that we can't handle to avoid unnecessary warnings
            if re.search("birth dates? unknown|date of birth is unknown", string, re.I):
                return "UNKNOWN"
            if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I):
                return "UNKNOWN"
            return None
        return match.group(1).strip()

    def relationships_of(string):
        # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio
        # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)"
        pattern = "^\((.*?)\)"
        match = re.search(pattern, string, re.I)

        relationships = []

        if match and len(match.groups()) > 0:
            relationship_text = match.group(1).encode("ascii", "replace")

            # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar
            from nltk import tree, pos_tag, RegexpParser
            tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
            pos = pos_tag(tokens)

            grammar = r"""
                NAME: {<NNP>+}
                NAMES: { <IN><NAME>(?:<CC><NAME>)* }
                RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ }
                MATCH: { <RELATIONSHIP><NAMES> }
                """
            cp = RegexpParser(grammar)
            chunks = cp.parse(pos)

            # iterate through the Relationship/Names pairs
            for n in chunks:
                if isinstance(n, tree.Tree) and n.node == "MATCH":
                    people = []
                    relationship = None
                    for piece in n:
                        if piece.node == "RELATIONSHIP":
                            relationship = " ".join([x[0] for x in piece])
                        elif piece.node == "NAMES":
                            for name in [x for x in piece if isinstance(x, tree.Tree)]:
                                people.append(" ".join([x[0] for x in name]))
                    for person in people:
                        relationships.append({"relation": relationship, "name": person})

        return relationships

    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache

    # pick either current or historical
    # order is important here, since current defaults to true
    if utils.flags().get('historical', False):
        filename = "legislators-historical.yaml"
    elif utils.flags().get('current', True):
        filename = "legislators-current.yaml"
    else:
        print("No legislators selected.")
        exit(0)

    print("Loading %s..." % filename)
    legislators = load_data(filename)

    # reoriented cache to access by bioguide ID
    by_bioguide = {}
    for m in legislators:
        if "bioguide" in m["id"]:
            by_bioguide[m["id"]["bioguide"]] = m

    # optionally focus on one legislator
    bioguide = utils.flags().get('bioguide', None)
    if bioguide:
        bioguides = [bioguide]
    else:
        bioguides = list(by_bioguide.keys())

    warnings = []
    missing = []
    count = 0
    families = 0

    for bioguide in bioguides:
        # Download & parse the HTML of the bioguide page.
        url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide
        cache = "legislators/bioguide/%s.html" % bioguide
        try:
            body = download(url, cache, force)

            # Fix a problem?
            body = body.replace("Á\xc2\x81", "Á")

            # Entities like &#146; are in Windows-1252 encoding. Normally lxml
            # handles that for us, but we're also parsing HTML. The lxml.html.HTMLParser
            # doesn't support specifying an encoding, and the lxml.etree.HTMLParser doesn't
            # provide a cssselect method on element objects. So we'll just decode ourselves.
            body = utils.unescape(body, "Windows-1252")

            dom = lxml.html.parse(io.StringIO(body)).getroot()
        except lxml.etree.XMLSyntaxError:
            print("Error parsing: ", url)
            continue

        # Sanity check.
        if len(dom.cssselect("title")) == 0:
            print("[%s] No page for this bioguide!" % bioguide)
            missing.append(bioguide)
            continue

        # Extract the member's name and the biography paragraph (main).
        try:
            name = dom.cssselect("p font")[0]
            main = dom.cssselect("p")[0]
        except IndexError:
            print("[%s] Missing name or content!" % bioguide)
            exit(0)

        name = name.text_content().strip()
        main = main.text_content().strip().replace("\n", " ").replace("\r", " ")
        main = re.sub("\s+", " ", main)

        # Extract the member's birthday.
        update_birthday(bioguide, by_bioguide[bioguide], main)

        # Extract relationships with other Members of Congress.
        if utils.flags().get("relationships", False):
            # relationship information, if present, is in a parenthetical immediately after the name.
            # should always be present if we passed the IndexError catch above
            after_name = dom.cssselect("p font")[0].tail.strip()
            relationships = relationships_of(after_name)
            if len(relationships):
                families = families + 1
                by_bioguide[bioguide]["family"] = relationships

        count = count + 1

    print()
    if warnings:
        print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings)))
    if missing:
        print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing)))

    print("Saving data to %s..." % filename)
    save_data(legislators, filename)

    print("Saved %d legislators to %s" % (count, filename))

    if utils.flags().get("relationships", False):
        print("Found family members for %d of those legislators" % families)
def fget(self):
    return unescape(self.__raw.title)