import glob
import xmltramp

# Walk the HTTPS Everywhere rulesets and return (hostname, is_wildcard)
# tuples for the simple rewrite patterns we understand.
def parse_rules():
    rules = glob.glob('../src/chrome/content/rules/*.xml')
    out = []
    for rule in rules:
        ruleset = xmltramp.seed(file(rule))
        #print ruleset('name')
        hosts = []
        for k in ruleset['target':]:
            hosts.append(k('host'))
        for k in ruleset['rule':]:
            for host in hosts:
                escaped_host = host.replace('.', '\\.')
                # Optional-www rule: covers both the bare host and www.host.
                if k('from') == r"^http://(www\.)?%s/" % escaped_host and \
                   (k('to') == "https://www.%s/" % host or k('to') == "https://%s/" % host):
                    out.extend([(host, False), ('www.' + host, False)])
                    break
                # Wildcard rule: rewrites every subdomain via $1.
                elif k('from') == r"^http://([^/:@]*)\.%s/" % escaped_host and \
                     k('to') == "https://$1.%s/" % host:
                    out.append((host, True))
                # Exact-host rule.
                elif k('from') == '^http://%s/' % escaped_host and \
                     k('to') == "https://%s/" % host:
                    out.append((host, False))
                else:
                    pass
                    #print ' ', host, k('from').encode('utf8'), k('to').encode('utf8')
    #print ' ', out
    return out
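# A minimal usage sketch (hypothetical driver, not part of the original):
# each tuple from parse_rules() is (hostname, is_wildcard), where
# is_wildcard marks rules that rewrite every subdomain rather than one host.
if __name__ == '__main__':
    for host, is_wildcard in parse_rules():
        print host, is_wildcard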
def handle(self, **options):
    url = "http://upcoming.yahooapis.com/services/rest/?api_key=%s&method=user.getWatchlist&user_id=%s&show=upcoming" % (UPCOMING_KEY, UPCOMING_ID)
    request = urllib2.urlopen(url)
    xml = xmltramp.seed(request)
    for event in xml["event":]:
        page = self.getPage("stream", u"upcoming:///%s" % event('id'))
        page.slug = str(event('id'))
        page.format = "markdown"
        page.excerpt = ""
        # The date the event happens, not the date it was added; the add date
        # defaults to the implicit 'now'. iso8601 insists on a time
        # component, so append midnight. (stupid)
        date = iso8601.parse_date(event('start_date') + "T00:00:00").date()
        page.link = "http://upcoming.yahoo.com/event/%s/" % event('id')
        page.guid = page.link
        icon = """<a href="http://upcoming.yahoo.com/user/123407" title="upcoming"><img src="%sweb/icons/upcoming.png" title="upcoming"></a> """ % MEDIA_URL
        if event('status') == 'attend':
            page.body = icon + """I will be attending <a href="%s">%s</a> on %s""" % (page.link, event('name'), date.strftime("%A %B %d"))
        else:
            page.body = icon + """I'm considering <a href="%s">%s</a> on %s""" % (page.link, event('name'), date.strftime("%A %B %d"))
        page.title = "Upcoming"  # re.sub(r'<.*?>', '', page.body).strip()

        if not page.id:
            # Only geocode the first time we see an event.
            # http://code.google.com/apis/maps/documentation/services.html#Geocoding_Direct
            if event('venue_zip'):
                address = ", ".join([event('venue_zip'), event("venue_country_name")])
            else:
                address = ", ".join([event('venue_address'), re.sub(r',.*', '', event('venue_city')), event("venue_country_name")])
            geocode = "http://maps.google.com/maps/geo?key=%s&q=%s&output=json" % (self.getConfig("google", "maps_key"), urllib.quote(address.encode('utf-8'), ""))
            request = urllib2.urlopen(geocode)
            data = json.load(request)
            if "Placemark" in data:
                print "Got location for %s" % address
                # The geocoder returns (longitude, latitude, altitude).
                page.longitude, page.latitude, altitude = data["Placemark"][0]["Point"]['coordinates']
            else:
                print "Can't find location for %s" % address

        page.save()
        page.set_tags(["cougar:syndicate=upcoming"])

        if not page.human_tags():
            # No tags yet: ask the Yahoo term extraction service for some,
            # based on the event description.
            text = event('description') or event('name')
            if text:
                url = "http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction?appid=%s&output=json&context=%s" % (config.get('yahoo', 'appid'), urllib.quote(text.encode('utf8'), ""))
                keywords = json.load(urllib2.urlopen(url))
                tags = keywords['ResultSet']['Result']
                page.set_human_tags(tags)
def handle(self, **other):
    pagetype = PageType.objects.get(folder="stream")
    realm = 'Nordrassil'
    character = 'Granark'
    # TODO - US armoury address
    #url = "http://armory.wow-europe.com/character-sheet.xml?r=%s&n=%s" % (urllib.quote(realm, ''), urllib.quote(character, ''))
    url = "http://eu.wowarmory.com/character-achievements.xml?r=%s&n=%s" % (urllib.quote(realm, ''), urllib.quote(character, ''))

    # Need to specify Firefox as the user agent; that's what makes the
    # server return an XML file.
    opener = urllib2.build_opener()
    opener.addheaders = [('user-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-GB; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4')]

    # Timeout in seconds - the armoury falls over a lot.
    socket.setdefaulttimeout(10)

    req = urllib2.Request(url)
    try:
        data = opener.open(req)
    except urllib2.HTTPError:
        return
    except urllib2.URLError:
        return

    xml = xmltramp.seed(data)
    for ach in xml['achievements']['summary']['achievement':]:
        page = self.getPage("stream", "armory:///%s" % ach('id'))
        if not page.id:
            # New page: default to evening sometime, that's when I'm
            # usually playing.
            page.date = iso8601.parse_date(ach('dateCompleted')[0:10] + "T21:00:00+00:00")
            if page.date.day == datetime.now().day and page.date.month == datetime.now().month:
                page.date = datetime.now()
            page.date = page.date.replace(tzinfo=None)  # http://code.djangoproject.com/ticket/5304
        desc = unicode(ach('desc'))
        page.format = "markdown"
        page.slug = slugify(ach('title'))
        page.link = url  # TODO - can't link directly to the armory, apparently.
        icon = """<a href="%s"><img src="%sweb/icons/wow.png" title="wow armory"></a> """ % (url, MEDIA_URL)
        page.body = icon + """Gained the achievement "<a href="%s" title="%s">%s</a>" """ % ("http://www.wowhead.com/?achievement=%s" % ach('id'), desc, ach('title'))
        page.title = re.sub(r'<.*?>', '', page.body).strip()
        page.excerpt = ""
        page.save()
        page.set_tags(["cougar:syndicate=wow-armory", "warcraft"])
def handle(self, **options):
    pagetype = PageType.objects.get(folder="stream")
    url = "http://ws.audioscrobbler.com/2.0/?method=user.getlovedtracks&user=jerakeen&api_key=%s" % LASTFM_KEY
    request = urllib2.urlopen(url)
    xml = xmltramp.seed(request)
    for track in xml.lovedtracks["track":]:
        link = "http://%s" % str(track.url)
        extref = u"lastfm:///%s" % hashlib.md5(link).hexdigest()
        try:
            page = Page.objects.get(extref=extref)
        except ObjectDoesNotExist:
            print("Creating page %s" % extref)
            page = Page(pagetype=pagetype, extref=extref, status="published")
            page.save()  # gets round some initial date setting bugs
        page.date = datetime.fromtimestamp(int(track.date("uts")))
        page.slug = hashlib.md5(link).hexdigest()[0:8]
        page.format = "markdown"
        page.excerpt = ""
        icon = """<a href="http://www.last.fm/user/jerakeen" title="last.fm"><img src="%sweb/icons/lastfm.png" title="last.fm"></a> """ % MEDIA_URL
        page.body = icon + """Loved the track <a href="%s">%s</a> by <a href="%s">%s</a>""" % (link, track.name, track.artist.url, track.artist.name)
        page.title = re.sub(r"<.*?>", "", page.body)
        page.link = link
        try:
            page.image = filter(lambda i: i("size") == "large", track["image":])[0]
        except IndexError:
            page.image = "http://cdn.last.fm/depth/catalogue/noimage/cover_large.gif"
            # "http://cdn.last.fm/flatness/catalogue/noimage/cover_large.gif"
        page.save()
        page.set_tags(["music", "cougar:syndicate=lastfm"])
def handle(self, url="http://jerakeen.tumblr.com/api/read", **options):
    opener = urllib2.build_opener()
    opener.addheaders = [('user-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-GB; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4')]
    req = urllib2.Request(url)
    data = opener.open(req)
    xml = xmltramp.seed(data)
    for entry in xml.posts:
        #print entry.__repr__(1)
        extref = "tumblr:///%s" % entry('id')
        page = self.getPage('notes', extref)
        page.date = datetime.fromtimestamp(int(entry("unix-timestamp")))
        page.slug = entry("id")
        page.format = "raw"
        page.excerpt = ""
        tags = ["cougar:syndicate=tumblr"]
        for tag in entry['tag':]:
            tags += [unicode(tag)]

        if entry("type") == 'photo':
            # Title is the caption up to the first full stop or pipe,
            # stripped of markup.
            page.title = re.sub(r'[\.\|].*', '', re.sub(r'<.*?>', '', unicode(entry['photo-caption'])))
            if not page.title:
                page.title = "Photo"
            page.body = entry['photo-caption']
            photos = {}
            for p in entry['photo-url':]:
                photos[int(p('max-width'))] = unicode(p)

            # Mirror the tumblr-hosted image locally and return its
            # media-relative path.
            def mirror(size, photos=photos):
                url = photos[size]
                filename = os.path.join(MEDIA_ROOT, "thumbnail", "tumblr", re.sub(r'^.*/', '', url))
                if not os.path.exists(filename):
                    print "Fetching %s to %s" % (url, filename)
                    urllib.urlretrieve(url, filename)
                return re.sub(r'^.*/', "thumbnail/tumblr/", url)

            page.image = mirror(250)
            page.thumbnail = mirror(75)
            tags += ["tumblr:type=photo"]

        elif entry("type") == 'link':
            try:
                page.title = self.text_of(entry['link-text'])
            except KeyError:
                page.title = "Link"
            page.link = self.text_of(entry['link-url'])
            page.body = self.text_of(entry['link-description'])
            tags += ["tumblr:type=link"]
            #page.pagetype = PageType.objects.get( folder = "links" )

        elif entry("type") == 'quote':
            page.title = "Quote"  # TODO
            # Strip the leading mdash from the source.
            source = re.sub(r'^\—\s+', '', self.text_of(entry['quote-source']))
            page.body = "<blockquote>%s</blockquote>\n\n<cite>%s</cite>\n" % (self.text_of(entry['quote-text']), source)
            tags += ["tumblr:type=quote"]

        elif entry("type") == 'regular':
            try:
                page.title = self.text_of(entry['regular-title'])
            except KeyError:
                page.title = "Page"
            try:
                page.body = self.text_of(entry['regular-body'])
            except KeyError:
                page.body = ""
            tags += ["tumblr:type=regular"]

        elif entry("type") == 'conversation':
            try:
                page.title = self.text_of(entry['conversation-title'])
            except Exception:
                page.title = "Conversation"
            # An older plain-text rendering, superseded by the table below:
            #page.body = "<pre>" + self.text_of(entry['conversation-text']) + "</pre>"
            lines = entry['conversation'][:]

            def line_to_tr(line):
                return """<tr><td valign="top" align="right" class="name">%s</td><td valign="top" class="words">%s</td></tr>""" % (line('name'), self.text_of(line))

            page.body = """<table border="0" margin="0" padding="0" class="conversation">%s</table>""" % "\n".join(map(line_to_tr, lines))
            tags += ["tumblr:type=conversation"]

        elif entry("type") == 'video':
            page.title = unicode(entry['video-caption'])
            page.body = unicode(entry['video-player'])
            tags += ["tumblr:type=video"]

        else:
            print "Can't handle page type '%s'" % entry('type')
            continue

        if len(page.title) > 50:
            page.title = page.title[0:45] + "..."
        page.save()
        page.set_tags(tags)
# Need to specify firefox as user agent as this makes the server return an XML file.
opener = urllib2.build_opener()
opener.addheaders = [('user-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-GB; rv:1.8.1.4) Gecko/20070515 Firefox/2.0.0.4')]

# timeout in seconds - the armoury falls over a lot
socket.setdefaulttimeout(10)

req = urllib2.Request(url)
try:
    data = opener.open(req)
except urllib2.HTTPError:
    sys.exit(1)
except urllib2.URLError:
    sys.exit(1)

xml = xmltramp.seed(data)

achievements = []
for character in xml['guildInfo']['guild']['members']['character':]:
    char_url = "http://%s/character-achievements.xml?r=%s&n=%s" % (wowserver, urllib.quote(realm, ''), urllib.quote(character('name').encode('utf-8'), ''))
    char_req = urllib2.Request(char_url)
    try:
        char_data = opener.open(char_req)
    except urllib2.HTTPError:
        sys.exit(1)
    except urllib2.URLError:
        sys.exit(1)
    char_xml = xmltramp.seed(char_data)