def get_mentions(self, cursor):
    """Run every configured query against the search endpoint.

    Each value in self.config is sent as the 'q' parameter of a GET
    request to self.api_endpoint; every result in the JSON reply becomes
    a Mention whose text is the sanitised 'content' field.

    cursor -- accepted for interface compatibility; not used here.

    Returns a list of Mention objects, all stamped with the time at
    which this call started.

    Errors raised by urllib2 or by JSON decoding propagate to the caller.
    """
    # ignore the keys here, they're just for show
    at = time.time()
    mentions = []
    for query in self.config.values():
        uri = "%s?%s" % (
            self.api_endpoint,
            urllib.urlencode({
                'v': '1.0',
                'q': query,
            }),
        )
        rq = urllib2.Request(
            uri,
            headers={'User-Agent': user_agent_string},
        )
        f = urllib2.urlopen(rq)
        try:
            obj = simplejson.loads(f.read())
        finally:
            # Close the response even when reading or decoding fails.
            f.close()

        response_data = obj.get('responseData')
        if not response_data:
            # NOTE(review): presumably the service sends a null
            # responseData when it rejects a query -- confirm against the
            # API docs. Skip this query rather than lose the mentions
            # already collected for the others.
            continue

        for result in response_data['results']:
            p = MySanitizer('utf-8')
            p.feed(result['content'])
            m = Mention(
                query,
                result['unescapedUrl'],
                p.output(),
                at,
            )
            mentions.append(m)
    return mentions
def get_mentions(self, cursor):
    """Fetch every configured feed and turn its entries into Mentions.

    For each key in self.config the feed URL is built with
    self.url_from_config(). Any etag / last_modified values stored for
    that URL in the mentions_feeds table are passed to feedparser.parse()
    as its etag / modified arguments. Each entry's body is sanitised and
    wrapped in a Mention, and the etag / modification time reported for
    the fetch is written back to mentions_feeds.

    cursor -- DB-API cursor used to read and update mentions_feeds.

    Returns the list of Mention objects gathered from all feeds.
    """
    mentions = []
    # keys here describe what we're searching for
    for key, settings in self.config.items():
        feed = self.url_from_config(key, settings)

        cursor.execute(
            "SELECT last_modified, etag FROM mentions_feeds WHERE uri=%s",
            (feed,)
        )
        row = cursor.fetchone()
        feed_seen = row is not None

        kwargs = {}
        if feed_seen:
            kwargs['etag'] = row[1]
            kwargs['modified'] = row[0]
        kwargs['agent'] = user_agent_string
        kwargs['handlers'] = self.auth_handlers(settings)

        f = feedparser.parse(feed, **kwargs)

        for mentry in f.entries:
            # body/summary detection taken from toreadless.com (newspan)
            # but was written by me in the first place ;-)
            #
            # note that we're looking for the shortest thing that works
            # rather than the most content
            p = MySanitizer(f.encoding)

            content = getattr(mentry, 'content', None)
            summary = getattr(mentry, 'summary', None)

            if content:
                content_body = content[0].value
            else:
                content_body = ''
            if summary:
                summary_body = summary
            else:
                summary_body = ''

            if content_body and summary_body:
                # Both present: take the shorter one, content wins ties.
                if len(summary_body) < len(content_body):
                    body = summary_body
                else:
                    body = content_body
            else:
                # Only one (or neither) present: use whichever is
                # non-empty, so summary-only entries keep their text.
                body = content_body or summary_body

            p.feed(body)
            m = Mention(
                key,
                mentry.link,
                unicode(p.output(), f.encoding),
                calendar.timegm(mentry.date_parsed),
            )
            mentions.append(m)

        etag = getattr(f, 'etag', None) or None

        # Check and read the same attribute: the time tuple for the
        # fetch. NOTE(review): which attribute carries the tuple depends
        # on the feedparser version ('modified_parsed' vs 'modified') --
        # confirm against the installed release.
        last_modified = None
        modified = (getattr(f, 'modified_parsed', None)
                    or getattr(f, 'modified', None))
        if modified and not isinstance(modified, basestring):
            last_modified = time.strftime("%Y-%m-%d %H:%M:%S", modified)

        if etag or last_modified:
            if feed_seen:
                cursor.execute(
                    "UPDATE mentions_feeds SET etag=%s, last_modified=%s WHERE uri=%s",
                    (etag, last_modified, feed,)
                )
            else:
                cursor.execute(
                    "INSERT INTO mentions_feeds (uri, etag, last_modified) VALUES (%s, %s, %s)",
                    (feed, etag, last_modified,)
                )

        # NOTE(review): assumed to run once per feed; the original
        # nesting of this call could not be recovered -- confirm.
        self.sleep()
    return mentions