def normalize(self): """Checks that all attribute values are in order so that the entry can be used for output, particularly in an Atom feed.""" if self.title is None: self.title = '' else: self.title = self.title.strip() if self.summary is None: self.summary = '' else: self.summary = self.summary.strip() if self.content == '' or self.content is None: self.content = self.summary else: self.content = self.content.strip() if self.published is None: self.published = self.date if self.published_parsed is None: self.published_parsed = parse_date(self.published) if self.created is None: self.created = self.date if self.created_parsed is None: self.created_parsed = parse_date(self.created) if self.updated is None: self.updated = self.date if self.updated_parsed is None: self.updated_parsed = parse_date(self.updated) self.date_atom = time.strftime(config.ATOM_TIME_FORMAT, self.date_parsed) self.published_atom = time.strftime(config.ATOM_TIME_FORMAT, self.published_parsed) self.created_atom = time.strftime(config.ATOM_TIME_FORMAT, self.created_parsed) self.updated_atom = time.strftime(config.ATOM_TIME_FORMAT, self.updated_parsed) self.date_formatted = time.strftime(config.HTML_TIME_FORMAT, self.date_parsed) self.published_formatted = time.strftime(config.HTML_TIME_FORMAT, self.published_parsed) self.created_formatted = time.strftime(config.HTML_TIME_FORMAT, self.created_parsed) self.updated_formatted = time.strftime(config.HTML_TIME_FORMAT, self.updated_parsed) # Build GUID if self.id is None: self.id = self.get_tag_uri(self.date_parsed, self.url) # Truncate content for main page if publish.shorten.wc(self.content) > config.WORD_LIMIT: self.content_abridged = publish.shorten.shorten(self.content, config.WORD_LIMIT) else: self.content_abridged = self.content # Sanitize content self.title = publish.sanitizer.sanitize(self.title) self.summary = publish.sanitizer.strip(self.summary) #self.summary = publish.sanitizer.sanitize(self.summary) # If the entry is a photo, allow <img> tag if self.type == 'photo': self.content = publish.sanitizer.sanitize(self.content, additional_tags=[ 'img' ]) elif self.type == 'quote': self.content = publish.sanitizer.block_to_break(self.content) self.content = publish.sanitizer.sanitize(self.content, additional_tags = [ 'br' ]) else: self.content = publish.sanitizer.heading_to_bold(self.content) self.content = publish.sanitizer.sanitize(self.content, additional_tags = [ 'p', 'br', 'blockquote' ]) self.content_abridged = publish.sanitizer.heading_to_bold(self.content_abridged) self.content_abridged = publish.sanitizer.sanitize(self.content_abridged, additional_tags = [ 'p', 'br', 'blockquote' ])
def twentyFourHourFrequency():
    # This function works out the hourly frequency of entries of passed
    # legislation in the last 24hrs of legislative activity, using
    # http://www.legislation.gov.uk/new/data.feed
    # It does this by collecting entries of passed legislation in the last
    # 24hr period - except if it is a Monday, in which case it collects the
    # entries back to & including Friday (72hrs). Friday is used because no
    # legislation is passed on a weekend, so Friday would have been within
    # the last 24hrs of legislative activity.
    # On a weekend the hourly frequency will always be 0.
    feed = feedparser.parse('http://www.legislation.gov.uk/new/data.feed')
    entries_in_collection_period = 0
    currentTime = tuple(time.gmtime())
    timestamp_currentTime = calendar.timegm(currentTime)
    current_weekday = calendar.day_name[date.today().weekday()]  # current weekday name, e.g. Monday
    for entry in feed.entries:  # loop through all the entries
        #print entry.title
        # Get each entry's published date and convert it to a Unix timestamp.
        # parse_date returns a UTC struct_time, so timegm is the correct
        # conversion (mktime would wrongly assume local time).
        timestamp_entryDate = calendar.timegm(parse_date(entry.published))
        #print timestamp_entryDate
        #print datetime.fromtimestamp(timestamp_entryDate), 'date published'
        hours_old = (timestamp_currentTime - timestamp_entryDate) / 3600.0
        #print hours_old, 'hours old'
        if current_weekday == 'Monday' and hours_old < 72:
            # If it's Monday, count entries back to Friday (72hrs)
            entries_in_collection_period += 1
            print 'added to the frequency list\n'
        elif hours_old < 24:  # otherwise count entries in the last 24hrs
            entries_in_collection_period += 1
    hourly_frequency = float(entries_in_collection_period) / 24  # entries per hour
    if entries_in_collection_period > 0:
        hours_between = 24.0 / entries_in_collection_period  # hrs per entry
    else:
        hours_between = 0  # weekend: nothing was passed
    msg = ['\n', str(feed.updated), ' :atom feed update\n',
           str(entries_in_collection_period), ' entries in the collection period',
           '\nA piece of legislation was passed every\n',
           str(hours_between), ' hrs in the last 24hrs since the atom update time\n======']
    writeTofile('output.txt', "".join(msg))
    print "".join(msg)
    return hourly_frequency

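# writeTofile() is called above but not defined in this snippet. A minimal
# sketch, assuming it simply appends the message to the named file:

def writeTofile(filename, text):
    f = open(filename, 'a')  # append, so repeated runs accumulate
    try:
        f.write(text)
    finally:
        f.close()
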
def rfc3339_to_datetime(dt):
    """
    Return the RFC 3339 datetime string in `dt` as a timezone-aware Django
    datetime object.
    """
    # parse_date returns a UTC struct_time, so convert it with
    # calendar.timegm / utcfromtimestamp; round-tripping through
    # time.mktime would assume local time and break around DST transitions.
    utc_dt = pytz.utc.localize(
        datetime.datetime.utcfromtimestamp(calendar.timegm(parse_date(dt))))
    time_zone = pytz.timezone(settings.TIME_ZONE)
    return utc_dt.astimezone(time_zone)

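# Example usage (hypothetical TIME_ZONE value for illustration):
#
#   >>> settings.TIME_ZONE = 'Europe/London'
#   >>> rfc3339_to_datetime('2011-06-01T12:00:00Z')
#   datetime.datetime(2011, 6, 1, 13, 0, tzinfo=<DstTzInfo 'Europe/London' BST+1:00:00 DST>)
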
def normalize(self): """Checks that all attribute values are in order so that the source can be used in an Atom feed.""" if self.id is None: # TODO: Generate unique ID (maybe unnecessary) pass if len(self.entries) > 0: if self.updated is None: self.updated = self.entries[0].updated if self.updated_parsed is None: self.updated_parsed = parse_date(self.updated)
def parse(self): """Fetches Twitter tweets using the Twitter API.""" self.logger.debug("Contacting Twitter services") try: twitty = twitter.Api( username=config.TWITTER_ACCOUNT, password=config.TWITTER_PASSWORD ) self.logger.info("Getting Twitter tweets for %s" % self.owner) tweets = twitty.GetUserTimeline(self.name) for tweet in tweets: skip = False e = Quote() e.source.name = self.name e.source.url = self.url e.author = tweet.user.name e.title = "Tweet from %s" % e.author e.summary = tweet.text e.content = e.summary # Add hyperlinks # e.content = TwitterStatus.link_users(e.content) # e.content = TwitterStatus.link_hashtags(e.content) e.citation = e.author self.logger.info("Tweet: '%s'" % e.summary) e.url = self.get_tweet_url(tweet.id) self.logger.debug("Tweet URL: %s" % e.url) e.date = tweet.created_at e.date_parsed = parse_date(e.date) self.logger.debug("Tweet date: %s" % e.date_as_string(e.date_parsed)) # Skip this tweet if replies are turned off if self.ignore_replies and TwitterStatus.is_reply(e.summary): skip = True # Skip this tweet if replies are turned off if self.ignore_retweets and TwitterStatus.is_retweet(e.summary): skip = True # Skip this tweet if it's in the exclusion list if self.excluded_keywords is not None: for keyword in self.excluded_keywords: self.logger.debug("Checking for excluded keyword: '%s'" % keyword) if e.summary.lower().find(keyword.lower()) > -1: self.logger.debug("Skipping tweet with excluded keyword: '%s'" % keyword) skip = True if skip: continue else: self.entries.append(e) except BadStatusLine: self.logger.exception("Twitter.com unexpectedly closed the connection!") except HTTPError, err: self.logger.exception("HTTP error: '%s'" % err)
def parse_rfc3339_date(self, dateString):
    # Rewrite the UTC 'Z' suffix as a fixed -02:00 offset before parsing,
    # presumably to shift the hours into a UTC-2 local zone
    mydt = parse_date(dateString.replace('Z', '-02:00'))
    weekdayId = mydt.tm_wday
    hour = mydt.tm_hour
    if hour >= 8 and hour <= 17:
        time_of_day = 'Morning'
    elif hour > 17 and hour < 22:
        time_of_day = 'Evening'
    else:
        time_of_day = 'Night'
    retobj = {
        'weekdayId': weekdayId,
        'weekdayName': self.enum_weekdays(weekdayId),
        'hour': hour,
        'time_of_day': time_of_day,
    }
    return retobj

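# Example (hypothetical timestamp; assumes enum_weekdays() maps tm_wday 1
# to 'Tuesday'):
#
#   >>> obj.parse_rfc3339_date('2013-05-14T09:30:00Z')
#   {'weekdayId': 1, 'weekdayName': 'Tuesday', 'hour': 11, 'time_of_day': 'Morning'}
#
# The hour comes out as 11, not 9, because the 'Z' suffix is rewritten to
# -02:00 and feedparser then normalizes the struct_time back to UTC.
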
def frequencyOfLegislation():
    print '\natom feed: http://www.legislation.gov.uk/new/data.feed'
    print 'this atom feed was updated at', feed.entries[0].updated, '\n'
    storeTheDay = []
    for x in xrange(0, len(feed.entries)):  # loop through all the entries
        print feed.entries[x].title
        # Get the published date and change it to tuple format
        fullDateTuple = tuple(parse_date(feed.entries[x].published))
        print 'Day:', fullDateTuple[2], 'th'  # print just the day of the month
        # Add each day to the storeTheDay list - this relies on the date
        # format not changing - not good!
        storeTheDay.append(fullDateTuple[2])
    # Make a dict of the values to get the frequency of the days in the
    # storeTheDay list
    frequency = dict((i, storeTheDay.count(i)) for i in storeTheDay)
    now = datetime.datetime.now()
    currentDay = now.day  # keep as an int so it matches the dict keys
    for key in frequency:
        print '\nOn the', key, 'th there were', frequency[key], 'pieces of legislation'
        # One piece every 24/n hours, where n is that day's count
        print 'There is a piece of legislation every', 24.0 / frequency[key], 'hours\n'
        if key == currentDay:
            print 'Today, right now, there is a piece of legislation every', 24.0 / frequency[key], 'hours\n'

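# `feed` is used above but defined at module level elsewhere; presumably it
# is initialized along these lines before frequencyOfLegislation() is called:

feed = feedparser.parse('http://www.legislation.gov.uk/new/data.feed')
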
def parse(self): """Fetches Tumblr API data and parses it.""" self.logger.info("Fetching API data at '%s'" % self.api_url) self.http_response, self.http_content = spider.fetch(self.api_url) self.logger.info("Parsing API data for entries...") t = tumblr.parse(self.api_url) for post in t.posts: try: if post.type == 'regular': self.logger.info("Tumblr post type: regular") e = Post() e.title = post.title e.summary = post.content e.content = post.content elif post.type == 'link': if 'link' in self.excluded_types: self.logger.debug("Skipping Tumblr link") continue else: self.logger.info("Tumblr post type: link") e = Link() e.title = post.title e.summary = post.content e.content = post.content e.url = post.related e.comments = post.url elif post.type == 'quote': self.logger.info("Tumblr post type: quote") e = Quote() e.summary = post.content # Chop the smart quotes that Tumblr automatically # adds to to a quote e.summary = e.summary.lstrip("“").rstrip("”") e.content = e.summary # Get the quote's citation, and, if possible its source e.citation = post.source try: soup = BeautifulSoup(e.citation) e.citation_url = soup.find('a').get('href') e.via = e.citation_url except AttributeError: e.citation_url = None elif post.type == 'photo': self.logger.info("Tumblr post type: photo") e = Photo() e.photo_type = 'tumblr' e.title = '' e.summary = post.caption #e.content = e.summary # post.urls is a dictionary of photo URLs keyed by size. # Let's get the big one. e.photo_url = post.urls['500'] e.cached_url = config.IMAGES_URL + '/' + e._get_cached_original_shortname() self.logger.debug("Tumblr photo URL: '%s'" % e.photo_url) e.cache() e.set_dimensions() e.set_content() # Conversation, Video, and Audio post types aren't # going to be implemented for a while elif post.type == 'conversation': # TODO: Support Tumblr conversations self.logger.info("Tumblr post type: conversation") continue #e = Conversation() elif post.type == 'video': # TODO: Support Tumblr videos self.logger.info("Tumblr post type: video") continue #e = Video() elif post.type == 'audio': # TODO: Support Tumblr audio self.logger.info("Tumblr post type: audio") continue #e = Audio() e.source.name = self.name e.source.url = self.url if e.url == '': e.url = post.url e.author = self.owner e.date = post.date e.date_parsed = parse_date(post.date) self.logger.debug("Tumblr post date: %s" % e.date_as_string(e.date_parsed)) self.logger.info("Entry title: '%s'" % e.title) self.logger.debug("Entry URL: '%s'" % e.url) self.entries.append(e) except AttributeError: # FIXME: Why is this exception handler here??? pass
def getnewbooks():
    """Routine to query cps to get the latest books added.
    Return an array of new books.
    """
    logger.info('Getting new books from server')
    d = feedparser.parse('http://' + config.settings['username'] + ':' +
                         config.settings['password'] + '@' +
                         config.settings['serveraddress'])
    if d.bozo == 1:
        logger.error('Username, password, or server address URL is incorrect.')
        return False
    else:
        logger.info('Name of the feed: ' + d.feed.title)
        logger.info('Looking for books uploaded in the last: ' +
                    str(config.settings['numofdaysfornotification']) + ' days.')
        _thumbnail_uri = u'http://opds-spec.org/image/thumbnail'
        recent_books = []
        if d.status == 200:
            for book in d.entries:
                dt = datetime.fromtimestamp(mktime(parse_date(book.updated)))
                if datetime.now() - dt < timedelta(
                        days=int(config.settings['numofdaysfornotification'])):
                    if 'title' in book:
                        logger.info('Found book. Title: ' + book.title)
                    else:
                        logger.info('Found book. Strange, no Title field!')
                    # Need to get a unique ID for naming the CIDs in the HTML
                    # newsletter - pulling the GUID from the link URL.
                    # While we are here, might as well find out if the book
                    # has a cover.
                    # If the book has a cover:
                    #   set book_cover_id to the GUID
                    #   pull the URL to make it easier to get to
                    #   go get the cover and resize it
                    # If the book doesn't have a cover, set book_cover_id to
                    # 'Unknown.png'.
                    # Also set up book_location - we'll use that in the
                    # newsletter to point to the book.
                    for _entry in book.links:
                        if _entry.rel == _thumbnail_uri:
                            try:
                                book['book_location'] = config.settings[
                                    'serverbookurl'] + (_entry.href.rsplit('/', 1)[1])
                                book_cover_id = book.link.rsplit('/', 1)[1]
                                book["book_cover_id"] = book_cover_id
                                book["cover_thumbnail"] = get_thumbnail(_entry.href)
                                logger.debug(' Book has cover.')
                            except:
                                logger.debug(' Error in getting book cover.')
                                book["book_cover_id"] = "Unknown.png"
                                book['book_location'] = "#"
                    if book.get('book_cover_id', 'nope') == 'nope':
                        logger.debug(' Book has no cover.')
                        book["book_cover_id"] = "Unknown.png"
                    # The book summaries posted with OPDS feeds can be long.
                    # Check the size, and if it's beyond the set limit,
                    # truncate it.
                    try:
                        if len(book['summary']) >= config.settings['SUMMARY_LENGTH']:
                            book['short_summary'] = book['summary'][:config.settings[
                                'SUMMARY_LENGTH']] + "...see site..."
                            logger.debug(' Book summary too long. Being shortened.')
                        elif len(book['summary']) == 0:
                            book['short_summary'] = 'No summary information.'
                            logger.debug(' Book summary does not exist.')
                        else:
                            book['short_summary'] = book["summary"]
                            logger.debug(' Book summary within limit. Used as is.')
                    except:
                        book['short_summary'] = 'No summary information.'
                    # Add the newly added book to the array
                    recent_books.append(book)
            return recent_books
        else:
            logger.error('Error getting OPDS feed! - Please check config. '
                         'Status Code: ' + str(d.status))
            return False

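# get_thumbnail() is called above but not defined in this snippet. A
# minimal sketch, assuming it fetches the cover over HTTP and resizes it
# with PIL (the real helper's target size and return type may differ):

import urllib2
from StringIO import StringIO
from PIL import Image

def get_thumbnail(url, size=(150, 200)):
    """Fetch the image at `url` and return a thumbnail-sized PIL Image."""
    data = urllib2.urlopen(url).read()
    img = Image.open(StringIO(data))
    img.thumbnail(size)  # resizes in place, preserving aspect ratio
    return img
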
def s2ts(s):
    """Convert a date string (any format feedparser understands, e.g.
    RFC 3339) to an integer Unix timestamp."""
    from calendar import timegm
    from feedparser import _parse_date as parse_date
    return int(timegm(parse_date(s)))

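# Example:
#
#   >>> s2ts('2009-03-04T05:06:07Z')
#   1236143167
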
def parse(self): """Gets recent photos from a photostream, caches and thumbnails them.""" self.logger.debug("Contacting Flickr Services") # Using flickrapi's 'etree' options requires ElementTree, # which is standard with Python 2.5, but a separate install with # Python 2.4. The flickrapi module must also be patched # using 'patches/flickrapi.patch' when using Python 2.4. try: flickr = flickrapi.FlickrAPI(config.FLICKR_KEY, format='etree') extras = 'date_upload,date_taken,last_update,owner_name,media,tags,license' self.logger.info("Getting photos for %s" % self.owner) photos = flickr.people_getPublicPhotos( user_id=self.flickr_id, safe_search=SAFESEARCH_RESTRICTED, extras=extras ) for photo in photos: e = Photo() e.photo_type = 'flickr' e.source.name = self.name e.source.url = self.url # This only gets the most recent photo, which is really # a bug, but I like this behavior. Too many photos # clutter things up. p = photo.find('photo') #if p.get('media') == 'video': # self.logger.info("Skipping Flickr video") # continue e.title = p.get('title', 'untitled') if e.title.strip() == '': e.title = 'untitled' self.logger.info("Photo title: '%s'" % e.title) e.photo_id = p.get('id') e.farm_id = p.get('farm') e.secret = p.get('secret') e.server = p.get('server') e.photo_url = e._get_flickr_photo_url( e.farm_id, e.server, e.photo_id, e.secret ) self.logger.debug("Photo image URL: '%s'" % e.photo_url) e.url = e._get_flickr_url(self.flickr_id, e.photo_id) e.cached_url = config.IMAGES_URL + '/' + e._get_cached_original_shortname() self.logger.debug("Photo Flickr page URL: '%s'" % e.url) e.cache() e.set_dimensions() e.date = p.get('dateupload') e.date_parsed = datetime.datetime.utcfromtimestamp(float(e.date)).timetuple() e.published = e.date e.published_parsed = e.date_parsed e.created = p.get('datetaken', e.date) if e.created == e.date: e.created_parsed = e.date_parsed else: e.created_parsed = parse_date(e.created) e.updated = p.get('lastupdate', e.date) e.updated_parsed = datetime.datetime.utcfromtimestamp(float(e.updated)).timetuple() # Okay, now get the detailed photo info self.logger.debug("Making photos.getInfo API call...") photo_info = flickr.photos_getInfo(photo_id=e.photo_id, secret=e.secret) e.summary = photo_info.find('photo').find('description').text if e.summary is None: e.summary = '' e.author = photo_info.find('photo').find('owner').get('realname') if e.author == '': e.author = self.owner e.set_content() self.entries.append(e) except FlickrError, err: self.logger.exception("Flickr API error: '%s'" % err)