def process_facebook_feed(self, feed, full_update=False):
    """Sync one Facebook feed and its posts into local Update rows.

    Fetches the feed's own metadata (likes, picture), then pages through
    the Graph API ``/posts`` edge, creating or refreshing an ``Update``
    row per post.  Personal (non-category) feeds are limited to roughly
    the last two months.  A normal run stops at the first page that
    contains posts we already store; ``full_update=True`` re-processes
    every post it sees and uses larger page sizes.

    feed -- feed model instance (origin_id, picture, is_personal, ...)
    full_update -- when True, refresh already-stored updates as well

    Raises Exception on a video post with no resolvable link or on an
    unknown post type.
    """
    self.logger.info('Processing feed %s' % unicode(feed))
    # First update the feed itself
    feed_info = self._fb_get(feed.origin_id)
    feed.interest = feed_info.get('likes', None)
    if not feed.picture:
        self.logger.info('Fetching picture info')
        picture_info = self._fb_get("%s?fields=picture" % feed.origin_id)
        feed.picture = picture_info.get('picture', {}).get('data', {}).get('url', None)
    # Limit downloading of personal feeds to last two months.
    # Pages carry a 'category' attribute; personal profiles do not.
    if 'category' not in feed_info:
        feed.is_personal = True
        since = datetime.datetime.now() - datetime.timedelta(weeks=2 * 4)
        since = int(time.mktime(since.timetuple()))
        filter_args = "&since=%d" % since
        self.logger.debug('%s is a personal feed' % unicode(feed))
    else:
        feed.is_personal = False
        filter_args = ""
    # Bigger page size when we intend to walk the whole history.
    if full_update:
        count = 100
    else:
        count = 20
    new_count = 0
    url = '%s/posts?limit=%d%s' % (feed.origin_id, count, filter_args)
    while True:
        self.logger.info('Fetching %s' % url)
        g = self._fb_get(url)
        # Did this page contain any post we already store?
        found = False
        for post in g['data']:
            # Sanity check: the post must originate from this feed.
            assert post['from']['id'] == feed.origin_id
            if post['type'] in ('question', 'swf', 'music', 'offer'):
                # We skip these updates for now.
                continue
            if post['type'] == 'status' and 'message' not in post:
                # We're not interested in status updates with no content.
                continue
            try:
                upd = Update.objects.get(feed=feed, origin_id=post['id'])
                found = True
                if not full_update:
                    # Already stored; only refresh during a full update.
                    continue
            except Update.DoesNotExist:
                upd = Update(feed=feed, origin_id=post['id'])
                new_count += 1
            # Store the creation timestamp converted to local time.
            utc = dateutil.parser.parse(post['created_time'])
            upd.created_time = utc.astimezone(dateutil.tz.tzlocal())
            self._set_field_with_len(upd, 'text', post.get('message', None))
            upd.share_link = post.get('link', None)
            upd.picture = post.get('picture', None)
            self._set_field_with_len(upd, 'share_title', post.get('name', None))
            self._set_field_with_len(upd, 'share_caption', post.get('caption', None))
            self._set_field_with_len(upd, 'share_description', post.get('description', None))
            # Drop URLs that would overflow their DB columns.
            if upd.picture and len(upd.picture) > self._get_field_max_len(upd, 'picture'):
                self.logger.warning("%s: Removing too long (%d) picture link" % (upd.origin_id, len(upd.picture)))
                upd.picture = None
            if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                self.logger.warning("%s: Removing too long (%d) link" % (upd.origin_id, len(upd.share_link)))
                upd.share_link = None
            # Normalize an empty status_type to NULL.
            upd.sub_type = post.get('status_type', None) or None
            upd.interest = post.get('likes', {}).get('count', None)
            if post['type'] == 'link':
                upd.type = 'link'
                if not upd.share_link:
                    self.logger.warning("FB %s: No link given for 'link' update" % post['id'])
            elif post['type'] == 'photo':
                upd.type = 'photo'
                assert upd.share_link
                assert upd.picture
            elif post['type'] == 'status':
                upd.type = 'status'
            elif post['type'] == 'video':
                upd.type = 'video'
                if not upd.share_link:
                    # Fall back to the 'source' attribute
                    upd.share_link = post.get('source', None)
                    if not upd.share_link:
                        pprint.pprint(post)
                        raise Exception("%s: No link for 'video' update" % post['id'])
                # The fallback link may also be too long for its column.
                if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                    self.logger.warning("%s: Removing too long link" % upd.origin_id)
                    upd.share_link = None
            else:
                pprint.pprint(post)
                raise Exception("Unknown FB update type: %s" % post['type'])
            upd.save()
        if 'paging' not in g:
            break
        # Extract the 'until' cursor from the provided next-page URL so
        # we can rebuild the URL with our own limit/since arguments.
        next_args = urlparse.parse_qs(urlparse.urlparse(g['paging']['next']).query)
        until = int(next_args['until'][0])
        # If we didn't have any of the updates, get a bigger batch next
        # time.
        if not found:
            count = 100
        elif not full_update:
            # If at least some of the updates were in our DB already,
            # the feed is up-to-date.
            break
        url = "%s/posts?limit=%d&until=%d%s" % (feed.origin_id, count, until, filter_args)
    self.logger.info("%s: %d new updates" % (feed.account_name, new_count))
    feed.update_error_count = 0
    feed.last_update = datetime.datetime.now()
    feed.save()
def process_facebook_feed(self, feed, full_update=False):
    """Sync one Facebook feed and its posts into local Update rows.

    Fetches the feed's metadata (picture, likes, about) in a single
    Graph API call, then pages through the ``/posts`` edge, creating or
    refreshing an ``Update`` row per post.  Personal (non-category)
    feeds are limited to roughly the last two months.  A normal run
    stops at the first page that contains posts we already store;
    ``full_update=True`` re-processes every post it sees and uses
    larger page sizes.

    feed -- feed model instance (origin_id, picture, is_personal, ...)
    full_update -- when True, refresh already-stored updates as well

    Raises Exception on a video post with no resolvable link or on an
    unknown post type.
    """
    self.logger.info('Processing feed %s' % unicode(feed))
    # First update the feed itself
    url = '%s?fields=picture,likes,about' % feed.origin_id
    feed_info = self._fb_get(url)
    feed.picture = feed_info.get('picture', {}).get('data', {}).get('url', None)
    feed.interest = feed_info.get('likes', None)
    # Limit downloading of personal feeds to last two months.
    # Pages carry a 'category' attribute; personal profiles do not.
    if 'category' not in feed_info:
        feed.is_personal = True
        since = datetime.datetime.now() - datetime.timedelta(weeks=2 * 4)
        since = int(time.mktime(since.timetuple()))
        filter_args = "&since=%d" % since
        self.logger.debug('%s is a personal feed' % unicode(feed))
    else:
        feed.is_personal = False
        filter_args = ""
    # Bigger page size when we intend to walk the whole history.
    if full_update:
        count = 100
    else:
        count = 20
    new_count = 0
    url = '%s/posts?limit=%d%s' % (feed.origin_id, count, filter_args)
    while True:
        self.logger.info('Fetching %s' % url)
        g = self._fb_get(url)
        # Did this page contain any post we already store?
        found = False
        for post in g['data']:
            # Sanity check: the post must originate from this feed.
            assert post['from']['id'] == feed.origin_id
            if post['type'] in ('question', 'swf', 'music', 'offer'):
                # We skip these updates for now.
                continue
            if post['type'] == 'status' and 'message' not in post:
                # We're not interested in status updates with no content.
                continue
            try:
                upd = Update.objects.get(feed=feed, origin_id=post['id'])
                found = True
                if not full_update:
                    # Already stored; only refresh during a full update.
                    continue
            except Update.DoesNotExist:
                upd = Update(feed=feed, origin_id=post['id'])
                new_count += 1
            # Store the creation timestamp converted to local time.
            utc = dateutil.parser.parse(post['created_time'])
            upd.created_time = utc.astimezone(dateutil.tz.tzlocal())
            self._set_field_with_len(upd, 'text', post.get('message', None))
            upd.share_link = post.get('link', None)
            upd.picture = post.get('picture', None)
            self._set_field_with_len(upd, 'share_title', post.get('name', None))
            self._set_field_with_len(upd, 'share_caption', post.get('caption', None))
            self._set_field_with_len(upd, 'share_description', post.get('description', None))
            # Drop URLs that would overflow their DB columns.
            if upd.picture and len(upd.picture) > self._get_field_max_len(upd, 'picture'):
                self.logger.warning(
                    "%s: Removing too long (%d) picture link" %
                    (upd.origin_id, len(upd.picture)))
                upd.picture = None
            if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                self.logger.warning("%s: Removing too long (%d) link" %
                                    (upd.origin_id, len(upd.share_link)))
                upd.share_link = None
            # Normalize an empty status_type to NULL.
            upd.sub_type = post.get('status_type', None) or None
            upd.interest = post.get('likes', {}).get('count', None)
            if post['type'] == 'link':
                upd.type = 'link'
                if not upd.share_link:
                    self.logger.warning(
                        "FB %s: No link given for 'link' update" % post['id'])
            elif post['type'] == 'photo':
                upd.type = 'photo'
                assert upd.share_link
                assert upd.picture
            elif post['type'] == 'status':
                upd.type = 'status'
            elif post['type'] == 'video':
                upd.type = 'video'
                if not upd.share_link:
                    # Fall back to the 'source' attribute
                    upd.share_link = post.get('source', None)
                    if not upd.share_link:
                        pprint.pprint(post)
                        raise Exception("%s: No link for 'video' update" % post['id'])
                # The fallback link may also be too long for its column.
                if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                    self.logger.warning("%s: Removing too long link" % upd.origin_id)
                    upd.share_link = None
            else:
                pprint.pprint(post)
                raise Exception("Unknown FB update type: %s" % post['type'])
            upd.save()
        if 'paging' not in g:
            break
        # Extract the 'until' cursor from the provided next-page URL so
        # we can rebuild the URL with our own limit/since arguments.
        next_args = urlparse.parse_qs(
            urlparse.urlparse(g['paging']['next']).query)
        until = int(next_args['until'][0])
        # If we didn't have any of the updates, get a bigger batch next
        # time.
        if not found:
            count = 100
        elif not full_update:
            # If at least some of the updates were in our DB already,
            # the feed is up-to-date.
            break
        url = "%s/posts?limit=%d&until=%d%s" % (feed.origin_id, count, until, filter_args)
    self.logger.info("%s: %d new updates" % (feed.account_name, new_count))
    feed.update_error_count = 0
    feed.last_update = datetime.datetime.now()
    feed.save()