def process_tweet(self, tweet):
    """Store one incoming streamed tweet as an Update on its Feed.

    Skips tweets that are already stored and tweets whose author has no
    matching Twitter Feed.  Also refreshes the feed's metadata from the
    user object embedded in the tweet.
    """
    self.logger.debug("incoming tweet with id %s" % tweet['id'])
    try:
        # Already stored?  Nothing to do.
        Update.objects.get(feed__type="TW", origin_id=tweet['id'])
        return
    except Update.DoesNotExist:
        pass
    tw_user = tweet['user']
    try:
        feed = Feed.objects.get(type="TW", origin_id=tw_user['id'])
    except Feed.DoesNotExist:
        self.logger.debug("feed %s (%s) not found" % (tw_user['id'], tw_user['screen_name']))
        return
    self.logger.debug("tweet for feed %s" % unicode(feed))
    # Refresh feed metadata from the embedded user object.
    feed.last_update = datetime.datetime.now()
    feed.interest = tw_user['followers_count']
    feed.picture = tw_user['profile_image_url']
    feed.account_name = tw_user['screen_name']
    feed.save()
    tw_obj = Update(feed=feed, origin_id=tweet['id'])
    text = tweet['text']
    # The Twitter API HTML-escapes these characters; decode them back.
    tw_obj.text = text.replace('&gt;', '>').replace('&lt;', '<').replace('&#39;', "'")
    # created_at is an RFC 2822 date string; convert to local naive datetime.
    date = calendar.timegm(email.utils.parsedate(tweet['created_at']))
    tw_obj.created_time = datetime.datetime.fromtimestamp(date)
    try:
        tw_obj.save()
    except Exception as e:
        self.logger.error(str(e))
        raise
def process_twitter_timeline(self, twitter, feed):
    """Fetch all tweets newer than our newest stored one and save them.

    Pages backwards through the user's timeline (via ``max_id``) until a
    tweet already present in the database is seen, then saves the
    collected tweets oldest-first.
    """
    self.stdout.write("Processing %s\n" % feed.account_name)
    user_id = feed.origin_id
    args = {'user_id': user_id, 'username': user_id, 'count': 100,
            'trim_user': True}
    tw_list = []
    while True:
        # Retry up to two times if the twitter call fails.
        for i in range(3):
            try:
                tweets = twitter.getUserTimeline(**args)
            except ValueError:
                if i == 2:
                    raise
                self.stderr.write("\tGot exception, retrying.\n")
                continue
            break
        if 'error' in tweets:
            self.stderr.write("\tERROR: %s\n" % tweets['error'])
            # Rate limiting is transient; only other errors count as failures.
            if 'Rate limit exceeded' not in tweets['error']:
                feed.update_error_count += 1
                feed.save()
            return
        if not len(tweets):
            break
        for tw in tweets:
            try:
                Update.objects.get(feed=feed, origin_id=tw['id'])
            except Update.DoesNotExist:
                # Not stored yet; prepend so tw_list stays oldest-first.
                tw_list.insert(0, tw)
            else:
                # Found a stored tweet -> everything older is stored too.
                break
        else:
            # Whole page was new: fetch the next (older) page.
            args['max_id'] = tw_list[0]['id'] - 1
            continue
        break
    self.stdout.write("\tNew tweets: %d\n" % len(tw_list))
    for tw in tw_list:
        mp_tw = Update(feed=feed)
        mp_tw.origin_id = tw['id']
        text = tw['text']
        # The Twitter API HTML-escapes these characters; decode them back.
        mp_tw.text = text.replace('&gt;', '>').replace('&lt;', '<').replace('&#39;', "'")
        date = calendar.timegm(email.utils.parsedate(tw['created_at']))
        mp_tw.created_time = datetime.datetime.fromtimestamp(date)
        try:
            mp_tw.save()
        except Exception:
            # Dump the offending tweet for post-mortem, then re-raise.
            self.stderr.write("%s\n" % str(tw))
            raise
    feed.last_update = datetime.datetime.now()
    feed.save()
def process_twitter_timeline(self, twitter, feed):
    """Fetch all tweets newer than our newest stored one and save them.

    Pages backwards through the user's timeline (via ``max_id``) until a
    tweet already present in the database is seen, then saves the
    collected tweets oldest-first.
    """
    self.stdout.write("Processing %s\n" % feed.account_name)
    user_id = feed.origin_id
    args = {"user_id": user_id, "username": user_id, "count": 100,
            "trim_user": True}
    tw_list = []
    while True:
        tweets = twitter.getUserTimeline(**args)
        if "error" in tweets:
            self.stderr.write("\tERROR: %s\n" % tweets["error"])
            # Rate limiting is transient; only other errors count as failures.
            if "Rate limit exceeded" not in tweets["error"]:
                feed.update_error_count += 1
                feed.save()
            return
        if not len(tweets):
            break
        for tw in tweets:
            try:
                Update.objects.get(feed=feed, origin_id=tw["id"])
            except Update.DoesNotExist:
                # Not stored yet; prepend so tw_list stays oldest-first.
                tw_list.insert(0, tw)
            else:
                # Found a stored tweet -> everything older is stored too.
                break
        else:
            # Whole page was new: fetch the next (older) page.
            args["max_id"] = tw_list[0]["id"] - 1
            continue
        break
    self.stdout.write("\tNew tweets: %d\n" % len(tw_list))
    for tw in tw_list:
        mp_tw = Update(feed=feed)
        mp_tw.origin_id = tw["id"]
        text = tw["text"]
        # The Twitter API HTML-escapes these characters; decode them back.
        mp_tw.text = text.replace("&gt;", ">").replace("&lt;", "<").replace("&#39;", "'")
        date = calendar.timegm(email.utils.parsedate(tw["created_at"]))
        mp_tw.created_time = datetime.datetime.fromtimestamp(date)
        try:
            mp_tw.save()
        except Exception:
            # Dump the offending tweet for post-mortem, then re-raise.
            self.stderr.write("%s\n" % str(tw))
            raise
    feed.last_update = datetime.datetime.now()
    feed.save()
def process_facebook_feed(self, feed, full_update=False):
    """Sync a Facebook feed: refresh its metadata and import new posts.

    Pages backwards through the Graph API ``/posts`` edge until an
    already-stored post is found (or, with ``full_update``, until the
    feed is exhausted).  Personal feeds (no 'category' in the feed info)
    are limited to roughly the last two months.
    """
    self.logger.info('Processing feed %s' % unicode(feed))
    # First update the feed itself
    feed_info = self._fb_get(feed.origin_id)
    feed.interest = feed_info.get('likes', None)
    if not feed.picture:
        self.logger.info('Fetching picture info')
        picture_info = self._fb_get("%s?fields=picture" % feed.origin_id)
        feed.picture = picture_info.get('picture', {}).get('data', {}).get('url', None)
    # Limit downloading of personal feeds to last two months.
    if 'category' not in feed_info:
        feed.is_personal = True
        since = datetime.datetime.now() - datetime.timedelta(weeks=2 * 4)
        since = int(time.mktime(since.timetuple()))
        filter_args = "&since=%d" % since
        self.logger.debug('%s is a personal feed' % unicode(feed))
    else:
        feed.is_personal = False
        filter_args = ""
    if full_update:
        count = 100
    else:
        count = 20
    new_count = 0
    url = '%s/posts?limit=%d%s' % (feed.origin_id, count, filter_args)
    while True:
        self.logger.info('Fetching %s' % url)
        g = self._fb_get(url)
        found = False
        for post in g['data']:
            # Sanity check
            assert post['from']['id'] == feed.origin_id
            if post['type'] in ('question', 'swf', 'music', 'offer'):
                # We skip these updates for now.
                continue
            if post['type'] == 'status' and 'message' not in post:
                # We're not interested in status updates with no content.
                continue
            try:
                upd = Update.objects.get(feed=feed, origin_id=post['id'])
                found = True
                if not full_update:
                    # Known post and we are only topping up: skip re-import.
                    continue
            except Update.DoesNotExist:
                upd = Update(feed=feed, origin_id=post['id'])
                new_count += 1
            utc = dateutil.parser.parse(post['created_time'])
            upd.created_time = utc.astimezone(dateutil.tz.tzlocal())
            self._set_field_with_len(upd, 'text', post.get('message', None))
            upd.share_link = post.get('link', None)
            upd.picture = post.get('picture', None)
            self._set_field_with_len(upd, 'share_title', post.get('name', None))
            self._set_field_with_len(upd, 'share_caption', post.get('caption', None))
            self._set_field_with_len(upd, 'share_description', post.get('description', None))
            # Drop values that would not fit in their DB columns.
            if upd.picture and len(upd.picture) > self._get_field_max_len(upd, 'picture'):
                self.logger.warning("%s: Removing too long (%d) picture link"
                                    % (upd.origin_id, len(upd.picture)))
                upd.picture = None
            if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                self.logger.warning("%s: Removing too long (%d) link"
                                    % (upd.origin_id, len(upd.share_link)))
                upd.share_link = None
            sub_type = post.get('status_type', None)
            if sub_type:
                upd.sub_type = sub_type
            else:
                upd.sub_type = None
            upd.interest = post.get('likes', {}).get('count', None)
            if post['type'] == 'link':
                upd.type = 'link'
                if not upd.share_link:
                    self.logger.warning("FB %s: No link given for 'link' update" % post['id'])
            elif post['type'] == 'photo':
                upd.type = 'photo'
                assert upd.share_link
                assert upd.picture
            elif post['type'] == 'status':
                upd.type = 'status'
            elif post['type'] == 'video':
                upd.type = 'video'
                if not upd.share_link:
                    # Fall back to the 'source' attribute
                    upd.share_link = post.get('source', None)
                if not upd.share_link:
                    pprint.pprint(post)
                    raise Exception("%s: No link for 'video 'update" % post['id'])
                if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                    self.logger.warning("%s: Removing too long link" % upd.origin_id)
                    upd.share_link = None
            else:
                pprint.pprint(post)
                raise Exception("Unknown FB update type: %s" % post['type'])
            upd.save()
        if 'paging' not in g:
            break
        next_args = urlparse.parse_qs(urlparse.urlparse(g['paging']['next']).query)
        until = int(next_args['until'][0])
        # If we didn't have any of the updates, get a bigger batch next
        # time.
        if not found:
            count = 100
        elif not full_update:
            # If at least some of the updates were in our DB already,
            # the feed is up-to-date.
            break
        url = "%s/posts?limit=%d&until=%d%s" % (feed.origin_id, count, until, filter_args)
    self.logger.info("%s: %d new updates" % (feed.account_name, new_count))
    feed.update_error_count = 0
    feed.last_update = datetime.datetime.now()
    feed.save()
def process_twitter_feed(self, feed):
    """Sync a Twitter feed: refresh its metadata and import new tweets.

    Raises UpdateError on Twitter API failures; ``can_continue`` /
    ``feed_ok`` flags tell the caller whether the run may proceed.
    """
    self.logger.info("Processing Twitter feed %s" % feed.account_name)
    user_id = feed.origin_id
    args = {'user_id': user_id, 'username': user_id, 'count': 200,
            'trim_user': True}
    tw_list = []
    try:
        info = self.twitter.show_user(**args)
    except TwythonError as e:
        if 'Unauthorized:' in e.msg:
            raise UpdateError(e.msg, can_continue=True)
        self.logger.error("Got Twitter exception: %s" % e)
        if "Rate limit exceeded" in e.msg:
            raise UpdateError("Rate limit exceeded", can_continue=False, feed_ok=True)
        raise UpdateError(e.msg)
    # Refresh feed metadata from the user info.
    feed.interest = info['followers_count']
    feed.picture = info['profile_image_url']
    feed.account_name = info['screen_name']
    while True:
        # Retry up to two times if the twitter call fails.
        for i in range(3):
            try:
                tweets = self.twitter.get_user_timeline(**args)
            except ValueError:
                if i == 2:
                    raise
                self.logger.warning("Got exception, retrying.")
                continue
            except TwythonError as e:
                self.logger.error("Got Twitter exception: %s" % e)
                if 'Unauthorized:' in e.msg:
                    raise UpdateError(e.msg, can_continue=True)
                if "Rate limit exceeded" in e.msg:
                    raise UpdateError("Rate limit exceeded", can_continue=False, feed_ok=True)
                raise UpdateError(e.msg)
            break
        if 'error' in tweets:
            self.logger.error("%s" % tweets['error'])
            if 'Rate limit exceeded' not in tweets['error']:
                self.logger.error("Twitter error: %s" % tweets['error'])
                raise UpdateError(tweets['error'])
            else:
                self.logger.error("Twitter rate limit exceeded")
                raise UpdateError(tweets['error'])
        if not len(tweets):
            break
        for tw in tweets:
            try:
                Update.objects.get(feed=feed, origin_id=tw['id'])
            except Update.DoesNotExist:
                # Not stored yet; prepend so tw_list stays oldest-first.
                tw_list.insert(0, tw)
            else:
                # Found a stored tweet -> everything older is stored too.
                break
        else:
            # Whole page was new: fetch the next (older) page.
            args['max_id'] = tw_list[0]['id'] - 1
            continue
        break
    self.logger.debug("New tweets: %d" % len(tw_list))
    for tw in tw_list:
        tw_obj = Update(feed=feed)
        tw_obj.origin_id = tw['id']
        text = tw['text']
        # The Twitter API HTML-escapes these characters; decode them back.
        tw_obj.text = text.replace('&gt;', '>').replace('&lt;', '<').replace('&#39;', "'")
        date = calendar.timegm(email.utils.parsedate(tw['created_at']))
        tw_obj.created_time = datetime.datetime.fromtimestamp(date)
        try:
            tw_obj.save()
        except Exception as e:
            self.logger.error(str(e))
            raise
    feed.update_error_count = 0
    feed.last_update = datetime.datetime.now()
    feed.save()
def process_facebook_feed(self, feed, full_update=False):
    """Sync a Facebook feed: refresh its metadata and import new posts.

    Pages backwards through the Graph API ``/posts`` edge until an
    already-stored post is found (or, with ``full_update``, until the
    feed is exhausted).  Personal feeds (no 'category' in the feed info)
    are limited to roughly the last two months.
    """
    self.logger.info('Processing feed %s' % unicode(feed))
    # First update the feed itself
    url = '%s?fields=picture,likes,about' % feed.origin_id
    feed_info = self._fb_get(url)
    feed.picture = feed_info.get('picture', {}).get('data', {}).get('url', None)
    feed.interest = feed_info.get('likes', None)
    # Limit downloading of personal feeds to last two months.
    if 'category' not in feed_info:
        feed.is_personal = True
        since = datetime.datetime.now() - datetime.timedelta(weeks=2 * 4)
        since = int(time.mktime(since.timetuple()))
        filter_args = "&since=%d" % since
        self.logger.debug('%s is a personal feed' % unicode(feed))
    else:
        feed.is_personal = False
        filter_args = ""
    if full_update:
        count = 100
    else:
        count = 20
    new_count = 0
    url = '%s/posts?limit=%d%s' % (feed.origin_id, count, filter_args)
    while True:
        self.logger.info('Fetching %s' % url)
        g = self._fb_get(url)
        found = False
        for post in g['data']:
            # Sanity check
            assert post['from']['id'] == feed.origin_id
            if post['type'] in ('question', 'swf', 'music', 'offer'):
                # We skip these updates for now.
                continue
            if post['type'] == 'status' and 'message' not in post:
                # We're not interested in status updates with no content.
                continue
            try:
                upd = Update.objects.get(feed=feed, origin_id=post['id'])
                found = True
                if not full_update:
                    # Known post and we are only topping up: skip re-import.
                    continue
            except Update.DoesNotExist:
                upd = Update(feed=feed, origin_id=post['id'])
                new_count += 1
            utc = dateutil.parser.parse(post['created_time'])
            upd.created_time = utc.astimezone(dateutil.tz.tzlocal())
            self._set_field_with_len(upd, 'text', post.get('message', None))
            upd.share_link = post.get('link', None)
            upd.picture = post.get('picture', None)
            self._set_field_with_len(upd, 'share_title', post.get('name', None))
            self._set_field_with_len(upd, 'share_caption', post.get('caption', None))
            self._set_field_with_len(upd, 'share_description', post.get('description', None))
            # Drop values that would not fit in their DB columns.
            if upd.picture and len(upd.picture) > self._get_field_max_len(upd, 'picture'):
                self.logger.warning(
                    "%s: Removing too long (%d) picture link"
                    % (upd.origin_id, len(upd.picture)))
                upd.picture = None
            if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                self.logger.warning("%s: Removing too long (%d) link"
                                    % (upd.origin_id, len(upd.share_link)))
                upd.share_link = None
            sub_type = post.get('status_type', None)
            if sub_type:
                upd.sub_type = sub_type
            else:
                upd.sub_type = None
            upd.interest = post.get('likes', {}).get('count', None)
            if post['type'] == 'link':
                upd.type = 'link'
                if not upd.share_link:
                    self.logger.warning(
                        "FB %s: No link given for 'link' update" % post['id'])
            elif post['type'] == 'photo':
                upd.type = 'photo'
                assert upd.share_link
                assert upd.picture
            elif post['type'] == 'status':
                upd.type = 'status'
            elif post['type'] == 'video':
                upd.type = 'video'
                if not upd.share_link:
                    # Fall back to the 'source' attribute
                    upd.share_link = post.get('source', None)
                if not upd.share_link:
                    pprint.pprint(post)
                    raise Exception("%s: No link for 'video 'update" % post['id'])
                if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                    self.logger.warning("%s: Removing too long link" % upd.origin_id)
                    upd.share_link = None
            else:
                pprint.pprint(post)
                raise Exception("Unknown FB update type: %s" % post['type'])
            upd.save()
        if 'paging' not in g:
            break
        next_args = urlparse.parse_qs(urlparse.urlparse(g['paging']['next']).query)
        until = int(next_args['until'][0])
        # If we didn't have any of the updates, get a bigger batch next
        # time.
        if not found:
            count = 100
        elif not full_update:
            # If at least some of the updates were in our DB already,
            # the feed is up-to-date.
            break
        url = "%s/posts?limit=%d&until=%d%s" % (feed.origin_id, count, until, filter_args)
    self.logger.info("%s: %d new updates" % (feed.account_name, new_count))
    feed.update_error_count = 0
    feed.last_update = datetime.datetime.now()
    feed.save()
def process_twitter_feed(self, feed):
    """Sync a Twitter feed: refresh its metadata and import new tweets.

    Raises UpdateError on Twitter API failures; ``can_continue`` /
    ``feed_ok`` flags tell the caller whether the run may proceed.
    """
    self.logger.info("Processing Twitter feed %s" % feed.account_name)
    user_id = feed.origin_id
    args = {
        'user_id': user_id,
        'username': user_id,
        'count': 200,
        'trim_user': True
    }
    tw_list = []
    try:
        info = self.twitter.showUser(**args)
    except TwythonError as e:
        if 'Unauthorized:' in e.msg:
            raise UpdateError(e.msg, can_continue=True)
        self.logger.error("Got Twitter exception: %s" % e)
        if "Rate limit exceeded" in e.msg:
            raise UpdateError("Rate limit exceeded", can_continue=False,
                              feed_ok=True)
        raise UpdateError(e.msg)
    # Refresh feed metadata from the user info.
    feed.interest = info['followers_count']
    feed.picture = info['profile_image_url']
    feed.account_name = info['screen_name']
    while True:
        # Retry up to two times if the twitter call fails.
        for i in range(3):
            try:
                tweets = self.twitter.getUserTimeline(**args)
            except ValueError:
                if i == 2:
                    raise
                self.logger.warning("Got exception, retrying.")
                continue
            except TwythonError as e:
                self.logger.error("Got Twitter exception: %s" % e)
                if 'Unauthorized:' in e.msg:
                    raise UpdateError(e.msg, can_continue=True)
                if "Rate limit exceeded" in e.msg:
                    raise UpdateError("Rate limit exceeded", can_continue=False,
                                      feed_ok=True)
                raise UpdateError(e.msg)
            break
        if 'error' in tweets:
            self.logger.error("%s" % tweets['error'])
            if 'Rate limit exceeded' not in tweets['error']:
                self.logger.error("Twitter error: %s" % tweets['error'])
                raise UpdateError(tweets['error'])
            else:
                self.logger.error("Twitter rate limit exceeded")
                raise UpdateError(tweets['error'])
        if not len(tweets):
            break
        for tw in tweets:
            try:
                Update.objects.get(feed=feed, origin_id=tw['id'])
            except Update.DoesNotExist:
                # Not stored yet; prepend so tw_list stays oldest-first.
                tw_list.insert(0, tw)
            else:
                # Found a stored tweet -> everything older is stored too.
                break
        else:
            # Whole page was new: fetch the next (older) page.
            args['max_id'] = tw_list[0]['id'] - 1
            continue
        break
    self.logger.debug("New tweets: %d" % len(tw_list))
    for tw in tw_list:
        tw_obj = Update(feed=feed)
        tw_obj.origin_id = tw['id']
        text = tw['text']
        # The Twitter API HTML-escapes these characters; decode them back.
        tw_obj.text = text.replace('&gt;', '>').replace('&lt;', '<').replace('&#39;', "'")
        date = calendar.timegm(email.utils.parsedate(tw['created_at']))
        tw_obj.created_time = datetime.datetime.fromtimestamp(date)
        try:
            tw_obj.save()
        except Exception as e:
            self.logger.error(str(e))
            raise
    feed.update_error_count = 0
    feed.last_update = datetime.datetime.now()
    feed.save()