Example #1
0
 def process_tweet(self, tweet):
     """Store one incoming tweet as an ``Update`` of its author's feed.

     ``tweet`` is indexed like a mapping with the keys ``id``, ``user``,
     ``text`` and ``created_at``; ``tweet['user']`` must provide ``id``,
     ``screen_name``, ``followers_count`` and ``profile_image_url``.

     Nothing is stored when an update with the same origin id already
     exists for a "TW" feed, or when no "TW" feed matches the author.
     Otherwise the feed's metadata is refreshed and saved before the new
     update is created. An exception raised while saving the update is
     logged and re-raised.
     """
     self.logger.debug("incoming tweet with id %s", tweet['id'])
     try:
         Update.objects.get(feed__type="TW", origin_id=tweet['id'])
     except Update.DoesNotExist:
         pass
     else:
         # The tweet is already stored; nothing to do.
         return

     tw_user = tweet['user']
     try:
         feed = Feed.objects.get(type="TW", origin_id=tw_user['id'])
     except Feed.DoesNotExist:
         self.logger.debug("feed %s (%s) not found",
                           tw_user['id'], tw_user['screen_name'])
         return

     self.logger.debug("tweet for feed %s", unicode(feed))
     feed.last_update = datetime.datetime.now()
     feed.interest = tw_user['followers_count']
     feed.picture = tw_user['profile_image_url']
     feed.account_name = tw_user['screen_name']
     feed.save()

     tw_obj = Update(feed=feed, origin_id=tweet['id'])
     # Decode the HTML entities used in tweet text. '&amp;' has to be
     # handled last so that e.g. '&amp;lt;' ends up as the literal '&lt;'
     # instead of being decoded twice.
     text = tweet['text']
     for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&#39;', "'"),
                          ('&amp;', '&')):
         text = text.replace(entity, char)
     tw_obj.text = text
     # parsedate() discards the zone offset and timegm() reads the tuple
     # as UTC, so this assumes 'created_at' is expressed in UTC; the
     # result is a naive datetime in local time.
     date = calendar.timegm(email.utils.parsedate(tweet['created_at']))
     tw_obj.created_time = datetime.datetime.fromtimestamp(date)
     try:
         tw_obj.save()
     except Exception:
         # logger.exception() records the traceback and avoids str(e),
         # which can itself raise on Python 2 when the message contains
         # non-ASCII text and would then hide the original error.
         self.logger.exception("failed to save tweet %s", tweet['id'])
         raise
Example #2
0
 def process_tweet(self, tweet):
     """Save a single incoming tweet as an ``Update`` for a known feed.

     ``tweet`` is read through the keys ``id``, ``user``, ``text`` and
     ``created_at``; the ``user`` entry is read through ``id``,
     ``screen_name``, ``followers_count`` and ``profile_image_url``.

     The method returns without storing anything when the tweet already
     exists as an update of a "TW" feed or when the author has no "TW"
     feed. Otherwise the feed's metadata is updated and saved, and the
     tweet is stored. A failure to save the update is logged and
     re-raised.
     """
     self.logger.debug("incoming tweet with id %s", tweet['id'])
     try:
         Update.objects.get(feed__type="TW", origin_id=tweet['id'])
     except Update.DoesNotExist:
         pass
     else:
         # Known tweet: skip it.
         return

     tw_user = tweet['user']
     try:
         feed = Feed.objects.get(type="TW", origin_id=tw_user['id'])
     except Feed.DoesNotExist:
         self.logger.debug("feed %s (%s) not found",
                           tw_user['id'], tw_user['screen_name'])
         return

     self.logger.debug("tweet for feed %s", unicode(feed))
     feed.last_update = datetime.datetime.now()
     feed.interest = tw_user['followers_count']
     feed.picture = tw_user['profile_image_url']
     feed.account_name = tw_user['screen_name']
     feed.save()

     tw_obj = Update(feed=feed, origin_id=tweet['id'])
     # Undo the HTML escaping of the tweet text. '&amp;' is decoded last
     # so that an escaped entity such as '&amp;gt;' becomes the literal
     # text '&gt;' rather than '>'.
     text = tweet['text']
     for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&#39;', "'"),
                          ('&amp;', '&')):
         text = text.replace(entity, char)
     tw_obj.text = text
     # The zone offset is dropped by parsedate() and the tuple is read as
     # UTC by timegm() (assumes 'created_at' is in UTC); fromtimestamp()
     # then yields a naive local datetime.
     date = calendar.timegm(email.utils.parsedate(tweet['created_at']))
     tw_obj.created_time = datetime.datetime.fromtimestamp(date)
     try:
         tw_obj.save()
     except Exception:
         # Avoid str(e): on Python 2 it may raise for non-ASCII messages
         # and mask the real failure. logger.exception() also keeps the
         # traceback.
         self.logger.exception("failed to save tweet %s", tweet['id'])
         raise
Example #3
0
 def process_twitter_timeline(self, twitter, feed):
     """Fetch the tweets of ``feed`` that are not stored yet and save them.

     The timeline is read page by page with ``twitter.getUserTimeline``
     until a tweet already stored as an ``Update`` of ``feed`` is found
     or an empty page is returned. Each call is attempted up to three
     times when it raises ``ValueError``; the third failure propagates.

     If a response contains an ``error`` entry, it is written to stderr,
     ``feed.update_error_count`` is incremented (unless the error
     mentions "Rate limit exceeded") and the method returns without
     saving any tweet. Otherwise the collected tweets are saved in the
     reverse of the order in which they were received, and
     ``feed.last_update`` is set to the current time.
     """
     self.stdout.write("Processing %s\n" % feed.account_name)
     user_id = feed.origin_id
     args = {'user_id': user_id, 'username': user_id, 'count': 100,
             'trim_user': True}
     tw_list = []
     while True:
         # retry two times if the twitter call fails
         for i in range(3):
             try:
                 tweets = twitter.getUserTimeline(**args)
             except ValueError:
                 if i == 2:
                     raise
                 self.stderr.write("\tGot exception, retrying.\n")
                 continue
             break
         if 'error' in tweets:
             self.stderr.write("\tERROR: %s\n" % tweets['error'])
             if 'Rate limit exceeded' not in tweets['error']:
                 feed.update_error_count += 1
                 feed.save()
             return
         if not tweets:
             break
         for tw in tweets:
             try:
                 Update.objects.get(feed=feed, origin_id=tw['id'])
             except Update.DoesNotExist:
                 # Prepending reverses the received order (presumably
                 # newest first, so the list ends up oldest first).
                 tw_list.insert(0, tw)
             else:
                 # Reached a tweet we already have: stop collecting.
                 break
         else:
             # Every tweet on this page was new; ask for the page that
             # precedes the last tweet received.
             args['max_id'] = tw_list[0]['id'] - 1
             continue
         break
     self.stdout.write("\tNew tweets: %d\n" % len(tw_list))
     for tw in tw_list:
         mp_tw = Update(feed=feed)
         mp_tw.origin_id = tw['id']
         # Decode HTML entities; '&amp;' goes last so that '&amp;lt;'
         # becomes the literal '&lt;' and is not decoded twice.
         text = tw['text']
         for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&#39;', "'"),
                              ('&amp;', '&')):
             text = text.replace(entity, char)
         mp_tw.text = text
         # timegm() reads the parsed tuple as UTC (parsedate() drops the
         # zone offset); the stored value is a naive local datetime.
         date = calendar.timegm(email.utils.parsedate(tw['created_at']))
         mp_tw.created_time = datetime.datetime.fromtimestamp(date)
         try:
             mp_tw.save()
         except Exception:
             # Dump the offending tweet before letting the error through.
             self.stderr.write("%s\n" % str(tw))
             raise
     feed.last_update = datetime.datetime.now()
     feed.save()
Example #4
0
 def process_twitter_timeline(self, twitter, feed):
     """Fetch the tweets of ``feed`` that are not stored yet and save them.

     Pages are requested with ``twitter.getUserTimeline`` until a tweet
     that already exists as an ``Update`` of ``feed`` is met or an empty
     page comes back.

     If a response contains an ``error`` entry, it is written to stderr,
     ``feed.update_error_count`` is incremented (unless the error
     mentions "Rate limit exceeded") and the method returns without
     saving any tweet. Otherwise the collected tweets are saved in the
     reverse of the order in which they were received, and
     ``feed.last_update`` is set to the current time.
     """
     self.stdout.write("Processing %s\n" % feed.account_name)
     user_id = feed.origin_id
     args = {"user_id": user_id, "username": user_id, "count": 100, "trim_user": True}
     tw_list = []
     while True:
         tweets = twitter.getUserTimeline(**args)
         if "error" in tweets:
             self.stderr.write("\tERROR: %s\n" % tweets["error"])
             if "Rate limit exceeded" not in tweets["error"]:
                 feed.update_error_count += 1
                 feed.save()
             return
         if not tweets:
             break
         for tw in tweets:
             try:
                 Update.objects.get(feed=feed, origin_id=tw["id"])
             except Update.DoesNotExist:
                 # Prepending reverses the received order (presumably
                 # newest first, so the list ends up oldest first).
                 tw_list.insert(0, tw)
             else:
                 # Reached a tweet we already have: stop collecting.
                 break
         else:
             # The whole page was new; continue with the page preceding
             # the last tweet received.
             args["max_id"] = tw_list[0]["id"] - 1
             continue
         break
     self.stdout.write("\tNew tweets: %d\n" % len(tw_list))
     for tw in tw_list:
         mp_tw = Update(feed=feed)
         mp_tw.origin_id = tw["id"]
         # Decode HTML entities, including the apostrophe and ampersand.
         # '&amp;' goes last so that '&amp;lt;' becomes the literal
         # '&lt;' and is not decoded twice.
         text = tw["text"]
         for entity, char in (("&gt;", ">"), ("&lt;", "<"), ("&#39;", "'"),
                              ("&amp;", "&")):
             text = text.replace(entity, char)
         mp_tw.text = text
         # timegm() reads the parsed tuple as UTC (parsedate() drops the
         # zone offset); the stored value is a naive local datetime.
         date = calendar.timegm(email.utils.parsedate(tw["created_at"]))
         mp_tw.created_time = datetime.datetime.fromtimestamp(date)
         try:
             mp_tw.save()
         except Exception:
             # Dump the offending tweet before letting the error through.
             self.stderr.write("%s\n" % str(tw))
             raise
     feed.last_update = datetime.datetime.now()
     feed.save()
Example #5
0
    def process_facebook_feed(self, feed, full_update=False):
        """Refresh ``feed`` and store its Facebook posts as ``Update`` rows.

        The feed's ``interest`` and ``is_personal`` are refreshed first, and
        its ``picture`` is fetched only when it is not set yet. Posts are
        then fetched page by page from ``<origin_id>/posts`` through
        ``self._fb_get``. Posts of type 'question', 'swf', 'music' and
        'offer', and 'status' posts without a message, are skipped.

        Args:
            feed: the feed to update; ``origin_id``, ``picture`` and
                ``account_name`` are read from it.
            full_update: when true, pages of 100 posts are requested,
                posts that are already stored are rewritten and paging
                continues until no further page is available. Otherwise
                the first page holds 20 posts, stored posts are left
                untouched and paging stops after the first page that
                contains a stored post.

        Raises:
            AssertionError: if a post does not originate from the feed, or
                a 'photo' post lacks a link or picture.
            Exception: for an unknown post type, or a 'video' post with
                neither 'link' nor 'source'.
        """
        self.logger.info('Processing feed %s', unicode(feed))

        # First update the feed itself
        feed_info = self._fb_get(feed.origin_id)
        feed.interest = feed_info.get('likes', None)
        if not feed.picture:
            self.logger.info('Fetching picture info')
            picture_info = self._fb_get("%s?fields=picture" % feed.origin_id)
            feed.picture = picture_info.get('picture', {}).get('data', {}).get('url', None)

        # Limit downloading of personal feeds to last two months.
        # A feed whose info has no 'category' entry is treated as personal.
        if 'category' not in feed_info:
            feed.is_personal = True
            since = datetime.datetime.now() - datetime.timedelta(weeks=2*4)
            since = int(time.mktime(since.timetuple()))
            filter_args = "&since=%d" % since
            self.logger.debug('%s is a personal feed', unicode(feed))
        else:
            feed.is_personal = False
            filter_args = ""

        if full_update:
            count = 100
        else:
            count = 20
        new_count = 0
        url = '%s/posts?limit=%d%s' % (feed.origin_id, count, filter_args)
        while True:
            self.logger.info('Fetching %s', url)
            g = self._fb_get(url)
            # Whether this page contained at least one stored post.
            found = False
            for post in g['data']:
                # Sanity check
                assert post['from']['id'] == feed.origin_id
                if post['type'] in ('question', 'swf', 'music', 'offer'):
                    # We skip these updates for now.
                    continue
                if post['type'] == 'status' and 'message' not in post:
                    # We're not interested in status updates with no content.
                    continue
                try:
                    upd = Update.objects.get(feed=feed, origin_id=post['id'])
                    found = True
                    if not full_update:
                        continue
                except Update.DoesNotExist:
                    upd = Update(feed=feed, origin_id=post['id'])
                    new_count += 1

                utc = dateutil.parser.parse(post['created_time'])
                upd.created_time = utc.astimezone(dateutil.tz.tzlocal())
                self._set_field_with_len(upd, 'text', post.get('message', None))
                upd.share_link = post.get('link', None)
                upd.picture = post.get('picture', None)
                self._set_field_with_len(upd, 'share_title', post.get('name', None))
                self._set_field_with_len(upd, 'share_caption', post.get('caption', None))
                self._set_field_with_len(upd, 'share_description', post.get('description', None))
                # Links that do not fit their field are dropped rather
                # than stored in part.
                if upd.picture and len(upd.picture) > self._get_field_max_len(upd, 'picture'):
                    self.logger.warning("%s: Removing too long (%d) picture link",
                                        upd.origin_id, len(upd.picture))
                    upd.picture = None
                if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                    self.logger.warning("%s: Removing too long (%d) link",
                                        upd.origin_id, len(upd.share_link))
                    upd.share_link = None
                # A missing or empty status type is stored as None.
                upd.sub_type = post.get('status_type', None) or None
                upd.interest = post.get('likes', {}).get('count', None)
                if post['type'] == 'link':
                    upd.type = 'link'
                    if not upd.share_link:
                        self.logger.warning("FB %s: No link given for 'link' update",
                                            post['id'])
                elif post['type'] == 'photo':
                    upd.type = 'photo'
                    assert upd.share_link
                    assert upd.picture
                elif post['type'] == 'status':
                    upd.type = 'status'
                elif post['type'] == 'video':
                    upd.type = 'video'
                    if not upd.share_link:
                        # Fall back to the 'source' attribute
                        upd.share_link = post.get('source', None)
                        if not upd.share_link:
                            pprint.pprint(post)
                            raise Exception("%s: No link for 'video 'update" % post['id'])
                        if upd.share_link and len(upd.share_link) > self._get_field_max_len(upd, 'share_link'):
                            self.logger.warning("%s: Removing too long link", upd.origin_id)
                            upd.share_link = None
                else:
                    pprint.pprint(post)
                    raise Exception("Unknown FB update type: %s" % post['type'])
                upd.save()

            # Stop when there is no further page. A 'paging' entry
            # without 'next' is treated the same way instead of failing
            # with a KeyError.
            if 'paging' not in g or 'next' not in g['paging']:
                break
            next_args = urlparse.parse_qs(urlparse.urlparse(g['paging']['next']).query)
            until = int(next_args['until'][0])
            # If we didn't have any of the updates, get a bigger batch next
            # time.
            if not found:
                count = 100
            elif not full_update:
                # If at least some of the updates were in our DB already,
                # the feed is up-to-date.
                break
            url = "%s/posts?limit=%d&until=%d%s" % (feed.origin_id, count, until, filter_args)
        self.logger.info("%s: %d new updates", feed.account_name, new_count)
        feed.update_error_count = 0
        feed.last_update = datetime.datetime.now()
        feed.save()
Example #6
0
 def process_twitter_feed(self, feed):
     """Refresh ``feed`` from Twitter and store its tweets not seen yet.

     The account is looked up with ``self.twitter.show_user`` to refresh
     ``interest``, ``picture`` and ``account_name``. The timeline is then
     read page by page with ``self.twitter.get_user_timeline`` until a
     tweet already stored as an ``Update`` of ``feed`` is met or an empty
     page is returned. The collected tweets are saved in the reverse of
     the order in which they were received, after which the feed's error
     count is reset and the feed is saved.

     Raises:
         UpdateError: when Twitter reports an error, either through a
             ``TwythonError`` or through an ``error`` entry in the
             response. "Unauthorized:" errors from ``TwythonError`` are
             raised with ``can_continue=True``; rate limiting is raised
             with ``can_continue=False, feed_ok=True``.
         ValueError: when the timeline call raises it three times in a
             row.
     """
     self.logger.info("Processing Twitter feed %s", feed.account_name)
     user_id = feed.origin_id
     args = {'user_id': user_id, 'username': user_id, 'count': 200,
             'trim_user': True}
     tw_list = []
     try:
         info = self.twitter.show_user(**args)
     except TwythonError as e:
         if 'Unauthorized:' in e.msg:
             raise UpdateError(e.msg, can_continue=True)
         self.logger.error("Got Twitter exception: %s", e)
         if "Rate limit exceeded" in e.msg:
             raise UpdateError("Rate limit exceeded", can_continue=False, feed_ok=True)
         raise UpdateError(e.msg)
     feed.interest = info['followers_count']
     feed.picture = info['profile_image_url']
     feed.account_name = info['screen_name']
     while True:
         # retry two times if the twitter call fails
         for i in range(3):
             try:
                 tweets = self.twitter.get_user_timeline(**args)
             except ValueError:
                 if i == 2:
                     raise
                 self.logger.warning("Got exception, retrying.")
                 continue
             except TwythonError as e:
                 self.logger.error("Got Twitter exception: %s", e)
                 if 'Unauthorized:' in e.msg:
                     raise UpdateError(e.msg, can_continue=True)
                 if "Rate limit exceeded" in e.msg:
                     raise UpdateError("Rate limit exceeded", can_continue=False, feed_ok=True)
                 raise UpdateError(e.msg)
             break
         if 'error' in tweets:
             error = tweets['error']
             if 'Rate limit exceeded' in error:
                 self.logger.error("Twitter rate limit exceeded")
                 # Report rate limiting the same way the TwythonError
                 # handlers in this method do, instead of as a generic
                 # feed error.
                 raise UpdateError("Rate limit exceeded", can_continue=False, feed_ok=True)
             self.logger.error("Twitter error: %s", error)
             raise UpdateError(error)
         if not tweets:
             break
         for tw in tweets:
             try:
                 Update.objects.get(feed=feed, origin_id=tw['id'])
             except Update.DoesNotExist:
                 # Prepending reverses the received order (presumably
                 # newest first, so the list ends up oldest first).
                 tw_list.insert(0, tw)
             else:
                 # Reached a tweet we already have: stop collecting.
                 break
         else:
             # The whole page was new; continue with the page preceding
             # the last tweet received.
             args['max_id'] = tw_list[0]['id'] - 1
             continue
         break
     self.logger.debug("New tweets: %d", len(tw_list))
     for tw in tw_list:
         tw_obj = Update(feed=feed)
         tw_obj.origin_id = tw['id']
         # Decode HTML entities; '&amp;' goes last so that '&amp;lt;'
         # becomes the literal '&lt;' and is not decoded twice.
         text = tw['text']
         for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&#39;', "'"),
                              ('&amp;', '&')):
             text = text.replace(entity, char)
         tw_obj.text = text
         # timegm() reads the parsed tuple as UTC (parsedate() drops the
         # zone offset); the stored value is a naive local datetime.
         date = calendar.timegm(email.utils.parsedate(tw['created_at']))
         tw_obj.created_time = datetime.datetime.fromtimestamp(date)
         try:
             tw_obj.save()
         except Exception:
             # logger.exception() keeps the traceback and avoids str(e),
             # which can itself raise on Python 2 for non-ASCII messages.
             self.logger.exception("failed to save tweet %s", tw['id'])
             raise
     feed.update_error_count = 0
     feed.last_update = datetime.datetime.now()
     feed.save()
Example #7
0
    def process_facebook_feed(self, feed, full_update=False):
        """Refresh ``feed`` and store its Facebook posts as ``Update`` rows.

        The feed's ``picture``, ``interest`` and ``is_personal`` are
        refreshed from a single ``self._fb_get`` call. Posts are then
        fetched page by page from ``<origin_id>/posts``. Posts of type
        'question', 'swf', 'music' and 'offer', and 'status' posts without
        a message, are skipped.

        Args:
            feed: the feed to update; ``origin_id`` and ``account_name``
                are read from it.
            full_update: when true, pages of 100 posts are requested,
                posts that are already stored are rewritten and paging
                continues until no further page is available. Otherwise
                the first page holds 20 posts, stored posts are left
                untouched and paging stops after the first page that
                contains a stored post.

        Raises:
            AssertionError: if a post does not originate from the feed, or
                a 'photo' post lacks a link or picture.
            Exception: for an unknown post type, or a 'video' post with
                neither 'link' nor 'source'.
        """
        self.logger.info('Processing feed %s', unicode(feed))

        # First update the feed itself
        url = '%s?fields=picture,likes,about' % feed.origin_id
        feed_info = self._fb_get(url)
        feed.picture = feed_info.get('picture', {}).get('data',
                                                        {}).get('url', None)
        feed.interest = feed_info.get('likes', None)
        # Limit downloading of personal feeds to last two months.
        # NOTE(review): 'category' is not among the fields requested
        # above; if the API returns only requested fields, every feed is
        # classified as personal here -- confirm.
        if 'category' not in feed_info:
            feed.is_personal = True
            since = datetime.datetime.now() - datetime.timedelta(weeks=2 * 4)
            since = int(time.mktime(since.timetuple()))
            filter_args = "&since=%d" % since
            self.logger.debug('%s is a personal feed', unicode(feed))
        else:
            feed.is_personal = False
            filter_args = ""

        if full_update:
            count = 100
        else:
            count = 20
        new_count = 0
        url = '%s/posts?limit=%d%s' % (feed.origin_id, count, filter_args)
        while True:
            self.logger.info('Fetching %s', url)
            g = self._fb_get(url)
            # Whether this page contained at least one stored post.
            found = False
            for post in g['data']:
                # Sanity check
                assert post['from']['id'] == feed.origin_id
                if post['type'] in ('question', 'swf', 'music', 'offer'):
                    # We skip these updates for now.
                    continue
                if post['type'] == 'status' and 'message' not in post:
                    # We're not interested in status updates with no content.
                    continue
                try:
                    upd = Update.objects.get(feed=feed, origin_id=post['id'])
                    found = True
                    if not full_update:
                        continue
                except Update.DoesNotExist:
                    upd = Update(feed=feed, origin_id=post['id'])
                    new_count += 1

                utc = dateutil.parser.parse(post['created_time'])
                upd.created_time = utc.astimezone(dateutil.tz.tzlocal())
                self._set_field_with_len(upd, 'text',
                                         post.get('message', None))
                upd.share_link = post.get('link', None)
                upd.picture = post.get('picture', None)
                self._set_field_with_len(upd, 'share_title',
                                         post.get('name', None))
                self._set_field_with_len(upd, 'share_caption',
                                         post.get('caption', None))
                self._set_field_with_len(upd, 'share_description',
                                         post.get('description', None))
                # Links that do not fit their field are dropped rather
                # than stored in part.
                if upd.picture and len(upd.picture) > self._get_field_max_len(
                        upd, 'picture'):
                    self.logger.warning(
                        "%s: Removing too long (%d) picture link",
                        upd.origin_id, len(upd.picture))
                    upd.picture = None
                if upd.share_link and len(
                        upd.share_link) > self._get_field_max_len(
                            upd, 'share_link'):
                    self.logger.warning("%s: Removing too long (%d) link",
                                        upd.origin_id, len(upd.share_link))
                    upd.share_link = None
                # A missing or empty status type is stored as None.
                upd.sub_type = post.get('status_type', None) or None
                upd.interest = post.get('likes', {}).get('count', None)
                if post['type'] == 'link':
                    upd.type = 'link'
                    if not upd.share_link:
                        self.logger.warning(
                            "FB %s: No link given for 'link' update",
                            post['id'])
                elif post['type'] == 'photo':
                    upd.type = 'photo'
                    assert upd.share_link
                    assert upd.picture
                elif post['type'] == 'status':
                    upd.type = 'status'
                elif post['type'] == 'video':
                    upd.type = 'video'
                    if not upd.share_link:
                        # Fall back to the 'source' attribute
                        upd.share_link = post.get('source', None)
                        if not upd.share_link:
                            pprint.pprint(post)
                            raise Exception("%s: No link for 'video 'update" %
                                            post['id'])
                        if upd.share_link and len(
                                upd.share_link) > self._get_field_max_len(
                                    upd, 'share_link'):
                            self.logger.warning("%s: Removing too long link",
                                                upd.origin_id)
                            upd.share_link = None
                else:
                    pprint.pprint(post)
                    raise Exception("Unknown FB update type: %s" %
                                    post['type'])
                upd.save()

            # Stop when there is no further page. A 'paging' entry
            # without 'next' is treated the same way instead of failing
            # with a KeyError.
            if 'paging' not in g or 'next' not in g['paging']:
                break
            next_args = urlparse.parse_qs(
                urlparse.urlparse(g['paging']['next']).query)
            until = int(next_args['until'][0])
            # If we didn't have any of the updates, get a bigger batch next
            # time.
            if not found:
                count = 100
            elif not full_update:
                # If at least some of the updates were in our DB already,
                # the feed is up-to-date.
                break
            url = "%s/posts?limit=%d&until=%d%s" % (feed.origin_id, count,
                                                    until, filter_args)
        self.logger.info("%s: %d new updates", feed.account_name, new_count)
        feed.update_error_count = 0
        feed.last_update = datetime.datetime.now()
        feed.save()
Example #8
0
 def process_twitter_feed(self, feed):
     """Refresh ``feed`` from Twitter and store its tweets not seen yet.

     The account is looked up with ``self.twitter.showUser`` to refresh
     ``interest``, ``picture`` and ``account_name``. The timeline is then
     read page by page with ``self.twitter.getUserTimeline`` until a
     tweet already stored as an ``Update`` of ``feed`` is met or an empty
     page is returned. The collected tweets are saved in the reverse of
     the order in which they were received, after which the feed's error
     count is reset and the feed is saved.

     Raises:
         UpdateError: when Twitter reports an error, either through a
             ``TwythonError`` or through an ``error`` entry in the
             response. "Unauthorized:" errors from ``TwythonError`` are
             raised with ``can_continue=True``; rate limiting is raised
             with ``can_continue=False, feed_ok=True``.
         ValueError: when the timeline call raises it three times in a
             row.
     """
     self.logger.info("Processing Twitter feed %s", feed.account_name)
     user_id = feed.origin_id
     args = {
         'user_id': user_id,
         'username': user_id,
         'count': 200,
         'trim_user': True
     }
     tw_list = []
     try:
         info = self.twitter.showUser(**args)
     except TwythonError as e:
         if 'Unauthorized:' in e.msg:
             raise UpdateError(e.msg, can_continue=True)
         self.logger.error("Got Twitter exception: %s", e)
         if "Rate limit exceeded" in e.msg:
             raise UpdateError("Rate limit exceeded",
                               can_continue=False,
                               feed_ok=True)
         raise UpdateError(e.msg)
     feed.interest = info['followers_count']
     feed.picture = info['profile_image_url']
     feed.account_name = info['screen_name']
     while True:
         # retry two times if the twitter call fails
         for i in range(3):
             try:
                 tweets = self.twitter.getUserTimeline(**args)
             except ValueError:
                 if i == 2:
                     raise
                 self.logger.warning("Got exception, retrying.")
                 continue
             except TwythonError as e:
                 self.logger.error("Got Twitter exception: %s", e)
                 if 'Unauthorized:' in e.msg:
                     raise UpdateError(e.msg, can_continue=True)
                 if "Rate limit exceeded" in e.msg:
                     raise UpdateError("Rate limit exceeded",
                                       can_continue=False,
                                       feed_ok=True)
                 raise UpdateError(e.msg)
             break
         if 'error' in tweets:
             error = tweets['error']
             if 'Rate limit exceeded' in error:
                 self.logger.error("Twitter rate limit exceeded")
                 # Report rate limiting the same way the TwythonError
                 # handlers in this method do, instead of as a generic
                 # feed error.
                 raise UpdateError("Rate limit exceeded",
                                   can_continue=False,
                                   feed_ok=True)
             self.logger.error("Twitter error: %s", error)
             raise UpdateError(error)
         if not tweets:
             break
         for tw in tweets:
             try:
                 Update.objects.get(feed=feed, origin_id=tw['id'])
             except Update.DoesNotExist:
                 # Prepending reverses the received order (presumably
                 # newest first, so the list ends up oldest first).
                 tw_list.insert(0, tw)
             else:
                 # Reached a tweet we already have: stop collecting.
                 break
         else:
             # The whole page was new; continue with the page preceding
             # the last tweet received.
             args['max_id'] = tw_list[0]['id'] - 1
             continue
         break
     self.logger.debug("New tweets: %d", len(tw_list))
     for tw in tw_list:
         tw_obj = Update(feed=feed)
         tw_obj.origin_id = tw['id']
         # Decode HTML entities; '&amp;' goes last so that '&amp;lt;'
         # becomes the literal '&lt;' and is not decoded twice.
         text = tw['text']
         for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&#39;', "'"),
                              ('&amp;', '&')):
             text = text.replace(entity, char)
         tw_obj.text = text
         # timegm() reads the parsed tuple as UTC (parsedate() drops the
         # zone offset); the stored value is a naive local datetime.
         date = calendar.timegm(email.utils.parsedate(tw['created_at']))
         tw_obj.created_time = datetime.datetime.fromtimestamp(date)
         try:
             tw_obj.save()
         except Exception:
             # logger.exception() keeps the traceback and avoids str(e),
             # which can itself raise on Python 2 for non-ASCII messages.
             self.logger.exception("failed to save tweet %s", tw['id'])
             raise
     feed.update_error_count = 0
     feed.last_update = datetime.datetime.now()
     feed.save()