Exemple #1
0
def populate_uid(name, force=False, api=None):
    """
    Fill in the Twitter uid of active TwitterUser records named `name`.

    A record is updated when its uid is still 0 (the default, meaning
    "not set yet") or when force is True.  The uid is read from the 'id'
    field of the api.get_user() result for the screen name.  When no api
    is passed in, one is created for settings.TWITTER_DEFAULT_USERNAME.

    After every lookup attempt, whether it succeeded or raised a
    TweepError, the function sleeps for the interval returned by
    set_wait_time(api.last_response).

    see also:
       https://dev.twitter.com/docs/error-codes-responses
       https://dev.twitter.com/docs/rate-limiting
    """
    if api is None:
        api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
    for account in TwitterUser.objects.filter(is_active=True, name=name):
        if account.uid != 0 and force is not True:
            # uid already populated and no forced refresh was requested
            continue
        try:
            account.uid = api.get_user(screen_name=name)['id']
            account.save()
            print('updated user \'%s\' uid to %d' % (name, account.uid))
        except tweepy.error.TweepError as e:
            print('Failed to find user \'%s\'. Error: %s' % (name, e))
        finally:
            # pause before the next call regardless of the outcome
            time.sleep(set_wait_time(api.last_response))
Exemple #2
0
 def handle(self, *args, **options):
     """
     Detect screen-name changes for active TwitterUsers and record them.

     Every active TwitterUser (optionally restricted to the screen name
     given in options['user']) that has a uid set is looked up by uid.
     When the screen name reported by the API differs from the stored
     one, the stored name is added to the JSON-encoded former_names
     mapping, keyed by the UTC time of the change, and the record is
     saved with the new name.

     TweepErrors are printed and the loop moves on to the next user.
     After each lookup the command sleeps for the interval returned by
     set_wait_time(api.last_response).
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     qs_tweeps = TwitterUser.objects.filter(is_active=True)
     if options.get('user', None):
         qs_tweeps = qs_tweeps.filter(name=options.get('user'))
     for tweep in qs_tweeps:
         print('user: %s' % tweep.name)
         # a lookup by uid is impossible until the uid has been populated
         if tweep.uid == 0:
             print('uid has not been set yet - skipping.')
             continue
         try:
             user_status = api.get_user(id=tweep.uid)
             if user_status['screen_name'] != tweep.name:
                 print(' -- updating screen name to %s' %
                       user_status['screen_name'])
                 # former_names may be empty; treat that as an empty mapping
                 oldnames = json.loads(tweep.former_names or '{}')
                 # Key by a UTC ISO 8601 timestamp: unlike '%c' it does not
                 # depend on the process locale or time zone, and the keys
                 # sort chronologically.
                 changed_at = datetime.datetime.utcnow().strftime(
                     '%Y-%m-%dT%H:%M:%SZ')
                 oldnames[changed_at] = tweep.name
                 tweep.former_names = json.dumps(oldnames)
                 tweep.name = user_status['screen_name']
                 # this is the only place the renamed record is persisted
                 tweep.save()
         except tweepy.error.TweepError as e:
             # report the failure; the loop continues with the next user
             print('Error: %s' % e)
         finally:
             # pause before the next API call regardless of the outcome
             time.sleep(set_wait_time(api.last_response))
 def handle(self, *args, **options):
     """
     Sync stored screen names of active TwitterUsers with Twitter.

     Users are looked up by uid; users whose uid is still 0 are skipped.
     If the API reports a different screen name, the previous name is
     appended to the JSON mapping in former_names under a UTC timestamp
     key and the record is saved with the new name.  options['user'], if
     given, limits the run to that screen name.
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     accounts = TwitterUser.objects.filter(is_active=True)
     only_user = options.get('user', None)
     if only_user:
         accounts = accounts.filter(name=only_user)
     for account in accounts:
         print('user: %s' % account.name)
         if account.uid == 0:
             # nothing to look up by until the uid has been populated
             print('uid has not been set yet - skipping.')
             continue
         try:
             profile = api.get_user(id=account.uid)
             current_name = profile['screen_name']
             if current_name != account.name:
                 print(' -- updating screen name to %s' % current_name)
                 # an empty former_names field counts as an empty mapping
                 history = json.loads(account.former_names or '{}')
                 stamp = datetime.datetime.utcnow().strftime(
                     '%Y-%m-%dT%H:%M:%SZ')
                 history[stamp] = account.name
                 account.former_names = json.dumps(history)
                 account.name = current_name
                 account.save()
         except tweepy.error.TweepError as e:
             # report and carry on with the next account
             print('Error: %s' % e)
         finally:
             # always wait before the next lookup
             time.sleep(set_wait_time(api.last_response))
 def handle(self, *args, **options):
     """
     Detect screen-name changes for active TwitterUsers, in batches.

     Active TwitterUsers (optionally restricted to the screen name in
     options['user']) are processed in pages of 100.  Users on a page
     that have a uid set are looked up with a single api.lookup_users()
     call.  For each returned user whose screen name differs from the
     stored one, the stored name is added to the JSON-encoded
     former_names mapping under a UTC timestamp key and the record is
     saved with the new name.

     A TweepError is printed and abandons the current page only.  After
     each batch lookup the command sleeps for the interval returned by
     set_wait_time(api.last_response).
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     qs_tweeps = TwitterUser.objects.filter(is_active=True)
     if options.get('user', None):
         qs_tweeps = qs_tweeps.filter(name=options.get('user'))
     # Paginate over an explicit, immutable ordering.  Each page is a
     # separate query and this loop rewrites `name`, so without a stable
     # order a row could appear on two pages or on none.
     qs_tweeps = qs_tweeps.order_by('pk')
     # NOTE(review): page size presumably matches the users/lookup limit
     # of ids per request - confirm against the Twitter API docs
     paginator = Paginator(qs_tweeps, 100)
     page_count = paginator.num_pages
     for page_counter in range(1, page_count + 1):
         print("Page %s of %s" % (page_counter, page_count))
         # uid -> TwitterUser for every user on this page that has a uid
         tweep_map = {}
         for tweep in paginator.page(page_counter):
             if tweep.uid == 0:
                 print('user: %s' % tweep.name)
                 print(' -- uid has not been set yet - skipping.')
                 continue
             tweep_map[tweep.uid] = tweep
         if not tweep_map:
             # nobody on this page can be looked up; no API call, no wait
             continue
         try:
             user_statuses = api.lookup_users(user_ids=list(tweep_map))
             for user_status in user_statuses:
                 tweep = tweep_map[user_status['id']]
                 print('user: %s' % tweep.name)
                 if user_status['screen_name'] != tweep.name:
                     print(' -- updating screen name to %s' %
                           user_status['screen_name'])
                     # an empty former_names counts as an empty mapping
                     oldnames = json.loads(tweep.former_names or '{}')
                     changed_at = datetime.datetime.utcnow().strftime(
                         '%Y-%m-%dT%H:%M:%SZ')
                     oldnames[changed_at] = tweep.name
                     tweep.former_names = json.dumps(oldnames)
                     tweep.name = user_status['screen_name']
                     # the only place the renamed record is persisted
                     tweep.save()
         except tweepy.error.TweepError as e:
             # report the failure; processing resumes with the next page
             print('Error: %s' % e)
         finally:
             # pause before the next batch regardless of the outcome
             time.sleep(set_wait_time(api.last_response))
 def handle(self, *args, **options):
     """
     Fetch new timeline statuses for active TwitterUsers.

     A TwitterUserTimelineJob is created for the run; job.num_added is
     increased by the number of items stored and problems are recorded
     as TwitterUserTimelineError rows attached to the job.

     Users are restricted to options['user'] when given, otherwise they
     are processed in random order.  Users without a uid are skipped
     (and logged as an error).  For each remaining user the timeline is
     requested in batches of up to 200, starting after the highest
     twitter_id already stored (since_id) and paging backwards through
     max_id, until a batch yields fewer than 150 new items, max_id drops
     below since_id, nothing is returned, or the response status is an
     HTTP error.  The command sleeps for the interval returned by
     set_wait_time(api.last_response) after every request.

     see https://dev.twitter.com/docs/working-with-timelines
        for explanation of max_id, since_id usage
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     job = TwitterUserTimelineJob()
     job.save()
     qs_tweeps = TwitterUser.objects.filter(is_active=True)
     if options.get('user', None):
         qs_tweeps = qs_tweeps.filter(name=options.get('user'))
     else:
         # NOTE: randomizing here might be healthier when considering
         # possibility of multiple parallel jobs running and competing
         # for api calls but this is an instinctual call, not data-driven
         qs_tweeps = qs_tweeps.order_by('?')
     for tweep in qs_tweeps:
         print('user: %s' % tweep.name)
         # can't do this unless we have a twitter user_id stored
         if tweep.uid == 0:
             skipmsg = ('uid has not been set yet - skipping this '
                        'user.  May need to run populate_uids if this '
                        'is an old database.')
             print(skipmsg)
             error = TwitterUserTimelineError(job=job, user=tweep,
                                              error=skipmsg)
             error.save()
             continue
         # now move on to determining first tweet id to get
         since_id = 1
         # set since_id if they have any statuses recorded
         if tweep.items.count() > 0:
             max_dict = tweep.items.all().aggregate(Max('twitter_id'))
             since_id = max_dict['twitter_id__max']
         max_id = 0
         # update their record (auto_now) as we're checking it now
         tweep.save()
         while True:
             stop = False
             try:
                 print('since: %s' % (since_id))
                 if max_id:
                     print('max: %s' % max_id)
                     timeline = api.user_timeline(id=tweep.uid,
                                                  since_id=since_id,
                                                  max_id=max_id, count=200)
                 else:
                     timeline = api.user_timeline(id=tweep.uid,
                                                  since_id=since_id,
                                                  count=200)
             except tweepy.error.TweepError as e:
                 print('ERROR: %s' % e)
                 error = TwitterUserTimelineError(job=job, user=tweep,
                                                  error=e)
                 error.save()
                 # treat a failed request like an empty batch
                 timeline = []
             if len(timeline) == 0:
                 # Nothing new; stop for this user
                 stop = True
             new_status_count = 0
             for status in timeline:
                 # eg 'Mon Oct 15 20:15:12 +0000 2012'
                 dt_aware = dt_aware_from_created_at(status['created_at'])
                 try:
                     item, created = TwitterUserItem.objects.get_or_create(
                         twitter_user=tweep,
                         twitter_id=status['id'],
                         date_published=dt_aware,
                         item_text=status['text'],
                         item_json=json.dumps(status),
                         place=status['place'] or '',
                         source=status['source'])
                     if created:
                         # next request should end just below this item
                         max_id = item.twitter_id - 1
                         new_status_count += 1
                     else:
                         print('skip: id %s' % item.id)
                 except IntegrityError as ie:
                     print('ERROR: %s' % ie)
                     error = TwitterUserTimelineError(job=job, user=tweep,
                                                      error=ie)
                     error.save()
             print('saved: %s item(s)' % new_status_count)
             job.num_added += new_status_count
             # max new statuses per call is 200, so check for less than
             # a reasonable fraction of that to see if we should stop
             if new_status_count < 150:
                 print('stop: < 150 new statuses')
                 stop = True
             if max_id < since_id:
                 # Got 'em all, stop for this user
                 print('stop: max_id < since_id')
                 stop = True
             # Check response codes for issues
             response_status = api.last_response.status
             if response_status >= 400:
                 # Build the message from the response itself.  The
                 # exception variable `e` is only bound when a TweepError
                 # was raised, so it may be undefined here or left over
                 # from an earlier, unrelated failure.
                 status_msg = 'error: %s' % \
                     api.last_response.getheader('status')
                 print(status_msg)
                 error = TwitterUserTimelineError(job=job, user=tweep,
                                                  error=status_msg)
                 error.save()
                 stop = True
             job.save()
             # wait before next call no matter what
             time.sleep(set_wait_time(api.last_response))
             if stop:
                 break
Exemple #6
0
 def handle(self, *args, **options):
     """
     Fetch and store new timeline statuses for active TwitterUsers.

     Users (optionally only options['user']) are visited in
     date_last_checked order; users whose uid is 0 are skipped.  For
     each user the timeline is requested in batches of up to 200
     statuses newer than the highest stored twitter_id, paging backwards
     with max_id.  Paging for a user ends when a batch is empty, yields
     fewer than 150 new items, max_id falls below since_id, or the last
     response has an HTTP status of 400 or above.  After every request
     the command sleeps for set_wait_time(api.last_response).
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     accounts = TwitterUser.objects.filter(is_active=True)
     only_user = options.get('user', None)
     if only_user:
         accounts = accounts.filter(name=only_user)
     accounts = accounts.order_by('date_last_checked')
     for account in accounts:
         print('user: %s' % account.name)
         if account.uid == 0:
             # timelines are requested by uid, so there is nothing to do
             print('uid has not been set yet - skipping this user.  '
                   'May need to run populate_uids if this is an old '
                   'database.')
             continue
         # start after the newest status already stored, if there is one
         if account.items.count() > 0:
             newest = account.items.all().aggregate(Max('twitter_id'))
             since_id = newest['twitter_id__max']
         else:
             since_id = 1
         max_id = 0
         # saving presumably refreshes an auto_now "last checked" field
         account.save()
         done = False
         while not done:
             try:
                 print('since: %s' % (since_id))
                 query = {'id': account.uid,
                          'since_id': since_id,
                          'count': 200}
                 if max_id:
                     print('max: %s' % max_id)
                     query['max_id'] = max_id
                 timeline = api.user_timeline(**query)
             except tweepy.error.TweepError as e:
                 print('ERROR: %s' % e)
                 timeline = []
             # an empty batch means there is nothing new for this user
             done = len(timeline) == 0
             saved = 0
             for status in timeline:
                 # created_at looks like 'Mon Oct 15 20:15:12 +0000 2012'
                 published = dt_aware_from_created_at(status['created_at'])
                 try:
                     item, created = TwitterUserItem.objects.get_or_create(
                         twitter_user=account,
                         twitter_id=status['id'],
                         date_published=published,
                         item_text=status['text'],
                         item_json=json.dumps(status),
                         place=status['place'] or '',
                         source=status['source'])
                 except IntegrityError as ie:
                     print('ERROR: %s' % ie)
                     continue
                 if created:
                     # the next request should end just below this item
                     max_id = item.twitter_id - 1
                     saved += 1
                 else:
                     print('skip: id %s' % item.id)
             print('saved: %s item(s)' % saved)
             # a batch holds at most 200 statuses; well under that means
             # the backlog is exhausted
             if saved < 150:
                 print('stop: < 150 new statuses')
                 done = True
             if max_id < since_id:
                 print('stop: max_id < since_id')
                 done = True
             # an HTTP error status on the last response also ends paging
             if api.last_response.status >= 400:
                 print('%s %s' % ('error:',
                                  api.last_response.getheader('status')))
                 done = True
             # always wait before the next call
             time.sleep(set_wait_time(api.last_response))
 def handle(self, *args, **options):
     """
     Fetch new timeline statuses for active TwitterUsers.

     A TwitterUserTimelineJob is created for the run; job.num_added is
     increased by the number of items stored and problems are recorded
     as TwitterUserTimelineError rows attached to the job.

     Users are restricted to options['user'] when given, otherwise they
     are processed in random order.  Users without a uid are skipped
     (and logged as an error).  For each remaining user the timeline is
     requested in batches of up to 200, starting after the highest
     twitter_id already stored (since_id) and paging backwards through
     max_id.  Paging for a user ends when a request raises TweepError,
     a batch is empty or yields fewer than 150 new items, max_id drops
     below since_id, or the response status is an HTTP error.  Before
     every request the command sleeps for the interval returned by
     set_wait_time() for the previous response.

     see https://dev.twitter.com/docs/working-with-timelines
        for explanation of max_id, since_id usage
     """
     api = authenticated_api(username=settings.TWITTER_DEFAULT_USERNAME)
     job = TwitterUserTimelineJob()
     job.save()
     qs_tweeps = TwitterUser.objects.filter(is_active=True)
     if options.get('user', None):
         qs_tweeps = qs_tweeps.filter(name=options.get('user'))
     else:
         # NOTE: randomizing here might be healthier when considering
         # possibility of multiple parallel jobs running and competing
         # for api calls but this is an instinctual call, not data-driven
         qs_tweeps = qs_tweeps.order_by('?')
     for tweep in qs_tweeps:
         print('user: %s' % tweep.name)
         # can't do this unless we have a twitter user_id stored
         if tweep.uid == 0:
             skipmsg = ('uid has not been set yet - skipping this '
                        'user.  May need to run populate_uids if this '
                        'is an old database.')
             print(skipmsg)
             error = TwitterUserTimelineError(job=job,
                                              user=tweep,
                                              error=skipmsg)
             error.save()
             continue
         # now move on to determining first tweet id to get
         since_id = 1
         # set since_id if they have any statuses recorded
         if tweep.items.count() > 0:
             max_dict = tweep.items.all().aggregate(Max('twitter_id'))
             since_id = max_dict['twitter_id__max']
         max_id = 0
         # update their record (auto_now) as we're checking it now
         tweep.save()
         while True:
             # wait before next call no matter what;
             # use getattr() because last_response might be missing or
             # None before the first request or after errors
             time.sleep(set_wait_time(getattr(api, 'last_response', None)))
             job.save()
             stop = False
             try:
                 print('since: %s' % (since_id))
                 if max_id:
                     print('max: %s' % max_id)
                     timeline = api.user_timeline(id=tweep.uid,
                                                  since_id=since_id,
                                                  max_id=max_id,
                                                  count=200)
                 else:
                     timeline = api.user_timeline(id=tweep.uid,
                                                  since_id=since_id,
                                                  count=200)
             except tweepy.error.TweepError as e:
                 print('ERROR: %s' % e)
                 error = TwitterUserTimelineError(job=job,
                                                  user=tweep,
                                                  error=e)
                 error.save()
                 # give up on this user; nothing was fetched
                 break
             if len(timeline) == 0:
                 # Nothing new; stop for this user
                 stop = True
             new_status_count = 0
             for status in timeline:
                 # eg 'Mon Oct 15 20:15:12 +0000 2012'
                 dt_aware = dt_aware_from_created_at(status['created_at'])
                 try:
                     item, created = TwitterUserItem.objects.get_or_create(
                         twitter_user=tweep,
                         twitter_id=status['id'],
                         date_published=dt_aware,
                         item_text=status['text'],
                         item_json=json.dumps(status),
                         place=status['place'] or '',
                         source=status['source'])
                     if created:
                         # next request should end just below this item
                         max_id = item.twitter_id - 1
                         new_status_count += 1
                     else:
                         print('skip: id %s' % item.id)
                 except IntegrityError as ie:
                     print('ERROR: %s' % ie)
                     error = TwitterUserTimelineError(job=job,
                                                      user=tweep,
                                                      error=ie)
                     error.save()
             print('saved: %s item(s)' % new_status_count)
             job.num_added += new_status_count
             # max new statuses per call is 200, so check for less than
             # a reasonable fraction of that to see if we should stop
             if new_status_count < 150:
                 print('stop: < 150 new statuses')
                 stop = True
             if max_id < since_id:
                 # Got 'em all, stop for this user
                 print('stop: max_id < since_id')
                 stop = True
             # Check response codes for issues
             response = api.last_response
             response_status = response.status_code
             if response_status >= 400:
                 # status_code implies a requests-style response, which
                 # exposes headers as a mapping rather than getheader();
                 # fall back to the numeric code if no 'status' header.
                 # The message is built from the response because no
                 # exception is bound on this path (a TweepError breaks
                 # out of the loop above).
                 status_msg = 'error: %s' % response.headers.get(
                     'status', response_status)
                 print(status_msg)
                 error = TwitterUserTimelineError(job=job,
                                                  user=tweep,
                                                  error=status_msg)
                 error.save()
                 stop = True
             if stop:
                 break
         # job.save() inside the loop runs before num_added is updated,
         # so persist the count from the final iteration here
         job.save()